├── ACMR ├── README.md ├── model │ ├── adv_crossmodal_simple_nuswide.py │ ├── adv_crossmodal_simple_wiki.py │ ├── adv_crossmodal_triplet_wiki.py │ ├── base_model.py │ └── flip_gradient.py ├── train_adv_crossmodal_simple_nuswide.py ├── train_adv_crossmodal_simple_wiki.py └── train_adv_crossmodal_triplet_wiki.py ├── AFM ├── AFM.py ├── FM.py ├── LoadData.py └── README.md ├── BERT ├── README.md └── modeling_bert.py ├── BPR ├── BPR.py └── README.md ├── C3D ├── C3D_model.py ├── README.md └── predict.py ├── CLIP ├── README.md └── model.py ├── CTRL ├── README.md └── TALL.py ├── Capsules ├── CapsNet.py └── README.md ├── DCGAN ├── README.md ├── dcgan_keras.py └── dcgan_pytorch.py ├── DIN ├── Dice.py ├── README.md ├── build_dataset.py ├── input.py ├── model.py └── train.py ├── DRL-REC ├── README.md ├── build_estimator.py ├── pre_process_data.py ├── replay_buffer.py └── simulator.py ├── DeepWalk ├── README.md ├── classify.py ├── deepwalk.py └── deepwalk_wiki.py ├── GAE ├── README.md └── pytorch_geometric_gae.py ├── GAT ├── README.md ├── layers.py ├── models.py ├── train.py └── utils.py ├── GCN ├── README.md ├── keras_graph.py ├── keras_setup.py ├── keras_train.py ├── keras_utils.py └── pytorch_geometric_gcn.py ├── Graph-Transformer ├── README.md └── graph_transformer.py ├── GraphSAGE ├── README.md ├── aggregators.py ├── encoders.py └── model.py ├── HetGNN ├── DeepWalk.py ├── HetGNN.py ├── README.md └── tools.py ├── IRGAN ├── README.md ├── cf_dns.py ├── cf_gan.py ├── dis_model.py ├── dis_model_dns.py ├── gen_model.py └── utils.py ├── InfoGAN ├── README.md ├── main.py ├── model.py └── trainer.py ├── LightGCN ├── LightGCN.py └── README.md ├── Louvain ├── Louvain.py └── README.md ├── MIL-NCE ├── README.md ├── loss.py ├── main_distributed.py ├── s3dg.py └── video_loader.py ├── MLP-Mixer ├── README.md ├── models.py ├── models_mixer.py └── models_test.py ├── MoCo ├── MoCo.py └── README.md ├── NCF ├── Dataset.py ├── GMF.py ├── MLP.py ├── NeuMF.py ├── README.md └── evaluate.py ├── NFM ├── FM.py ├── NeuralFM.py └── README.md ├── NGCF ├── NGCF.py └── READEME.md ├── NTM ├── GSM.py ├── GSM_run.py ├── README.md └── vae.py ├── Non-local ├── README.md ├── network.py ├── non_local_concatenation.py ├── non_local_dot_product.py ├── non_local_embedded_gaussian.py └── non_local_gaussian.py ├── ONCF ├── ConvNCF.py ├── Dataset.py ├── MF_BPR.py ├── README.md └── saver.py ├── OpenPrompt ├── 0_basic.py └── README.md ├── README.md ├── RippleNet ├── README.md ├── data_loader.py ├── main.py ├── model.py ├── preprocess.py └── train.py ├── S2VT ├── Attention.py ├── DecoderRNN.py ├── EncoderRNN.py ├── README.md ├── S2VTAttModel.py └── S2VTModel.py ├── SR-GNN ├── README.md ├── main.py ├── model.py └── utils.py ├── Skip-Thought Vectors ├── Evaluate.py ├── README.md ├── data_loader.py ├── model.py ├── train.py └── vocab.py ├── SlowFast ├── README.md ├── keras_slowfast.py └── pytorch_slowfast.py ├── Transformer ├── README.md ├── Transformer.py └── nn.Transformer.py ├── UIE ├── README.md ├── convert.py └── model.py ├── VMT ├── README.md ├── dataloader.py ├── model.py ├── train.py └── utils.py ├── extractive_summarization ├── README.md └── extractive_summarization.py └── node2vec ├── README.md ├── classify.py ├── node2vec.py └── node2vec_wiki.py /ACMR/README.md: -------------------------------------------------------------------------------- 1 | # Adversarial Cross-modal Retrieval (ACMR) 2 | 逐行源码阅读中文笔记。 3 | 4 | blog论文阅读笔记:https://blog.csdn.net/qq_39388410/article/details/105907097 5 | 6 | # 7 | 原paper:Bokun Wang, Yang Yang, Xing Xu, Alan 
Hanjalic, and Heng Tao Shen. "Adversarial Cross-Modal Retrieval". In Proceedings of 25th ACM International Conference on Multimedia (ACM MM), 2017. 8 | 9 | 原code: https://github.com/sunpeng981712364/ACMR_demo 10 | -------------------------------------------------------------------------------- /ACMR/model/flip_gradient.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 18, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import tensorflow as tf 7 | from tensorflow.python.framework import ops 8 | 9 | 10 | class FlipGradientBuilder(object): 11 | def __init__(self): 12 | self.num_calls = 0 13 | 14 | def __call__(self, x, l=1.0): 15 | grad_name = "FlipGradient%d" % self.num_calls 16 | @ops.RegisterGradient(grad_name) 17 | def _flip_gradients(op, grad): 18 | #变成负梯度,因为adv需要完成max任务,变成负之后可以使用min 19 | return [tf.negative(grad) * l] 20 | 21 | g = tf.get_default_graph() 22 | with g.gradient_override_map({"Identity": grad_name}): 23 | y = tf.identity(x) 24 | 25 | self.num_calls += 1 26 | return y 27 | 28 | flip_gradient = FlipGradientBuilder() 29 | -------------------------------------------------------------------------------- /ACMR/train_adv_crossmodal_simple_nuswide.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 18, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import tensorflow as tf 7 | from models.adv_crossmodal_simple_nuswide import AdvCrossModalSimple, ModelParams 8 | 9 | def main(_): 10 | graph = tf.Graph() 11 | model_params = ModelParams() #模型所有的参数 12 | model_params.update() #更新文件夹路径。具体实现在adv_crossmodal_simple_nuswide 13 | 14 | with graph.as_default(): #默认图 15 | model = AdvCrossModalSimple(model_params) 16 | with tf.Session(graph=graph) as sess: 17 | model.train(sess) #开始训练 18 | #model.eval_random_rank() 19 | model.eval(sess) #开始测试 20 | 21 | 22 | if __name__ == '__main__': 23 | tf.app.run() #启动图 24 | -------------------------------------------------------------------------------- /ACMR/train_adv_crossmodal_simple_wiki.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 18, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import tensorflow as tf 7 | from models.adv_crossmodal_simple_wiki import AdvCrossModalSimple, ModelParams 8 | #from models.wiki_shallow import AdvCrossModalSimple, ModelParams 9 | def main(_): 10 | graph = tf.Graph() 11 | model_params = ModelParams() #模型所有的参数 12 | model_params.update() #更新文件夹路径。具体实现在adv_crossmodal_simple_wiki 13 | 14 | with graph.as_default(): #默认图 15 | model = AdvCrossModalSimple(model_params) 16 | with tf.Session(graph=graph) as sess: 17 | model.train(sess) #开始训练 18 | #model.eval_random_rank() 19 | model.eval(sess) #开始测试 20 | 21 | 22 | if __name__ == '__main__': 23 | tf.app.run() #启动图 24 | -------------------------------------------------------------------------------- /ACMR/train_adv_crossmodal_triplet_wiki.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 18, 2020 3 | @author: nakaizura 4 | ''' 5 | import tensorflow as tf 6 | from models.adv_crossmodal_triplet_wiki import AdvCrossModalSimple, ModelParams 7 | #from models.wiki_shallow import AdvCrossModalSimple, ModelParams 8 | def main(_): 9 | graph = tf.Graph() 10 | model_params = ModelParams() #模型所有的参数 11 | model_params.update() #更新文件夹路径。具体实现在adv_crossmodal_triplet_wiki 12 | 13 | with graph.as_default(): #默认图 14 | model = AdvCrossModalSimple(model_params) 15 | with 
tf.Session(graph=graph) as sess: 16 | model.train(sess) #开始训练 17 | #model.eval_random_rank() 18 | model.eval(sess) #开始测试 19 | 20 | 21 | if __name__ == '__main__': 22 | tf.app.run() #启动图 23 | -------------------------------------------------------------------------------- /AFM/LoadData.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 18, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import numpy as np 7 | import os 8 | 9 | class LoadData(object): 10 | '''输入:数据集路径。返回:处理好格式的Train_data,Test_data和Validation_data。 11 | 三个dataset都是字典集合, 'Y'是标签; 'X'是FM维度的one-hot向量,其中特征维度是features_M。 12 | ''' 13 | 14 | #得到train.libfm,test.libfm和validation.libfm的路径。 15 | def __init__(self, path, dataset, loss_type="square_loss"): 16 | self.path = path + dataset + "/" 17 | self.trainfile = self.path + dataset +".train.libfm" 18 | self.testfile = self.path + dataset + ".test.libfm" 19 | self.validationfile = self.path + dataset + ".validation.libfm" 20 | self.features_M = self.map_features( ) #计算总特征数目 21 | self.Train_data, self.Validation_data, self.Test_data = self.construct_data( loss_type ) #处理libfm数据成矩阵形式 22 | 23 | def map_features(self): # 映射特征并保存在字典中,实际上是为了得到特征的总维度features_M 24 | self.features = {} 25 | self.read_features(self.trainfile) 26 | self.read_features(self.testfile) 27 | self.read_features(self.validationfile) 28 | # print("features_M:", len(self.features)) 29 | return len(self.features) 30 | 31 | def read_features(self, file): #读入特征文件 32 | f = open( file ) #打开文件 33 | line = f.readline() #逐行读取 34 | i = len(self.features) 35 | while line: 36 | items = line.strip().split(' ') 37 | for item in items[1:]:#第0列是Y,后面的是特征 38 | if item not in self.features:#不在特征集合的就加入到特征集合中并计数,最后得到的是无重复的总特征数目features_M 39 | self.features[ item ] = i 40 | i = i + 1 41 | line = f.readline() 42 | f.close() 43 | 44 | def construct_data(self, loss_type):#构造Train_data,Test_data和Validation_data的数据 45 | X_, Y_ , Y_for_logloss= self.read_data(self.trainfile) 46 | #按照不同的loss,使用Y_或者Y_for_logloss 47 | if loss_type == 'log_loss': 48 | Train_data = self.construct_dataset(X_, Y_for_logloss) 49 | else: 50 | Train_data = self.construct_dataset(X_, Y_) 51 | #print("Number of samples in Train:" , len(Y_)) 52 | 53 | X_, Y_ , Y_for_logloss= self.read_data(self.validationfile) 54 | if loss_type == 'log_loss': 55 | Validation_data = self.construct_dataset(X_, Y_for_logloss) 56 | else: 57 | Validation_data = self.construct_dataset(X_, Y_) 58 | #print("Number of samples in Validation:", len(Y_)) 59 | 60 | X_, Y_ , Y_for_logloss = self.read_data(self.testfile) 61 | if loss_type == 'log_loss': 62 | Test_data = self.construct_dataset(X_, Y_for_logloss) 63 | else: 64 | Test_data = self.construct_dataset(X_, Y_) 65 | #print("Number of samples in Test:", len(Y_)) 66 | 67 | return Train_data, Validation_data, Test_data 68 | 69 | def read_data(self, file): 70 | #读数据文件,对于每一行,数据的第一列是Y_ 71 | #其他列会变成X_ 然后被映射到self.features里面保存。 72 | f = open( file ) 73 | X_ = [] 74 | Y_ = [] 75 | Y_for_logloss = []#离散后的Y_,对应着两种数据集构造方法,视方法而构造不同的形态。 76 | line = f.readline() 77 | while line: 78 | items = line.strip().split(' ') 79 | Y_.append( 1.0*float(items[0]) ) 80 | 81 | if float(items[0]) > 0:# 第一列如果>0则视为1否则就认为是0 82 | v = 1.0 83 | else: 84 | v = 0.0 85 | Y_for_logloss.append( v ) 86 | 87 | X_.append( [ self.features[item] for item in items[1:]] )#其他列都放入到X_中 88 | line = f.readline()#读下一行 89 | f.close() 90 | return X_, Y_, Y_for_logloss 91 | 92 | def construct_dataset(self, X_, Y_): 93 | Data_Dic = {} 94 | X_lens = [ 
len(line) for line in X_] #每个样本的特征数量 95 | indexs = np.argsort(X_lens) #从小到大的索引值 96 | Data_Dic['Y'] = [ Y_[i] for i in indexs] #按索引构造数据集 97 | Data_Dic['X'] = [ X_[i] for i in indexs] 98 | return Data_Dic 99 | 100 | def truncate_features(self): 101 | """ 102 | 确保每个特征的长度都是一致的,所以按照样本的特征长度(最小的)对其他特征进行截断 103 | """ 104 | num_variable = len(self.Train_data['X'][0]) 105 | for i in xrange(len(self.Train_data['X'])):#找到最小的长度 106 | num_variable = min([num_variable, len(self.Train_data['X'][i])]) 107 | #截断train, validation and test 108 | for i in xrange(len(self.Train_data['X'])): 109 | self.Train_data['X'][i] = self.Train_data['X'][i][0:num_variable] 110 | for i in xrange(len(self.Validation_data['X'])): 111 | self.Validation_data['X'][i] = self.Validation_data['X'][i][0:num_variable] 112 | for i in xrange(len(self.Test_data['X'])): 113 | self.Test_data['X'][i] = self.Test_data['X'][i][0:num_variable] 114 | return num_variable 115 | -------------------------------------------------------------------------------- /AFM/README.md: -------------------------------------------------------------------------------- 1 | # Attentional Factorization Machine(AFM) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog论文阅读笔记:https://blog.csdn.net/qq_39388410/article/details/85119300 6 | 7 | 建议阅读顺序:LoadData-->FM-->AFM 8 | 9 | # 10 | 11 | 原paper: Jun Xiao, Hao Ye, Xiangnan He, Hanwang Zhang, Fei Wu and Tat-Seng Chua (2017). Attentional Factorization Machines: Learning the Weight of Feature Interactions via Attention Networks IJCAI, Melbourne, Australia, August 19-25, 2017. 12 | 13 | 原code: https://github.com/hexiangnan/attentional_factorization_machine 14 | -------------------------------------------------------------------------------- /BERT/README.md: -------------------------------------------------------------------------------- 1 | # Bidirectional Encoder Representation from Transformer(BERT) 2 | 3 | 模型部分源码阅读笔记....(工程量真的有点大,暂时只啃了模型处理的部分,待补) 4 | 5 | blog整理:https://blog.csdn.net/qq_39388410/article/details/102136315 6 | 7 | # 8 | 9 | 原paper:https://arxiv.org/abs/1810.04805 10 | 11 | 原code:https://github.com/google-research/bert 12 | 13 | # 14 | 15 | pytorch版本代码:https://github.com/huggingface/transformers 16 | -------------------------------------------------------------------------------- /BPR/README.md: -------------------------------------------------------------------------------- 1 | # Bayesian Personalized Ranking(BPR) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读与复现:https://blog.csdn.net/qq_39388410/article/details/85160996 6 | 7 | # 8 | 9 | 原paper: Bayesian Personalized Ranking(BPR) is a learning algorithm for collaborative filtering first introduced in: BPR: Bayesian Personalized Ranking from Implicit Feedback. Steffen Rendle, Christoph Freudenthaler, Zeno Gantner and Lars Schmidt-Thieme, Proc. UAI 2009. 
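For quick reference, the pairwise objective behind BPR — maximize ln σ(x̂_ui − x̂_uj) over pairs of an observed item i and a sampled negative j — can be sketched in a few lines of PyTorch. This is an illustrative matrix-factorization scorer with made-up names, not the repo's BPR.py:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class MFBPR(nn.Module):
    """Plain MF scorer trained with the BPR pairwise loss."""
    def __init__(self, n_users, n_items, dim=32):
        super().__init__()
        self.user = nn.Embedding(n_users, dim)
        self.item = nn.Embedding(n_items, dim)

    def score(self, u, i):
        return (self.user(u) * self.item(i)).sum(-1)

    def bpr_loss(self, u, i_pos, i_neg):
        # -ln sigma(x_ui - x_uj): push observed items above sampled negatives
        return -F.logsigmoid(self.score(u, i_pos) - self.score(u, i_neg)).mean()

model = MFBPR(n_users=1000, n_items=5000)
u, i_pos, i_neg = (torch.randint(0, 1000, (64,)),
                   torch.randint(0, 5000, (64,)),
                   torch.randint(0, 5000, (64,)))
loss = model.bpr_loss(u, i_pos, i_neg)
```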
10 | 11 | 原code: https://github.com/dongx-duan/bpr 12 | -------------------------------------------------------------------------------- /C3D/C3D_model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 22, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import torch.nn as nn 7 | 8 | 9 | class C3D(nn.Module): 10 | """ 11 | C3D模型的实现挺简单的。 12 | 想要得到时序上的关系,那么把时序也纳入卷积,即二维卷积变三维,实现可以看作是3D版VGG 13 | """ 14 | 15 | def __init__(self): 16 | super(C3D, self).__init__() 17 | # 所有3D卷积滤波器均为3×3×3,步长为1×1×1 18 | self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 19 | # 为了保持早期的时间信息,pool1有些许不同 20 | self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)) 21 | 22 | self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 23 | self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) 24 | 25 | self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 26 | self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 27 | self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) 28 | 29 | self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 30 | self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 31 | self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) 32 | 33 | self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 34 | self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 35 | self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1)) 36 | 37 | # 3个FC输出单元 38 | self.fc6 = nn.Linear(8192, 4096) 39 | self.fc7 = nn.Linear(4096, 4096) 40 | self.fc8 = nn.Linear(4096, 487) #最后对应着487个类别(视数据集而定) 41 | 42 | self.dropout = nn.Dropout(p=0.5) 43 | 44 | self.relu = nn.ReLU() 45 | self.softmax = nn.Softmax() #得到对应类别的预测概率 46 | 47 | def forward(self, x): 48 | # forward按模型架构图来就好 49 | h = self.relu(self.conv1(x)) 50 | h = self.pool1(h) 51 | 52 | h = self.relu(self.conv2(h)) 53 | h = self.pool2(h) 54 | 55 | h = self.relu(self.conv3a(h)) 56 | h = self.relu(self.conv3b(h)) 57 | h = self.pool3(h) 58 | 59 | h = self.relu(self.conv4a(h)) 60 | h = self.relu(self.conv4b(h)) 61 | h = self.pool4(h) 62 | 63 | h = self.relu(self.conv5a(h)) 64 | h = self.relu(self.conv5b(h)) 65 | h = self.pool5(h) 66 | 67 | h = h.view(-1, 8192) 68 | h = self.relu(self.fc6(h)) 69 | h = self.dropout(h) 70 | h = self.relu(self.fc7(h)) 71 | h = self.dropout(h) 72 | 73 | logits = self.fc8(h) 74 | probs = self.softmax(logits) 75 | 76 | return probs 77 | 78 | -------------------------------------------------------------------------------- /C3D/README.md: -------------------------------------------------------------------------------- 1 | # 3D Convolutional Networks(C3D) 2 | 逐行源码阅读中文笔记。C3D的实现很简单,对着模型架构直接实现就ok。 3 | 4 | blog解读:https://blog.csdn.net/qq_39388410/article/details/104951012 5 | 6 | # 7 | 8 | 原paper: Tran, Du, et al. "Learning spatiotemporal features with 3d convolutional networks." 9 | Proceedings of the IEEE international conference on computer vision. 2015. 
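As a quick sanity check on the C3D_model.py above: the conv/pool stack maps a (1, 3, 16, 112, 112) clip to 512 × 1 × 4 × 4 = 8192 features, which is exactly what fc6 expects, and fc8 emits the 487 Sports-1M classes. A minimal sketch with a random clip:

```python
import torch
from C3D_model import C3D

net = C3D().eval()
clip = torch.randn(1, 3, 16, 112, 112)   # (batch, channels, frames, H, W), as built in predict.py
with torch.no_grad():
    probs = net(clip)
print(probs.shape)                        # torch.Size([1, 487]); each row sums to ~1 after softmax
```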
10 | 11 | 参考code: https://github.com/DavideA/c3d-pytorch 12 | 13 | 另外还有个即插即玩的code也挺不错:https://github.com/jfzhang95/pytorch-video-recognition 14 | 15 | 或者直接使用商汤OpenMMLab的开源。 16 | -------------------------------------------------------------------------------- /C3D/predict.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 22, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import numpy as np 7 | 8 | import torch 9 | from torch.autograd import Variable 10 | 11 | from os.path import join 12 | from glob import glob 13 | 14 | import skimage.io as io 15 | from skimage.transform import resize 16 | 17 | from C3D_model import C3D 18 | 19 | 20 | def get_sport_clip(clip_name, verbose=True): 21 | """ 22 | 载入视频片段给C3D做分类。 23 | """ 24 | #载入并中心crop一下 25 | clip = sorted(glob(join('data', clip_name, '*.png')))#视频由多个图片帧组成的 26 | clip = np.array([resize(io.imread(frame), output_shape=(112, 200), preserve_range=True) for frame in clip]) 27 | clip = clip[:, :, 44:44+112, :] # crop centrally 28 | 29 | if verbose: #如果为True就显示视频 30 | clip_img = np.reshape(clip.transpose(1, 0, 2, 3), (112, 16 * 112, 3)) 31 | io.imshow(clip_img.astype(np.uint8)) 32 | io.show() 33 | 34 | clip = clip.transpose(3, 0, 1, 2) # 变换维度为ch, fr, h, w,即通道 帧 高 宽 35 | clip = np.expand_dims(clip, axis=0) # 增加一维batch axis 36 | clip = np.float32(clip) 37 | 38 | return torch.from_numpy(clip) #(n, ch, fr, h, w) 39 | 40 | 41 | def read_labels_from_file(filepath): 42 | """ 43 | 读入真实的标签,这里用的是Sport1M,所以返回的都是动作标签。 44 | """ 45 | with open(filepath, 'r') as f: 46 | labels = [line.strip() for line in f.readlines()] 47 | return labels 48 | 49 | 50 | def main(): 51 | """ 52 | 主函数 53 | """ 54 | 55 | # 载入视频片段做预测 56 | X = get_sport_clip('roger') #roger视频 57 | X = Variable(X) 58 | X = X.cuda() #GPU 59 | 60 | # 载入预训练好了的模型权重 61 | net = C3D() #模型实例化 62 | net.load_state_dict(torch.load('c3d.pickle')) #填入权重 63 | net.cuda() 64 | net.eval()# 调到测试模式 65 | 66 | # 然后直接拿网络预测就好 67 | prediction = net(X) 68 | prediction = prediction.data.cpu().numpy() 69 | 70 | # 读入真实标签 71 | labels = read_labels_from_file('labels.txt') 72 | 73 | # 得到topN的预测类别 74 | top_inds = prediction[0].argsort()[::-1][:5] 75 | print('\nTop 5:') 76 | for i in top_inds: 77 | print('{:.5f} {}'.format(prediction[0][i], labels[i])) 78 | 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /CLIP/README.md: -------------------------------------------------------------------------------- 1 | # Learning Transferable Visual Models From Natural Language Supervision (CLIP) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读:https://nakaizura.blog.csdn.net/article/details/116903995 6 | 7 | # 8 | 9 | 原code: https://github.com/openai/CLIP 10 | -------------------------------------------------------------------------------- /CTRL/README.md: -------------------------------------------------------------------------------- 1 | # TALL: Temporal Activity Localization via Language Query(CTRL) 2 | 逐行源码阅读中文笔记。 3 | 4 | blog关于跨模态视频时刻检索的综述:https://blog.csdn.net/qq_39388410/article/details/107316185 5 | 6 | # 7 | 8 | 原paper: Jiyang Gao, Chen Sun, Zhenheng Yang, and Ram Nevatia. 2017. TALL: Temporal Activity Localization via Language Query. In Proceedings of the IEEE International Conference on Computer Vision. IEEE, 5277–5285. 
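The TALL.py source follows below; as a shape reference for its forward pass, a batch of candidate clips (central + context C3D features, 3 × 4096 dims) is paired with a batch of 4800-d sentence embeddings, and the output is a (3, batch, batch) tensor — per the TALL paper, an alignment score plus two temporal boundary offsets for every (clip, sentence) pair. A minimal sketch with dummy tensors (the import path is an assumption):

```python
import torch
from TALL import TALL   # assuming TALL.py is importable as a module

model = TALL()
clips = torch.randn(56, 4096 * 3)   # 56 candidate clips, pre-extracted C3D features
sents = torch.randn(56, 4800)       # the matching sentence embeddings
out = model(clips, sents)
print(out.shape)                     # torch.Size([3, 56, 56])
```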
9 | 10 | 原code: https://github.com/jiyanggao/TALL 11 | -------------------------------------------------------------------------------- /CTRL/TALL.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 13, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import numpy as np 10 | from torch.autograd import Variable 11 | 12 | 13 | def weights_init(m):#初始化权重 14 | classname = m.__class__.__name__ 15 | if classname.find('Conv') != -1: 16 | torch.nn.init.normal_(m.weight.data, mean=0, std=0.01)#均值0方差0.01的高斯分布 17 | m.bias.data.fill_(0) 18 | elif classname.find('Linear') != -1: 19 | torch.nn.init.normal_(m.weight.data) 20 | m.bias.data.fill_(0) 21 | 22 | 23 | class TALL(nn.Module): 24 | def __init__(self): 25 | super(TALL, self).__init__() 26 | self.semantic_size = 1024 # 视觉和文本要投影的共同语义维度 27 | self.sentence_embedding_size = 4800 #sentence2vec得到的维度 28 | self.visual_feature_dim = 4096*3 #中心+上下文一共3个,每个由C3D得到是4096维 29 | self.v2s_lt = nn.Linear(self.visual_feature_dim, self.semantic_size) #投影视觉 30 | self.s2s_lt = nn.Linear(self.sentence_embedding_size, self.semantic_size) #投影文本 31 | self.fc1 = torch.nn.Conv2d(4096, 1000, kernel_size=1, stride=1)#2层FC得到预测结果 32 | self.fc2 = torch.nn.Conv2d(1000, 3, kernel_size=1, stride=1) 33 | # 初始化权重 34 | self.apply(weights_init) 35 | 36 | def cross_modal_comb(self, visual_feat, sentence_embed): 37 | #这是完成特征交叉的模块,会分别做加法、乘法和拼接 38 | batch_size = visual_feat.size(0) 39 | # shape_matrix = torch.zeros(batch_size,batch_size,self.semantic_size) 40 | 41 | #因为视频会有多个,而句子只有一个,所以要做一下维度变化 42 | vv_feature = visual_feat.expand([batch_size,batch_size,self.semantic_size]) 43 | ss_feature = sentence_embed.repeat(1,1,batch_size).view(batch_size,batch_size,self.semantic_size) 44 | 45 | concat_feature = torch.cat([vv_feature, ss_feature], 2)#横向拼接(第0维度是batch) 46 | 47 | mul_feature = vv_feature * ss_feature # 56,56,1024,乘法 48 | add_feature = vv_feature + ss_feature # 56,56,1024,加法 49 | 50 | #将各个特征一起合并起来得到组合特征 51 | comb_feature = torch.cat([mul_feature, add_feature, concat_feature], 2) 52 | 53 | return comb_feature 54 | 55 | 56 | def forward(self, visual_feature_train, sentence_embed_train): 57 | #对视觉特征投影到语义空间并norm 58 | transformed_clip_train = self.v2s_lt(visual_feature_train) 59 | transformed_clip_train_norm = F.normalize(transformed_clip_train, p=2, dim=1) 60 | 61 | #对本文特征投影到语义空间并norm 62 | transformed_sentence_train = self.s2s_lt(sentence_embed_train) 63 | transformed_sentence_train_norm = F.normalize(transformed_sentence_train, p=2, dim=1) 64 | 65 | #做特征交叉 66 | cross_modal_vec_train = self.cross_modal_comb(transformed_clip_train_norm, transformed_sentence_train_norm) 67 | 68 | cross_modal_vec_train = cross_modal_vec_train.unsqueeze(0).permute(0, 3, 1, 2) 69 | #2层FC得到预测结果 70 | mid_output = self.fc1(cross_modal_vec_train) 71 | mid_output = F.relu(mid_output) 72 | sim_score_mat_train = self.fc2(mid_output).squeeze(0) 73 | 74 | return sim_score_mat_train 75 | -------------------------------------------------------------------------------- /Capsules/README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Routing Between Capsules(CapsNet) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读与复现:https://blog.csdn.net/qq_39388410/article/details/104954161 6 | 7 | # 8 | 9 | 原paper: Dynamic Routing Between Capsules 10 | 11 | 原code: https://github.com/Sarasra/models/tree/master/research/capsules 12 | 
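The core primitive behind CapsNet is the squash nonlinearity: it rescales each capsule's output vector so its length lies in [0, 1) and can be read as an existence probability, while the direction keeps the pose information. A minimal sketch of that function:

```python
import torch

def squash(s, dim=-1, eps=1e-8):
    # v = (|s|^2 / (1 + |s|^2)) * s / |s|
    sq_norm = (s ** 2).sum(dim=dim, keepdim=True)
    return (sq_norm / (1.0 + sq_norm)) * s / torch.sqrt(sq_norm + eps)

u = torch.randn(2, 10, 16)      # e.g. 10 capsules of 16 dims per sample
print(squash(u).norm(dim=-1))   # every length is strictly below 1
```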
-------------------------------------------------------------------------------- /DCGAN/README.md: -------------------------------------------------------------------------------- 1 | # Deep Convolutional Generative Adversarial Networks(DCGAN) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读:https://blog.csdn.net/qq_39388410/article/details/88426976 6 | 7 | 建议阅读顺序:dcgan_keras.py-->dcgan_pytorch.py 8 | 9 | # 10 | 11 | 原paper: Unsupervised Representation with Deep Convolutional Generative Adversarial Networks 12 | 13 | keras code: https://github.com/jacobgil/keras-dcgan 14 | -------------------------------------------------------------------------------- /DIN/Dice.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 29, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import tensorflow as tf 7 | 8 | 9 | #Dice是DIN自己独特的激活函数。认为ReLU系列的分割点都是0,这个分割点应该由数据决定。 10 | #主要是通过改造Parametric ReLU,将alpha根据数据分布(期望和方差)来调整。 11 | #优点:根据数据分布灵活调整阶跃变化点,具有BN的优点 12 | #缺点:BN复杂度,比较耗时 13 | 14 | def dice(_x, axis=-1, epsilon=0.000000001, name=''): 15 | #Data Adaptive Activation Function 16 | with tf.variable_scope(name_or_scope='', reuse=tf.AUTO_REUSE): 17 | alphas = tf.get_variable('alpha'+name, _x.get_shape()[-1], 18 | initializer=tf.constant_initializer(0.0), 19 | dtype=tf.float32) 20 | beta = tf.get_variable('beta'+name, _x.get_shape()[-1], 21 | initializer=tf.constant_initializer(0.0), 22 | dtype=tf.float32) 23 | input_shape = list(_x.get_shape()) 24 | 25 | reduction_axes = list(range(len(input_shape))) 26 | del reduction_axes[axis] 27 | broadcast_shape = [1] * len(input_shape) 28 | broadcast_shape[axis] = input_shape[axis] 29 | 30 | # case: train mode (uses stats of the current batch) 31 | #计算batch的均值和方差 32 | mean = tf.reduce_mean(_x, axis=reduction_axes) 33 | brodcast_mean = tf.reshape(mean, broadcast_shape) 34 | std = tf.reduce_mean(tf.square(_x - brodcast_mean) + epsilon, axis=reduction_axes) 35 | std = tf.sqrt(std) 36 | brodcast_std = tf.reshape(std, broadcast_shape) 37 | x_normed = tf.layers.batch_normalization(_x, center=False, scale=False, name=name, reuse=tf.AUTO_REUSE) 38 | # x_normed = (_x - brodcast_mean) / (brodcast_std + epsilon) 39 | x_p = tf.sigmoid(beta * x_normed) 40 | 41 | 42 | return alphas * (1.0 - x_p) * _x + x_p * _x #根据原文中给的公式计算 43 | 44 | def parametric_relu(_x): 45 | #PRELU激活函数,形式上和leakReLU很像,只是它的alpha可学习 46 | #alpha=0,退化成ReLU。alpha不更新,退化成Leak 47 | with tf.variable_scope(name_or_scope='', reuse=tf.AUTO_REUSE): 48 | alphas = tf.get_variable('alpha', _x.get_shape()[-1], 49 | initializer=tf.constant_initializer(0.0), 50 | dtype=tf.float32) 51 | pos = tf.nn.relu(_x) 52 | neg = alphas * (_x - abs(_x)) * 0.5 #用alpha控制 53 | 54 | return pos + neg 55 | -------------------------------------------------------------------------------- /DIN/README.md: -------------------------------------------------------------------------------- 1 | # Deep Interest Network(DIN) 2 | 3 | 源码笔记:用Attention让每个用户预测关注的兴趣点(行为向量)不同。不过感觉这份源码可读性不太好。 4 | 5 | 建议阅读顺序:build_dataset-->input-->Dice-->model-->train 6 | 7 | # 8 | 9 | 原paper: 10 | ``` 11 | @article{Zhou2017Deep, 12 | title={Deep Interest Network for Click-Through Rate Prediction}, 13 | author={Zhou, Guorui and Song, Chengru and Zhu, Xiaoqiang and Ma, Xiao and Yan, Yanghui and Dai, Xingya and Zhu, Han and Jin, Junqi and Li, Han and Gai, Kun}, 14 | year={2017}, 15 | } 16 | ``` 17 | 18 | 原code: https://github.com/zhougr1993/DeepInterestNetwork 19 | -------------------------------------------------------------------------------- 
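The repo's Dice.py above is TensorFlow 1.x; as a rough PyTorch port of the same idea (not part of the original code), the adaptive activation boils down to a sigmoid gate over batch-normalized inputs mixed with a learnable alpha:

```python
import torch
import torch.nn as nn

class Dice(nn.Module):
    """out = alpha * (1 - p) * x + p * x, with p = sigmoid(beta * BN(x))."""
    def __init__(self, num_features, eps=1e-9):
        super().__init__()
        # affine=False mirrors center=False, scale=False in the TF batch_normalization call
        self.bn = nn.BatchNorm1d(num_features, eps=eps, affine=False)
        self.alpha = nn.Parameter(torch.zeros(num_features))
        self.beta = nn.Parameter(torch.zeros(num_features))

    def forward(self, x):                 # x: (batch, num_features)
        p = torch.sigmoid(self.beta * self.bn(x))
        return self.alpha * (1.0 - p) * x + p * x

y = Dice(num_features=80)(torch.randn(32, 80))
```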
/DIN/build_dataset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 29, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import random 7 | import pickle 8 | 9 | random.seed(1234)#可复现随机种子 10 | 11 | #读源文件构造数据集 12 | 13 | with open('../raw_data/remap.pkl', 'rb') as f:#读raw的原数据 14 | reviews_df = pickle.load(f) 15 | cate_list = pickle.load(f) #cate是类别categories 16 | user_count, item_count, cate_count, example_count = pickle.load(f) 17 | 18 | train_set = [] 19 | test_set = [] 20 | for reviewerID, hist in reviews_df.groupby('reviewerID'): 21 | pos_list = hist['asin'].tolist()#得到正例 22 | def gen_neg():#生成负例 23 | neg = pos_list[0] 24 | while neg in pos_list:#如果负例在正例中了,就随机再采样 25 | neg = random.randint(0, item_count-1)#在item集合中采样 26 | return neg 27 | neg_list = [gen_neg() for i in range(len(pos_list))] 28 | 29 | for i in range(1, len(pos_list)): 30 | hist = pos_list[:i] 31 | if i != len(pos_list) - 1: #把正负例放进数据集里面 32 | train_set.append((reviewerID, hist, pos_list[i], 1)) 33 | train_set.append((reviewerID, hist, neg_list[i], 0)) 34 | else: 35 | label = (pos_list[i], neg_list[i]) 36 | test_set.append((reviewerID, hist, label)) 37 | 38 | random.shuffle(train_set) #打乱数据集 39 | random.shuffle(test_set) 40 | 41 | assert len(test_set) == user_count 42 | # assert(len(test_set) + len(train_set) // 2 == reviews_df.shape[0]) 43 | 44 | with open('dataset.pkl', 'wb') as f: #保存处理后的数据集 45 | pickle.dump(train_set, f, pickle.HIGHEST_PROTOCOL) 46 | pickle.dump(test_set, f, pickle.HIGHEST_PROTOCOL) 47 | pickle.dump(cate_list, f, pickle.HIGHEST_PROTOCOL) 48 | pickle.dump((user_count, item_count, cate_count), f, pickle.HIGHEST_PROTOCOL) 49 | -------------------------------------------------------------------------------- /DIN/input.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 29, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import numpy as np 7 | 8 | #输入数据集的类 9 | 10 | class DataInput: 11 | def __init__(self, data, batch_size): 12 | 13 | self.batch_size = batch_size 14 | self.data = data 15 | self.epoch_size = len(self.data) // self.batch_size 16 | if self.epoch_size * self.batch_size < len(self.data): 17 | self.epoch_size += 1 18 | self.i = 0 19 | 20 | def __iter__(self): 21 | return self 22 | 23 | def next(self): 24 | 25 | if self.i == self.epoch_size: #已经到epoch了就停止 26 | raise StopIteration 27 | 28 | #否则就按batch大小构造batch 29 | ts = self.data[self.i * self.batch_size : min((self.i+1) * self.batch_size, 30 | len(self.data))] 31 | self.i += 1 32 | 33 | u, i, y, sl = [], [], [], [] #把ts的数据分别存到几个列表中 34 | for t in ts: 35 | u.append(t[0]) 36 | i.append(t[2]) 37 | y.append(t[3]) 38 | sl.append(len(t[1]))#记录每个ts数据第1维的长度,即用户历史行为长度 39 | max_sl = max(sl)#因为后面要计算兴趣分布,所以需要把长度统一一下 40 | 41 | hist_i = np.zeros([len(ts), max_sl], np.int64) 42 | 43 | k = 0 44 | for t in ts: 45 | for l in range(len(t[1])): 46 | hist_i[k][l] = t[1][l] #有交互的记录 47 | k += 1 48 | 49 | return self.i, (u, i, y, hist_i, sl) 50 | 51 | class DataInputTest: 52 | #test数据集的输入,和上一个类逻辑一致 53 | def __init__(self, data, batch_size): 54 | 55 | self.batch_size = batch_size 56 | self.data = data 57 | self.epoch_size = len(self.data) // self.batch_size 58 | if self.epoch_size * self.batch_size < len(self.data): 59 | self.epoch_size += 1 60 | self.i = 0 61 | 62 | def __iter__(self): 63 | return self 64 | 65 | def next(self): 66 | 67 | if self.i == self.epoch_size: 68 | raise StopIteration 69 | 70 | ts = self.data[self.i * self.batch_size : min((self.i+1) * 
self.batch_size, 71 | len(self.data))] 72 | self.i += 1 73 | 74 | u, i, j, sl = [], [], [], [] 75 | for t in ts: 76 | u.append(t[0]) 77 | i.append(t[2][0]) 78 | j.append(t[2][1]) 79 | sl.append(len(t[1])) 80 | max_sl = max(sl) 81 | 82 | hist_i = np.zeros([len(ts), max_sl], np.int64) 83 | 84 | k = 0 85 | for t in ts: 86 | for l in range(len(t[1])): 87 | hist_i[k][l] = t[1][l] 88 | k += 1 89 | 90 | return self.i, (u, i, j, hist_i, sl) 91 | -------------------------------------------------------------------------------- /DIN/train.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 29, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import os 7 | import time 8 | import pickle 9 | import random 10 | import numpy as np 11 | import tensorflow as tf 12 | import sys 13 | from input import DataInput, DataInputTest 14 | from model import Model 15 | 16 | os.environ['CUDA_VISIBLE_DEVICES'] = '1' 17 | random.seed(1234) 18 | np.random.seed(1234) 19 | tf.set_random_seed(1234) 20 | 21 | train_batch_size = 32 22 | test_batch_size = 512 23 | predict_batch_size = 32 24 | predict_users_num = 1000 25 | predict_ads_num = 100 26 | 27 | with open('dataset.pkl', 'rb') as f: 28 | train_set = pickle.load(f) 29 | test_set = pickle.load(f) 30 | cate_list = pickle.load(f) 31 | user_count, item_count, cate_count = pickle.load(f) 32 | 33 | best_auc = 0.0 34 | def calc_auc(raw_arr): 35 | """Summary 36 | Args: 37 | raw_arr (TYPE): Description 38 | Returns: 39 | TYPE: Description 40 | """ 41 | #根据预测的结果正排 42 | arr = sorted(raw_arr, key=lambda d:d[2]) 43 | 44 | auc = 0.0 45 | fp1, tp1, fp2, tp2 = 0.0, 0.0, 0.0, 0.0 46 | for record in arr: 47 | fp2 += record[0] # 没有点击 48 | tp2 += record[1] # 点击了 49 | auc += (fp2 - fp1) * (tp2 + tp1) 50 | fp1, tp1 = fp2, tp2 51 | 52 | # if all nonclick or click, disgard 53 | threshold = len(arr) - 1e-3 54 | if tp2 > threshold or fp2 > threshold: 55 | return -0.5 56 | 57 | if tp2 * fp2 > 0.0: # normal auc 58 | return (1.0 - auc / (2.0 * tp2 * fp2)) 59 | else: 60 | return None 61 | 62 | def _auc_arr(score): 63 | #把score变数组形式 64 | score_p = score[:,0] 65 | score_n = score[:,1] 66 | #print "============== p =============" 67 | #print score_p 68 | #print "============== n =============" 69 | #print score_n 70 | score_arr = [] 71 | for s in score_p.tolist(): 72 | score_arr.append([0, 1, s]) 73 | for s in score_n.tolist(): 74 | score_arr.append([1, 0, s]) 75 | return score_arr 76 | def _eval(sess, model): 77 | #评估模型 78 | auc_sum = 0.0 79 | score_arr = [] 80 | for _, uij in DataInputTest(test_set, test_batch_size):#载入数据 81 | auc_, score_ = model.eval(sess, uij)#得到分数 82 | score_arr += _auc_arr(score_) 83 | auc_sum += auc_ * len(uij[0])#计算auc 84 | test_gauc = auc_sum / len(test_set) 85 | Auc = calc_auc(score_arr) 86 | global best_auc 87 | if best_auc < test_gauc: 88 | best_auc = test_gauc#记录最好的auc 89 | model.save(sess, 'save_path/ckpt') 90 | return test_gauc, Auc 91 | 92 | def _test(sess, model): 93 | #测试模型 94 | auc_sum = 0.0 95 | score_arr = [] 96 | predicted_users_num = 0 97 | print "test sub items" 98 | for _, uij in DataInputTest(test_set, predict_batch_size):#载入数据 99 | if predicted_users_num >= predict_users_num: 100 | break 101 | score_ = model.test(sess, uij)#得到预测分数 102 | score_arr.append(score_) 103 | predicted_users_num += predict_batch_size 104 | return score_[0] 105 | 106 | gpu_options = tf.GPUOptions(allow_growth=True)#gpu 107 | with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess: 108 | 109 | model = Model(user_count, 
item_count, cate_count, cate_list, predict_batch_size, predict_ads_num) 110 | sess.run(tf.global_variables_initializer()) 111 | sess.run(tf.local_variables_initializer())#实例化模型 112 | 113 | print('test_gauc: %.4f\t test_auc: %.4f' % _eval(sess, model)) 114 | sys.stdout.flush() 115 | lr = 1.0 116 | start_time = time.time() 117 | for _ in range(50): 118 | 119 | random.shuffle(train_set)#打乱数据集 120 | 121 | epoch_size = round(len(train_set) / train_batch_size) 122 | loss_sum = 0.0 123 | for _, uij in DataInput(train_set, train_batch_size): 124 | loss = model.train(sess, uij, lr)#开始训练 125 | loss_sum += loss 126 | 127 | if model.global_step.eval() % 1000 == 0:#打印过程 128 | test_gauc, Auc = _eval(sess, model) 129 | print('Epoch %d Global_step %d\tTrain_loss: %.4f\tEval_GAUC: %.4f\tEval_AUC: %.4f' % 130 | (model.global_epoch_step.eval(), model.global_step.eval(), 131 | loss_sum / 1000, test_gauc, Auc)) 132 | sys.stdout.flush()#stdout刷新 133 | loss_sum = 0.0 134 | 135 | if model.global_step.eval() % 336000 == 0: 136 | lr = 0.1 137 | 138 | print('Epoch %d DONE\tCost time: %.2f' % 139 | (model.global_epoch_step.eval(), time.time()-start_time)) 140 | sys.stdout.flush() 141 | model.global_epoch_step_op.eval() 142 | 143 | print('best test_gauc:', best_auc) 144 | sys.stdout.flush() 145 | -------------------------------------------------------------------------------- /DRL-REC/README.md: -------------------------------------------------------------------------------- 1 | # Deep Reinforcement Learning for List-wise Recommendations 2 | 3 | 源码笔记:强化学习做交互式List推荐,京东的论文所以无数据,是由模拟器实时模拟出用户,然后和用户做交互。 4 | 5 | paper下载:https://arxiv.org/abs/1801.00209 6 | 7 | 建议阅读顺序:simulator-->pre_process_data-->replay_buffer-->build_estimator 8 | 9 | # 10 | 11 | 原paper: Deep Reinforcement Learning for List-wise Recommendations 12 | 13 | 原code: https://github.com/luozachary/drl-rec 14 | -------------------------------------------------------------------------------- /DRL-REC/pre_process_data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 28, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import itertools 7 | import pandas as pd 8 | import numpy as np 9 | 10 | #这个py主要是随机生成数据集... 11 | 12 | 13 | SIGMA = 0.9 #折损因子 14 | 15 | 16 | def calculate_reward(row): 17 | r = 0. 
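    # Worked example (hypothetical reward string): with SIGMA = 0.9 and
    # row['reward'] == 'show|click|click', the loop below adds
    # 0.9**0 * 0 + 0.9**1 * 1 + 0.9**2 * 1 = 0.9 + 0.81 = 1.71,
    # i.e. clicks are discounted geometrically by position and 'show' contributes 0.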
18 | for i, v in enumerate(row['reward'].split('|')):#reward按折损因子递减 19 | r += np.power(SIGMA, i) * (0 if v == "show" else 1)#只show是0 20 | return r 21 | 22 | 23 | def process_data(data_path, recall_path): 24 | #载入数据 25 | data = pd.read_csv(data_path, sep='\t') 26 | for org in ["state", "action", "n_state"]: 27 | target = org + "_float" 28 | data[target] = data.apply( 29 | lambda row: [item for sublist in 30 | list(map(lambda t: list(np.array(t.split(','), dtype=np.float64)), row[org].split('|'))) 31 | for item in sublist 32 | ], axis=1 33 | ) 34 | data['reward_float'] = data.apply(calculate_reward, axis=1) 35 | 36 | recall_data = pd.read_csv(recall_path, sep='\t') 37 | recall_data['embed_float'] = recall_data.apply( 38 | lambda row: np.array(row['embedding'][1:-1].split(','), dtype=np.float64).tolist(), axis=1 39 | ) 40 | recall_tmp = list() 41 | for idx, row in recall_data.iterrows(): 42 | for i in range(4): 43 | recall_tmp.append(row['embed_float'][i * 30: (i + 1) * 30]) 44 | recall_tmp.sort() 45 | recall = list(l for l, _ in itertools.groupby(recall_tmp)) 46 | 47 | return data, recall 48 | 49 | 50 | def gen_samples(id_num=100, sample_size=256): 51 | #生成数据集 52 | from pandas import DataFrame 53 | ids = np.random.randint(0, 100, size=id_num) #随机生成整数id 54 | ids = [str(idx) for idx in ids] 55 | embeddings = np.random.randn(id_num, 30) 56 | id_emb_dic = dict(zip(ids, embeddings))#得到embedding 57 | #这个recall空间是之后网络得到一个最大得分action,可对应在该空间选择一个item作为推荐 58 | colunms_name = ['state', 'action', 'reward', 'n_state', 'recall'] 59 | sample_data = [] 60 | for i in range(sample_size): #256个采样大小 61 | #五个维度都是随机的 62 | state_len = np.random.randint(1, 12) 63 | state = [str(val) for val in np.random.choice(ids, size=state_len)] 64 | n_state = [str(val) for val in np.random.choice(ids, size=state_len)] 65 | action = str(np.random.choice(ids, size=2)[0]) 66 | reward = np.random.rand() 67 | recall = [action] 68 | sample_data.append((state, action, reward, n_state, recall))#加入列表中 69 | data = DataFrame(sample_data, columns=colunms_name) 70 | write_file(id_emb_dic, sample_data) 71 | return id_emb_dic, data 72 | 73 | 74 | def write_file(embedding_file, sample_data): 75 | #把随机生成的结果写到文件中 76 | with open("embed.csv", "w") as fout: 77 | head = 'item_id\tembedding\n' 78 | fout.write(head) 79 | for item_id, emb in embedding_file.items(): 80 | emb_str = ','.join([str(v) for v in emb]) 81 | outline = '%s\t%s\n' % (item_id, emb_str)#商品id和embedding 82 | fout.write(outline) 83 | print("wrote embedding done") 84 | 85 | with open("train.csv", "w") as fout_sample: 86 | columns_name = ['state', 'action', 'reward', 'n_state', 'recall'] 87 | head = '%s\n' % ('\t'.join(columns_name)) 88 | fout_sample.write(head) 89 | for sample in sample_data: 90 | s_state = '|'.join(sample[0]) 91 | action = sample[1] 92 | s_reward = str(sample[2]) 93 | s_n_state = '|'.join(sample[3]) 94 | s_recall = '|'.join(sample[4])#写入状态 95 | outline = '{}\t{}\t{}\t{}\t{}\n'.format(s_state, action, s_reward, s_n_state, s_recall) 96 | fout_sample.write(outline) 97 | print("wrote sample data done") 98 | 99 | 100 | data, recall_data = process_data("train.csv", "embed.csv") 101 | -------------------------------------------------------------------------------- /DRL-REC/replay_buffer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 28, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | from collections import deque 7 | from simulator import data 8 | import random 9 | 10 | #经验重放的记忆 M 11 | 12 | class 
RelayBuffer(object): 13 | def __init__(self, buffer_size): 14 | self.buffer_size = buffer_size 15 | self.count = 0 16 | self.buffer = deque() 17 | for idx, row in data.iterrows(): 18 | sample = list() 19 | sample.append(row['state_float'])#当前状态 20 | sample.append(row['action_float'])#动作 21 | sample.append(row['reward_float'])#奖励 22 | sample.append(row['n_state_float'])#接下来的状态 23 | self.buffer.append(sample) 24 | 25 | def add(self, state, action, reward, next_reward): 26 | #存入(状态,动作,动作的奖励,下一个状态) 27 | experience = (state, action, reward, next_reward) 28 | if self.count < self.buffer_size: 29 | self.buffer.append(experience) 30 | self.count += 1 31 | else: #如果M满了 32 | self.buffer.popleft()#出队最早的记忆 33 | self.buffer.append(experience)#再把自己加进去 34 | 35 | def size(self): 36 | return self.count #存储M当前容量 37 | 38 | def sample_batch(self, batch_size):#随机采样 39 | return random.sample(self.buffer, batch_size) 40 | 41 | def clear(self):#M清零 42 | self.buffer.clear() 43 | self.count = 0 44 | -------------------------------------------------------------------------------- /DRL-REC/simulator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 28, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | from pre_process_data import data 7 | import numpy as np 8 | 9 | #线上User-Agent交互仿真环境构建 10 | #仿真器主要基于历史数据,所以先存储历史真实数据的((state,action)-reward),再模拟仿真生成 11 | 12 | class Simulator(object): 13 | def __init__(self, alpha=0.5, sigma=0.9): 14 | self.data = data 15 | self.alpha = alpha 16 | self.sigma = sigma 17 | self.init_state = self.reset() 18 | self.current_state = self.init_state 19 | self.rewards, self.group_sizes, self.avg_states, self.avg_actions = self.avg_group() 20 | 21 | def reset(self): 22 | #reset初始状态 23 | init_state = np.array(self.data['state_float'].sample(1).values[0]).reshape((1, 12, 30)) 24 | self.current_state = init_state 25 | return init_state 26 | 27 | def step(self, action): 28 | #由于是交互的,所以一轮的用户点击之后,历史的更新是去掉以前的,然后补上新的,作为新状态 29 | #移掉第一个item,加入下一个item,就可以构建一个新的推荐列表 30 | simulate_rewards, result = self.simulate_reward((self.current_state.reshape((1, 360)), 31 | action.reshape((1, 120)))) 32 | for i, r in enumerate(simulate_rewards.split('|')): 33 | if r != "show": #show动作的reward是0 34 | # self.current_state.append(action[i]) 35 | tmp = np.append(self.current_state[0], action[i].reshape((1, 30)), axis=0) 36 | tmp = np.delete(tmp, 0, axis=0) 37 | self.current_state = tmp[np.newaxis, :]#得到下一个状态 38 | return result, self.current_state 39 | 40 | def avg_group(self): 41 | """计算一组(按照历史奖励序列分组)的平均value以得到获得每个奖励序列的可能性""" 42 | rewards = list() 43 | avg_states = list() 44 | avg_actions = list() 45 | group_sizes = list() 46 | for reward, group in self.data.groupby(['reward']):#按照reward进行分组 47 | n_size = group.shape[0] 48 | state_values = group['state_float'].values.tolist() 49 | action_values = group['action_float'].values.tolist() 50 | #求范数,计算生成的(state,action)对和历史(state,action)对的cosine相似度。 51 | avg_states.append( 52 | np.sum(state_values / np.linalg.norm(state_values, 2, axis=1)[:, np.newaxis], axis=0) / n_size 53 | )#平均的状态 54 | avg_actions.append( 55 | np.sum(action_values / np.linalg.norm(action_values, 2, axis=1)[:, np.newaxis], axis=0) / n_size 56 | )#平均的动作 57 | group_sizes.append(n_size)#加入到列表中 58 | rewards.append(reward) 59 | return rewards, group_sizes, avg_states, avg_actions 60 | 61 | def simulate_reward(self, pair): 62 | """使用平均值作为模拟的reward 63 | Args: 64 | pair (tuple): pair 65 | Returns: 66 | simulated reward for the pair. 
67 | """ 68 | probability = list() 69 | denominator = 0. 70 | max_prob = 0. 71 | result = 0. 72 | simulate_rewards = "" 73 | #换种方式计算reward 74 | for s, a, r in zip(self.avg_states, self.avg_actions, self.rewards): 75 | #同样是求cosine相似度,求完之后,要计算这个列表所有排列组合的概率 76 | numerator = self.alpha * ( 77 | np.dot(pair[0], s)[0] / (np.linalg.norm(pair[0], 2) * np.linalg.norm(s, 2)) 78 | ) + (1 - self.alpha) * ( 79 | np.dot(pair[1], a)[0] / (np.linalg.norm((pair[1], 2) * np.linalg.norm(a, 2))) 80 | ) 81 | probability.append(numerator)#结果会多乘一个概率p 82 | denominator += numerator #计数便于计算一个和为1的概率 83 | if numerator > max_prob: 84 | max_prob = numerator 85 | simulate_rewards = r 86 | probability /= denominator #概率p 87 | for p, r in zip(probability, self.rewards): 88 | for k, reward in enumerate(r.split('|')): 89 | result += p * np.power(self.sigma, k) * (0 if reward == "show" else 1) #show的reward为0 90 | 91 | # calculate simulated reward by group 92 | # for i, reward in enumerate(self.rewards): 93 | # numerator = self.group_sizes[i] * ( 94 | # self.alpha * (np.dot(pair[0], self.avg_states[i])[0] / np.linalg.norm(pair[0], 2)) + 95 | # (1 - self.alpha) * (np.dot(pair[1], self.avg_actions[i]) / np.linalg.norm(pair[1], 2)) 96 | # ) 97 | # probability.append(numerator) 98 | # denominator += numerator 99 | # probability /= denominator 100 | # # max probability 101 | # simulate_rewards = self.rewards[int(np.argmax(probability))] 102 | 103 | # calculate simulated reward in normal way 104 | # for idx, row in data.iterrows(): 105 | # state_values = row['state_float'] 106 | # action_values = row['action_float'] 107 | # numerator = self.alpha * ( 108 | # np.dot(pair[0], state_values)[0] / (np.linalg.norm(pair[0], 2) * np.linalg.norm(state_values, 2)) 109 | # ) + (1 - self.alpha) * ( 110 | # np.dot(pair[1], action_values)[0] / (np.linalg.norm(pair[1], 2) * np.linalg.norm(action_values, 2)) 111 | # ) 112 | # probability.append(numerator) 113 | # denominator += numerator 114 | # probability /= denominator 115 | # simulate_rewards = data.iloc[int(np.argmax(probability))]['reward'] 116 | 117 | # for k, reward in enumerate(simulate_rewards.split('|')): 118 | # result += np.power(self.sigma, k) * (0 if reward == "show" else 1) 119 | return simulate_rewards, result 120 | -------------------------------------------------------------------------------- /DeepWalk/README.md: -------------------------------------------------------------------------------- 1 | # DeepWalk: Online Learning of Social Representations(DeepWalk) 2 | 逐行源码阅读中文笔记。 3 | 4 | blog解读:https://blog.csdn.net/qq_39388410/article/details/103859078 5 | 6 | # 7 | 8 | 原paper:Perozzi B, Al-Rfou R, Skiena S. 
Deepwalk: Online learning of social representations (KDD 2014) 9 | 10 | 浅梦大佬即插即玩的开源:https://github.com/shenweichen/GraphEmbedding/ 11 | -------------------------------------------------------------------------------- /DeepWalk/classify.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 15, 2020 3 | @author: nakaizura 4 | ''' 5 | from __future__ import print_function 6 | 7 | 8 | import numpy 9 | from sklearn.metrics import f1_score, accuracy_score 10 | from sklearn.multiclass import OneVsRestClassifier 11 | from sklearn.preprocessing import MultiLabelBinarizer 12 | 13 | # 分类器是使用sklearn的OneVsRestClassifier处理多分类任务。 14 | # 对n类,会建立n个二分类器,每个分类器针对其中一个类别和剩余类别进行分类。 15 | 16 | class TopKRanker(OneVsRestClassifier): 17 | #注意这里OneVsRestClassifier 18 | def predict(self, X, top_k_list): 19 | #预测分类概率 20 | probs = numpy.asarray(super(TopKRanker, self).predict_proba(X)) 21 | all_labels = [] 22 | for i, k in enumerate(top_k_list): #对所有Y选择概率最大的类别 23 | probs_ = probs[i, :] 24 | labels = self.classes_[probs_.argsort()[-k:]].tolist()#排序得到label 25 | probs_[:] = 0 #one-hot操作,只有label处为1,其他地方都为0 26 | probs_[labels] = 1 27 | all_labels.append(probs_) 28 | return numpy.asarray(all_labels) 29 | 30 | 31 | class Classifier(object): 32 | 33 | def __init__(self, embeddings, clf): 34 | self.embeddings = embeddings 35 | self.clf = TopKRanker(clf) 36 | self.binarizer = MultiLabelBinarizer(sparse_output=True) 37 | 38 | def train(self, X, Y, Y_all): 39 | self.binarizer.fit(Y_all) 40 | X_train = [self.embeddings[x] for x in X] 41 | Y = self.binarizer.transform(Y) #多标签二值化 42 | self.clf.fit(X_train, Y) #训练分类器 43 | 44 | def evaluate(self, X, Y): 45 | top_k_list = [len(l) for l in Y] 46 | Y_ = self.predict(X, top_k_list)#预测一个类别 47 | Y = self.binarizer.transform(Y) #多标签二值化 48 | averages = ["micro", "macro", "samples", "weighted"] 49 | results = {} 50 | for average in averages: #算F1 51 | results[average] = f1_score(Y, Y_, average=average) 52 | results['acc'] = accuracy_score(Y,Y_) 53 | print('-------------------') 54 | print(results) 55 | return results 56 | print('-------------------') 57 | 58 | def predict(self, X, top_k_list): 59 | X_ = numpy.asarray([self.embeddings[x] for x in X]) 60 | Y = self.clf.predict(X_, top_k_list=top_k_list) 61 | return Y 62 | 63 | def split_train_evaluate(self, X, Y, train_precent, seed=0): 64 | #设定状态,记录下数组被打乱的操作,以使打乱前后实例与标签的一一对应 65 | state = numpy.random.get_state() 66 | 67 | training_size = int(train_precent * len(X)) 68 | numpy.random.seed(seed) #固定随机种子便于复现结果 69 | shuffle_indices = numpy.random.permutation(numpy.arange(len(X))) 70 | X_train = [X[shuffle_indices[i]] for i in range(training_size)] 71 | Y_train = [Y[shuffle_indices[i]] for i in range(training_size)] 72 | #前80训练,后20测试 73 | X_test = [X[shuffle_indices[i]] for i in range(training_size, len(X))] 74 | Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))] 75 | 76 | self.train(X_train, Y_train, Y) 77 | numpy.random.set_state(state)#恢复打乱前的状态 78 | return self.evaluate(X_test, Y_test) 79 | 80 | 81 | def read_node_label(filename, skip_head=False): 82 | fin = open(filename, 'r') 83 | X = [] 84 | Y = [] 85 | while 1: 86 | if skip_head: 87 | fin.readline() 88 | l = fin.readline() 89 | if l == '': 90 | break 91 | vec = l.strip().split(' ') 92 | X.append(vec[0]) 93 | Y.append(vec[1:]) 94 | fin.close() 95 | return X, Y 96 | -------------------------------------------------------------------------------- /DeepWalk/deepwalk.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 15, 2020 3 | @author: nakaizura 4 | ''' 5 | from ..walker import RandomWalker 6 | from gensim.models import Word2Vec 7 | import pandas as pd 8 | 9 | #逻辑为先随机游走得到一个“句子”,然后直接拿句子,gensim训练向量就行了。 10 | 11 | class DeepWalk: 12 | def __init__(self, graph, walk_length, num_walks, workers=1): 13 | 14 | self.graph = graph 15 | self.w2v_model = None 16 | self._embeddings = {} 17 | 18 | self.walker = RandomWalker( 19 | graph, p=1, q=1, ) 20 | self.sentences = self.walker.simulate_walks( 21 | num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1) 22 | 23 | def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs): 24 | #设定一些关于gensim的参数 25 | kwargs["sentences"] = self.sentences 26 | kwargs["min_count"] = kwargs.get("min_count", 0) #词频阈值,这里句子量很少设为0 27 | kwargs["size"] = embed_size #最后得到128维的节点向量 28 | kwargs["sg"] = 1 # skip gram的模式来训练 29 | kwargs["hs"] = 1 # deepwalk use Hierarchical Softmax 30 | kwargs["workers"] = workers 31 | kwargs["window"] = window_size 32 | kwargs["iter"] = iter 33 | 34 | print("Learning embedding vectors...") 35 | model = Word2Vec(**kwargs) #直接用gensim的模型 36 | print("Learning embedding vectors done!") 37 | 38 | self.w2v_model = model 39 | return model 40 | 41 | def get_embeddings(self,): 42 | #得到训练好后的向量 43 | if self.w2v_model is None: 44 | print("model not train") 45 | return {} 46 | 47 | self._embeddings = {} 48 | for word in self.graph.nodes():#建立一个所有节点的向量索引表 49 | self._embeddings[word] = self.w2v_model.wv[word] 50 | 51 | return self._embeddings 52 | -------------------------------------------------------------------------------- /DeepWalk/deepwalk_wiki.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 15, 2020 3 | @author: nakaizura 4 | ''' 5 | import numpy as np 6 | 7 | from ge.classify import read_node_label, Classifier 8 | from ge import DeepWalk 9 | from sklearn.linear_model import LogisticRegression 10 | 11 | import matplotlib.pyplot as plt 12 | import networkx as nx 13 | from sklearn.manifold import TSNE 14 | 15 | #networkx是专门用来存储图,构建图和分析图的库,操作真的超级方便。 16 | 17 | def evaluate_embeddings(embeddings): 18 | #读入真实的分类label 19 | X, Y = read_node_label('../data/wiki/wiki_labels.txt') 20 | tr_frac = 0.8 #80%的节点用于训练分类器,其余的用于测试 21 | print("Training classifier using {:.2f}% nodes...".format( 22 | tr_frac * 100)) 23 | #应用分类器对节点进行分类以评估向量的质量 24 | clf = Classifier(embeddings=embeddings, clf=LogisticRegression()) 25 | clf.split_train_evaluate(X, Y, tr_frac) 26 | 27 | 28 | def plot_embeddings(embeddings,): 29 | X, Y = read_node_label('../data/wiki/wiki_labels.txt') 30 | 31 | emb_list = [] 32 | for k in X: 33 | emb_list.append(embeddings[k]) 34 | emb_list = np.array(emb_list) 35 | 36 | model = TSNE(n_components=2)#用TSNE进行降维 37 | node_pos = model.fit_transform(emb_list) 38 | 39 | color_idx = {} 40 | for i in range(len(X)): 41 | color_idx.setdefault(Y[i][0], []) #类别 42 | color_idx[Y[i][0]].append(i) #id 43 | 44 | for c, idx in color_idx.items(): #不同类别不同颜色 45 | plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c) 46 | plt.legend() 47 | plt.show() 48 | 49 | 50 | if __name__ == "__main__": 51 | #读入边列表,文件中的每一行有两个节点,表示连接这两个节点的边。 52 | #直接用networkx读入就行,很方便的操作。 53 | G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt', 54 | create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) 55 | 56 | #实例化模型,“句子”长度为10,80次游走。 57 | model = DeepWalk(G, walk_length=10, num_walks=80, workers=1) 58 | 
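    # model.sentences now holds the random-walk corpus built in DeepWalk.__init__
    # (num_walks=80 passes over the node set, each walk 10 nodes long);
    # train() below simply feeds those node-id "sentences" to gensim's skip-gram Word2Vec.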
model.train(window_size=5, iter=3) #训练模型,关于gensim w2v的参数都默认在train里面 59 | embeddings = model.get_embeddings() #得到Embedding向量 60 | 61 | evaluate_embeddings(embeddings) #应用节点分类来评估嵌入向量的质量 62 | plot_embeddings(embeddings) #降成二维画在图中可视化 63 | -------------------------------------------------------------------------------- /GAE/README.md: -------------------------------------------------------------------------------- 1 | # Graph Auto-Encoders (GAE) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读:https://blog.csdn.net/qq_39388410/article/details/107895198 6 | 7 | # 8 | 9 | 原code:pytorch_geomatric 10 | -------------------------------------------------------------------------------- /GAE/pytorch_geometric_gae.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Feb 7, 2021 3 | @author: nakaizura 4 | ''' 5 | 6 | #GAE思路比较简单,大概就是用中间隐特征z来重建Graph,具体可以看博文,不赘述了。 7 | 8 | 9 | import torch 10 | from sklearn.metrics import roc_auc_score, average_precision_score 11 | from torch_geometric.utils import (negative_sampling, remove_self_loops, 12 | add_self_loops) 13 | 14 | from ..inits import reset 15 | 16 | EPS = 1e-15 #预测概率的控制值,以免求log的时候有问题 17 | MAX_LOGSTD = 10 18 | 19 | 20 | class InnerProductDecoder(torch.nn.Module): 21 | r"""这内积解码器,即将隐层表示Z内积之后来重建原来的Graph 22 | 值得注意的,有两个forward可以分别hold住全部重建和只对局部采样重建""" 23 | def forward(self, z, edge_index, sigmoid=True): 24 | #计算节点对之间存在边的概率 25 | #edge_index分别存的邻接矩阵的行和列,所以取0和1直接可计算 26 | value = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=1) 27 | 28 | #Sigmoid控制是否非线性 29 | return torch.sigmoid(value) if sigmoid else value 30 | 31 | 32 | def forward_all(self, z, sigmoid=True): 33 | #计算所有节点,所以是按照公式直接内积 34 | adj = torch.matmul(z, z.t()) 35 | 36 | #Sigmoid控制是否非线性 37 | return torch.sigmoid(adj) if sigmoid else adj 38 | 39 | 40 | 41 | class GAE(torch.nn.Module): 42 | r"""GAE的代码""" 43 | def __init__(self, encoder, decoder=None): 44 | super(GAE, self).__init__() 45 | self.encoder = encoder #这里encoder的设置可以就是普通的GCN或者其他模型 46 | self.decoder = InnerProductDecoder() if decoder is None else decoder #decoder是上面的class 47 | GAE.reset_parameters(self) 48 | 49 | def reset_parameters(self): 50 | reset(self.encoder) 51 | reset(self.decoder) 52 | 53 | 54 | def encode(self, *args, **kwargs): 55 | #这里可以就放入GCN来得到z 56 | return self.encoder(*args, **kwargs) 57 | 58 | 59 | def decode(self, *args, **kwargs): 60 | #根据z计算边概率来重建Graph 61 | return self.decoder(*args, **kwargs) 62 | 63 | 64 | def recon_loss(self, z, pos_edge_index, neg_edge_index=None): 65 | r"""计算重建损失,这里会使用正负例采样来计算交叉熵""" 66 | 67 | #求正例的重建分数,这里看decoder的输出可以知道调用的是采样版的forward 68 | pos_loss = -torch.log( 69 | self.decoder(z, pos_edge_index, sigmoid=True) + EPS).mean() 70 | 71 | #负例中不含自环(self-loops),所以先添进去方便后面负采样 72 | pos_edge_index, _ = remove_self_loops(pos_edge_index) 73 | pos_edge_index, _ = add_self_loops(pos_edge_index) 74 | 75 | if neg_edge_index is None: #负采样得到负例的index 76 | neg_edge_index = negative_sampling(pos_edge_index, z.size(0)) 77 | 78 | #计算负例的重建分数 79 | neg_loss = -torch.log(1 - 80 | self.decoder(z, neg_edge_index, sigmoid=True) + 81 | EPS).mean() 82 | 83 | return pos_loss + neg_loss #加和得到总loss 84 | 85 | 86 | def test(self, z, pos_edge_index, neg_edge_index): 87 | r"""Given latent variables :obj:`z`, positive edges 88 | :obj:`pos_edge_index` and negative edges :obj:`neg_edge_index`, 89 | computes area under the ROC curve (AUC) and average precision (AP) 90 | scores. 91 | 92 | Args: 93 | z (Tensor): The latent space :math:`\mathbf{Z}`. 
94 | pos_edge_index (LongTensor): The positive edges to evaluate 95 | against. 96 | neg_edge_index (LongTensor): The negative edges to evaluate 97 | against. 98 | """ 99 | pos_y = z.new_ones(pos_edge_index.size(1)) #正例的y为1 100 | neg_y = z.new_zeros(neg_edge_index.size(1)) #负例的y为0 101 | y = torch.cat([pos_y, neg_y], dim=0) #这是真实标签 102 | 103 | #得到GAE的预测结果 104 | pos_pred = self.decoder(z, pos_edge_index, sigmoid=True) 105 | neg_pred = self.decoder(z, neg_edge_index, sigmoid=True) 106 | pred = torch.cat([pos_pred, neg_pred], dim=0) 107 | 108 | y, pred = y.detach().cpu().numpy(), pred.detach().cpu().numpy() #detach出来算指标 109 | 110 | return roc_auc_score(y, pred), average_precision_score(y, pred) #计算AUC和AP 111 | -------------------------------------------------------------------------------- /GAT/README.md: -------------------------------------------------------------------------------- 1 | # Graph Attention Networks(GAT) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读:https://blog.csdn.net/qq_39388410/article/details/103903631 6 | 7 | 建议阅读顺序: utils.py-->layers.py-->models.py-->train.py 8 | 9 | # 10 | 11 | 原paper: 12 | ``` 13 | @article{ 14 | velickovic2018graph, 15 | title="{Graph Attention Networks}", 16 | author={Veli{\v{c}}kovi{\'{c}}, Petar and Cucurull, Guillem and Casanova, Arantxa and Romero, Adriana and Li{\`{o}}, Pietro and Bengio, Yoshua}, 17 | journal={International Conference on Learning Representations}, 18 | year={2018}, 19 | url={https://openreview.net/forum?id=rJXMpikCZ}, 20 | note={accepted as poster}, 21 | } 22 | ``` 23 | 24 | 原code: https://github.com/PetarV-/GAT 25 | -------------------------------------------------------------------------------- /GAT/models.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 26, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from layers import GraphAttentionLayer, SpGraphAttentionLayer 10 | 11 | 12 | class GAT(nn.Module): 13 | def __init__(self, nfeat, nhid, nclass, dropout, alpha, nheads): 14 | """密集版本的GAT.""" 15 | #nfeat,nhid, nclass是输入层,特征,类别的数目 16 | #alpha控制leaky的斜率,实现代码主要在layers.py中 17 | super(GAT, self).__init__() 18 | self.dropout = dropout 19 | 20 | #直接调用多头的Attention衡量node领域 21 | self.attentions = [GraphAttentionLayer(nfeat, nhid, dropout=dropout, alpha=alpha, concat=True) for _ in range(nheads)] 22 | for i, attention in enumerate(self.attentions): 23 | self.add_module('attention_{}'.format(i), attention) 24 | #还需要一个Attention衡量多头,得到分类结果 25 | self.out_att = GraphAttentionLayer(nhid * nheads, nclass, dropout=dropout, alpha=alpha, concat=False) 26 | 27 | def forward(self, x, adj):#输入特征和邻接矩阵 28 | x = F.dropout(x, self.dropout, training=self.training) 29 | x = torch.cat([att(x, adj) for att in self.attentions], dim=1)#concat所有头 30 | x = F.dropout(x, self.dropout, training=self.training) 31 | x = F.elu(self.out_att(x, adj))#ELU激活 32 | return F.log_softmax(x, dim=1) 33 | 34 | 35 | class SpGAT(nn.Module): 36 | def __init__(self, nfeat, nhid, nclass, dropout, alpha, nheads): 37 | """稀疏版本的GAT.""" 38 | #和密集GAT差不多,计算稀疏注意力的代码也在layers.py中。思路一致。 39 | super(SpGAT, self).__init__() 40 | self.dropout = dropout 41 | 42 | self.attentions = [SpGraphAttentionLayer(nfeat, 43 | nhid, 44 | dropout=dropout, 45 | alpha=alpha, 46 | concat=True) for _ in range(nheads)] 47 | for i, attention in enumerate(self.attentions): 48 | self.add_module('attention_{}'.format(i), attention) 49 | 50 | self.out_att = SpGraphAttentionLayer(nhid * nheads, 51 
| nclass, 52 | dropout=dropout, 53 | alpha=alpha, 54 | concat=False) 55 | 56 | def forward(self, x, adj): 57 | x = F.dropout(x, self.dropout, training=self.training) 58 | x = torch.cat([att(x, adj) for att in self.attentions], dim=1) 59 | x = F.dropout(x, self.dropout, training=self.training) 60 | x = F.elu(self.out_att(x, adj)) 61 | return F.log_softmax(x, dim=1) 62 | -------------------------------------------------------------------------------- /GAT/utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 26, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import numpy as np 7 | import scipy.sparse as sp 8 | import torch 9 | 10 | #scipy.sparse库中提供了多种表示稀疏矩阵的格式,同时支持稀疏矩阵的加、减、乘、除和幂等。 11 | 12 | 13 | def encode_onehot(labels): 14 | #标签one-hot 15 | classes = set(labels) 16 | #去重标签,cora数据集的维度0是id,1-1433是节点特征向量,1434是label 17 | #class为键,值是先创建一个对角为1的矩阵取其第i行(其实就是在对应的位置为1了)。 18 | classes_dict = {c: np.identity(len(classes))[i, :] for i, c in enumerate(classes)} 19 | labels_onehot = np.array(list(map(classes_dict.get, labels)), dtype=np.int32) 20 | return labels_onehot 21 | 22 | 23 | def load_data(path="./data/cora/", dataset="cora"): 24 | """ 25 | 载入标准network数据集。 26 | cora是2707个有引用关系的一堆论文,利用GAT用半监督的方式进行7分类(该论文的领域)。 27 | """ 28 | print('Loading {} dataset...'.format(dataset)) 29 | 30 | #节点,#从文件中生成数组 31 | idx_features_labels = np.genfromtxt("{}{}.content".format(path, dataset), dtype=np.dtype(str)) 32 | #csr(压缩稀疏行矩阵),即用行索引、列索引和值压缩。 33 | features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32) 34 | labels = encode_onehot(idx_features_labels[:, -1]) #one-hot label 35 | 36 | #----------构建大图Graph---------------- 37 | idx = np.array(idx_features_labels[:, 0], dtype=np.int32) 38 | idx_map = {j: i for i, j in enumerate(idx)} #从样本id到样本索引 39 | #边信息。即cite引用关系。 40 | edges_unordered = np.genfromtxt("{}{}.cites".format(path, dataset), dtype=np.int32) 41 | edges = np.array(list(map(idx_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape) 42 | #邻接矩阵。但是sp稀疏矩阵的表示,所以非对称。 43 | adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])), shape=(labels.shape[0], labels.shape[0]), dtype=np.float32) 44 | 45 | #构建对称的邻接矩阵 46 | adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj) 47 | 48 | features = normalize_features(features)#非对称归一化特征 49 | adj = normalize_adj(adj + sp.eye(adj.shape[0]))#对称归一化邻接矩阵 50 | 51 | 52 | #划分数据集,按索引将数据集划分为训练集,验证集和测试集。 53 | idx_train = range(140) #范围是[0,140),前面是True,后面是False 54 | idx_val = range(200, 500) #同上 55 | idx_test = range(500, 1500) #同上 56 | 57 | #转换为pytorch下的tensor 58 | adj = torch.FloatTensor(np.array(adj.todense())) 59 | features = torch.FloatTensor(np.array(features.todense())) 60 | labels = torch.LongTensor(np.where(labels)[1]) 61 | 62 | idx_train = torch.LongTensor(idx_train) 63 | idx_val = torch.LongTensor(idx_val) 64 | idx_test = torch.LongTensor(idx_test) 65 | 66 | return adj, features, labels, idx_train, idx_val, idx_test 67 | 68 | 69 | def normalize_adj(mx): 70 | """对称归一化,Row-normalize sparse matrix""" 71 | #D^(-1/2) * A * D^(-1/2) 72 | rowsum = np.array(mx.sum(1)) #adj.sum(1)计算每行元素和,得到度矩阵再D^-0.5 73 | r_inv_sqrt = np.power(rowsum, -0.5).flatten() 74 | r_inv_sqrt[np.isinf(r_inv_sqrt)] = 0. 
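    #(补充示意,非原仓库注释)到这一步 r_inv_sqrt 就是每个节点度的 -1/2 次方,孤立节点(度为 0)产生的 inf 已被置 0
    #下面两行等价于对称归一化 D^(-1/2) * A * D^(-1/2)
    #一个小例子(假设的 2 节点图,已加自环):A=[[1,1],[1,1]],每行度为 2,r_inv_sqrt=[0.707, 0.707],
    #归一化后每个元素都是 0.5,相当于同时按行、按列做了"平均"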
75 | r_mat_inv_sqrt = sp.diags(r_inv_sqrt) 76 | return mx.dot(r_mat_inv_sqrt).transpose().dot(r_mat_inv_sqrt)#得到归一化的A_norm 77 | 78 | 79 | def normalize_features(mx): 80 | """非对称归一化,Row-normalize sparse matrix""" 81 | #D^(-1) * A 82 | rowsum = np.array(mx.sum(1)) 83 | r_inv = np.power(rowsum, -1).flatten() 84 | r_inv[np.isinf(r_inv)] = 0. 85 | r_mat_inv = sp.diags(r_inv) 86 | mx = r_mat_inv.dot(mx) 87 | return mx 88 | 89 | 90 | def accuracy(output, labels): 91 | #准确率评估函数 92 | preds = output.max(1)[1].type_as(labels) 93 | correct = preds.eq(labels).double() 94 | correct = correct.sum() 95 | return correct / len(labels) 96 | -------------------------------------------------------------------------------- /GCN/README.md: -------------------------------------------------------------------------------- 1 | # Graph Convolutional Networks(GCN) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读:https://blog.csdn.net/qq_39388410/article/details/102730998 6 | 7 | 建议阅读顺序: keras_setup.py-->keras_utils.py-->keras_graph.py-->keras_train.py-->pytorch_geometric_gcn.py 8 | 9 | # 10 | 11 | 原paper: Thomas N. Kipf, Max Welling, Semi-Supervised Classification with Graph Convolutional Networks (ICLR 2017) 12 | 13 | keras code: https://github.com/tkipf/keras-gcn 14 | -------------------------------------------------------------------------------- /GCN/keras_graph.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 25, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | from __future__ import print_function 7 | #future处理新功能版本不兼容问题,加上这句话,所有的print函数将是3.x的模式(即便环境是2.x) 8 | 9 | from keras import activations, initializers, constraints 10 | from keras import regularizers 11 | from keras.engine import Layer 12 | import keras.backend as K 13 | 14 | 15 | class GraphConvolution(Layer): 16 | """Basic graph convolution layer as in https://arxiv.org/abs/1609.02907""" 17 | #基础图卷积,这似乎只是个基类。作者直接在train.py里面定义了两层的GCN模型。 18 | #不过更优雅的写法是重写这个类。 19 | def __init__(self, units, support=1, 20 | activation=None, 21 | use_bias=True, 22 | kernel_initializer='glorot_uniform', 23 | bias_initializer='zeros', 24 | kernel_regularizer=None, 25 | bias_regularizer=None, 26 | activity_regularizer=None, 27 | kernel_constraint=None, 28 | bias_constraint=None, 29 | **kwargs): 30 | if 'input_shape' not in kwargs and 'input_dim' in kwargs: 31 | kwargs['input_shape'] = (kwargs.pop('input_dim'),) 32 | super(GraphConvolution, self).__init__(**kwargs) 33 | self.units = units 34 | self.activation = activations.get(activation) 35 | self.use_bias = use_bias 36 | self.kernel_initializer = initializers.get(kernel_initializer) 37 | self.bias_initializer = initializers.get(bias_initializer) 38 | self.kernel_regularizer = regularizers.get(kernel_regularizer) 39 | self.bias_regularizer = regularizers.get(bias_regularizer) 40 | self.activity_regularizer = regularizers.get(activity_regularizer) 41 | self.kernel_constraint = constraints.get(kernel_constraint) 42 | self.bias_constraint = constraints.get(bias_constraint) 43 | self.supports_masking = True 44 | 45 | self.support = support 46 | assert support >= 1 47 | 48 | def compute_output_shape(self, input_shapes): 49 | #计算输出的形状,方便keras能知道各层的shape 50 | features_shape = input_shapes[0] 51 | output_shape = (features_shape[0], self.units) 52 | return output_shape # (batch_size, output_dim) 53 | 54 | def build(self, input_shapes): 55 | #定义参数 56 | features_shape = input_shapes[0] 57 | assert len(features_shape) == 2 58 | input_dim = features_shape[1] 59 | 60 | self.kernel = 
self.add_weight(shape=(input_dim * self.support, 61 | self.units), 62 | initializer=self.kernel_initializer, 63 | name='kernel', 64 | regularizer=self.kernel_regularizer, 65 | constraint=self.kernel_constraint) 66 | if self.use_bias: 67 | self.bias = self.add_weight(shape=(self.units,), 68 | initializer=self.bias_initializer, 69 | name='bias', 70 | regularizer=self.bias_regularizer, 71 | constraint=self.bias_constraint) 72 | else: 73 | self.bias = None 74 | self.built = True 75 | 76 | def call(self, inputs, mask=None): 77 | #__call__ 的作用让实例成为可调用对象 78 | #GCN的模型逻辑,其实只要计算好几个矩阵了就很简单了。 79 | #计算A、L、D等等的函数主要在utils.py实现,训练逻辑在train.py 80 | 81 | features = inputs[0] #input的第1维是特征 82 | basis = inputs[1:] #后面是归一化过的邻接矩阵 83 | 84 | supports = list()#support是邻接矩阵的归一化形式 85 | for i in range(self.support): 86 | supports.append(K.dot(basis[i], features))#A*X 87 | supports = K.concatenate(supports, axis=1) 88 | output = K.dot(supports, self.kernel)#A*X*W 89 | 90 | if self.bias: 91 | output += self.bias 92 | return self.activation(output) 93 | 94 | def get_config(self): 95 | config = {'units': self.units, 96 | 'support': self.support, 97 | 'activation': activations.serialize(self.activation), 98 | 'use_bias': self.use_bias, 99 | 'kernel_initializer': initializers.serialize( 100 | self.kernel_initializer), 101 | 'bias_initializer': initializers.serialize( 102 | self.bias_initializer), 103 | 'kernel_regularizer': regularizers.serialize( 104 | self.kernel_regularizer), 105 | 'bias_regularizer': regularizers.serialize( 106 | self.bias_regularizer), 107 | 'activity_regularizer': regularizers.serialize( 108 | self.activity_regularizer), 109 | 'kernel_constraint': constraints.serialize( 110 | self.kernel_constraint), 111 | 'bias_constraint': constraints.serialize(self.bias_constraint) 112 | } 113 | 114 | base_config = super(GraphConvolution, self).get_config() 115 | return dict(list(base_config.items()) + list(config.items())) 116 | -------------------------------------------------------------------------------- /GCN/keras_setup.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 25, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | from setuptools import setup 7 | from setuptools import find_packages 8 | 9 | #Setuptools是用于编译、分发和安装 python 包的一个工具 10 | #特别是在包依赖问题场景下非常有用,它是一个强大的包管理工具 11 | 12 | ''' 13 | setup.py适合一键打包安装 14 | setup函数包括: 15 | 16 | --name 包名称:生成的egg名称 17 | --version包版本:生成egg包的版本号 18 | --description 程序的简单描述:这个是keras版本的GCN 19 | --author 作者 20 | --author_email 作者的邮箱地址 21 | --url 程序的官网地址 22 | --download_url 程序的下载地址,如果有 23 | --license 程序的授权信息:MIT.... 
24 | --install_requires 安装依赖。主要是keras 25 | --extras_require 其他的依赖。导入数据需要的json和h5py模块 26 | --package_data 告诉setuptools哪些目录下的文件被映射到哪个源码包,都在README目录中。 27 | --find_packages() 和setup.py同一目录下搜索各个含有 init.py的包,用于增加packages参数。 28 | 29 | ''' 30 | 31 | setup(name='kegra', 32 | version='0.0.1', 33 | description='Deep Learning on Graphs with Keras', 34 | author='Thomas Kipf', 35 | author_email='thomas.kipf@gmail.com', 36 | url='https://tkipf.github.io', 37 | download_url='...', 38 | license='MIT', 39 | install_requires=['keras'], 40 | extras_require={ 41 | 'model_saving': ['json', 'h5py'], 42 | }, 43 | package_data={'kegra': ['README.md']}, 44 | packages=find_packages()) 45 | -------------------------------------------------------------------------------- /GCN/keras_train.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 25, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | from __future__ import print_function 7 | #future处理新功能版本不兼容问题,加上这句话,所有的print函数将是3.x的模式(即便环境是2.x) 8 | 9 | from keras.layers import Input, Dropout 10 | from keras.models import Model 11 | from keras.optimizers import Adam 12 | from keras.regularizers import l2 13 | 14 | from kegra.layers.graph import GraphConvolution 15 | from kegra.utils import * 16 | 17 | import time 18 | 19 | #定义超参数 20 | DATASET = 'cora' #数据集,整个模型的目的是对其进行有引用关系的一堆论文做文本分类 21 | FILTER = 'localpool' # 过滤器是局部池化localpool或者'chebyshev' 22 | MAX_DEGREE = 2 # 最大多项式的度 23 | SYM_NORM = True # 是否对称正则化 24 | NB_EPOCH = 200 #迭代次数 25 | PATIENCE = 10 # 早停次数(10次不变就早停) 26 | 27 | #得到训练集,验证集和测试集。 28 | X, A, y = load_data(dataset=DATASET) 29 | y_train, y_val, y_test, idx_train, idx_val, idx_test, train_mask = get_splits(y) 30 | 31 | #特征归一化 32 | X /= X.sum(1).reshape(-1, 1) 33 | 34 | if FILTER == 'localpool':#如果是局部池化 35 | """ Local pooling filters (see 'renormalization trick' in Kipf & Welling, arXiv 2016) """ 36 | print('Using local pooling filters...') 37 | A_ = preprocess_adj(A, SYM_NORM) #处理有自环的邻接矩阵 38 | support = 1 39 | graph = [X, A_] #特征矩阵和邻接矩阵 40 | G = [Input(shape=(None, None), batch_shape=(None, None), sparse=True)] 41 | 42 | elif FILTER == 'chebyshev':#如果是切比雪夫多项式 43 | """ Chebyshev polynomial basis filters (Defferard et al., NIPS 2016) """ 44 | print('Using Chebyshev polynomial basis filters...') 45 | L = normalized_laplacian(A, SYM_NORM) #归一化后的拉普拉斯矩阵 46 | L_scaled = rescale_laplacian(L) #重调整以简化 47 | T_k = chebyshev_polynomial(L_scaled, MAX_DEGREE) #计算到max_degree阶的切比雪夫 48 | support = MAX_DEGREE + 1 #support是邻接矩阵的归一化形式 49 | graph = [X]+T_k 50 | G = [Input(shape=(None, None), batch_shape=(None, None), sparse=True) for _ in range(support)] 51 | 52 | else: 53 | raise Exception('Invalid filter type.') 54 | 55 | 56 | 57 | #定义模型架构。用GCN的参数作为张量列表传递。 58 | #两层GCN 59 | X_in = Input(shape=(X.shape[1],)) 60 | 61 | H = Dropout(0.5)(X_in)#输入维度是1433 62 | H = GraphConvolution(16, support, activation='relu', kernel_regularizer=l2(5e-4))([H]+G) 63 | H = Dropout(0.5)(H)#隐层是16 64 | #最后是要预测y的维度,这个任务是论文分类7个类别,所以是7个维度的softmax 65 | Y = GraphConvolution(y.shape[1], support, activation='softmax')([H]+G) 66 | 67 | #编译模型 68 | model = Model(inputs=[X_in]+G, outputs=Y) 69 | model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.01)) 70 | 71 | #辅助变量 72 | wait = 0 73 | preds = None 74 | best_val_loss = 99999 75 | 76 | #开始训练 77 | for epoch in range(1, NB_EPOCH+1): 78 | t = time.time() 79 | 80 | #用被mask设为0了的node计算loss训练模型 81 | model.fit(graph, y_train, sample_weight=train_mask, 82 | batch_size=A.shape[0], epochs=1, shuffle=False, verbose=0) 83 | 
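        #(补充示意,非原仓库注释)batch_size=A.shape[0] 表示整图 full-batch 训练,每个 epoch 只做一次前向/反向
        #sample_weight=train_mask 把非训练节点的样本权重置 0,loss 只在带标签的训练节点上累积,这就是半监督训练的关键
        #train_mask 的构造大致相当于(示意,假设共 N 个节点、前 140 个为训练节点):
        # mask = np.zeros(N); mask[:140] = 1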
84 | #预测是在所有数据上的结果 85 | preds = model.predict(graph, batch_size=A.shape[0]) 86 | 87 | #模型在验证集上的表现 88 | train_val_loss, train_val_acc = evaluate_preds(preds, [y_train, y_val], 89 | [idx_train, idx_val]) 90 | print("Epoch: {:04d}".format(epoch), 91 | "train_loss= {:.4f}".format(train_val_loss[0]), 92 | "train_acc= {:.4f}".format(train_val_acc[0]), 93 | "val_loss= {:.4f}".format(train_val_loss[1]), 94 | "val_acc= {:.4f}".format(train_val_acc[1]), 95 | "time= {:.4f}".format(time.time() - t)) 96 | 97 | #早停设置 98 | if train_val_loss[1] < best_val_loss: 99 | best_val_loss = train_val_loss[1] 100 | wait = 0 101 | else: 102 | if wait >= PATIENCE: 103 | print('Epoch {}: early stopping'.format(epoch)) 104 | break 105 | wait += 1 106 | 107 | #模型在测试集上的表现 108 | test_loss, test_acc = evaluate_preds(preds, [y_test], [idx_test]) 109 | print("Test set results:", 110 | "loss= {:.4f}".format(test_loss[0]), 111 | "accuracy= {:.4f}".format(test_acc[0])) 112 | -------------------------------------------------------------------------------- /GCN/pytorch_geometric_gcn.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 25, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import torch 7 | 8 | #torch_geometric这个库似乎很好用。 9 | #这个py文件是一个简易的调库笔记,并且可以快速实验keras版本的cora任务。 10 | 11 | 12 | from torch_geometric.data import Data 13 | #####data类可以轻松创建Graph 14 | 15 | #边索引是COO格式,仅仅用于创建邻接矩阵。 16 | #下面的意思是节点0和1相连,1和0相连,1和2相连,2和1相连 17 | edge_index = torch.tensor([[0, 1, 1, 2], 18 | [1, 0, 2, 1]], dtype=torch.long) 19 | #创建3个节点 20 | x = torch.tensor([[-1], [0], [1]], dtype=torch.float) 21 | 22 | #通过Data类来联系起节点边 23 | data = Data(x=x, edge_index=edge_index) 24 | 25 | 26 | 27 | 28 | 29 | from torch_geometric.datasets import TUDataset 30 | ####和torchvision很像,这个也有很多的数据集 31 | 32 | dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES') 33 | print(len(dataset),dataset.num_classes,dataset.num_node_features) 34 | 35 | 36 | from torch_geometric.data import DataLoader 37 | ####同样也有dataloader 38 | loader = DataLoader(dataset, batch_size=32, shuffle=True) 39 | 40 | for batch in loader: 41 | print(batch) 42 | 43 | #大多数都和pytorch本身的语法很像。 44 | 45 | 46 | 47 | 48 | 49 | #-------------调库大法-------------- 50 | #用这个库快速实现keras版本的任务 51 | 52 | import torch 53 | import torch.nn.functional as F 54 | from torch_geometric.nn import GCNConv #直接调GCN 55 | 56 | from torch_geometric.datasets import Planetoid 57 | 58 | #cora数据集直接调用,任务是做半监督论文分类 59 | dataset = Planetoid(root='./Cora', name='Cora') 60 | 61 | class Net(torch.nn.Module): 62 | def __init__(self): 63 | super(Net, self).__init__() 64 | #这个和keras的那个模型结构是一样的,16维的隐层,2个GCN 65 | self.conv1 = GCNConv(dataset.num_node_features, 16) 66 | self.conv2 = GCNConv(16, dataset.num_classes) 67 | 68 | def forward(self, data): 69 | x, edge_index = data.x, data.edge_index 70 | 71 | x = self.conv1(x, edge_index) 72 | x = F.relu(x) #ReLU激活 73 | x = F.dropout(x, training=self.training) 74 | x = self.conv2(x, edge_index) 75 | 76 | return F.log_softmax(x, dim=1) #最后做个7分类 77 | 78 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 79 | model = Net().to(device) #gpu加速 80 | data = dataset[0].to(device) 81 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)#优化器 82 | 83 | model.train() #训练模式 84 | for epoch in range(200): 85 | optimizer.zero_grad() #梯度清零 86 | out = model(data) #用模型预测结果 87 | loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask]) 88 | loss.backward() #反向传播 89 | optimizer.step() #更新参数 90 | 91 | model.eval() #评估模式 92 | 
_, pred = model(data).max(dim=1) 93 | correct = float (pred[data.test_mask].eq(data.y[data.test_mask]).sum().item()) 94 | acc = correct / data.test_mask.sum().item() #计算acc 95 | print('Accuracy: {:.4f}'.format(acc)) 96 | 97 | 98 | #acc结果直接0.8+ 99 | #调库大法好.... 100 | 101 | 102 | 103 | 104 | #不过还是看看GCNConv的源码吧 105 | from torch_geometric.nn import MessagePassing 106 | from torch_geometric.utils import add_self_loops, degree 107 | 108 | class GCNConv(MessagePassing): 109 | def __init__(self, in_channels, out_channels): 110 | super(GCNConv, self).__init__(aggr='add') # "Add" aggregation. 111 | self.lin = torch.nn.Linear(in_channels, out_channels) 112 | 113 | def forward(self, x, edge_index): 114 | # x has shape [N, in_channels],in是node的特征维度 115 | # edge_index has shape [2, E],边是两个node之间的关系,所以是2,E是边个数 116 | 117 | #第1步:把自环I加到邻接矩阵 118 | edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0)) 119 | 120 | #第2步:对节点node特征矩阵线性转换(linear嵌入投影一下) 121 | x = self.lin(x) 122 | 123 | #第3步:计算正则化矩阵norm 124 | row, col = edge_index 125 | deg = degree(row, x.size(0), dtype=x.dtype)#计算度 126 | deg_inv_sqrt = deg.pow(-0.5) #度的-0.5 127 | norm = deg_inv_sqrt[row] * deg_inv_sqrt[col] #然后乘到邻接矩阵,默认无向图的话,行列的度在结果上是一样的 128 | 129 | #第4-6步,开始消息传递 130 | return self.propagate(edge_index, size=(x.size(0), x.size(0)), x=x, 131 | norm=norm) 132 | 133 | def message(self, x_j, norm): 134 | # x_j has shape [E, out_channels],out是嵌入的特征维度 135 | 136 | #第4步:用norm正则化节点特征 137 | return norm.view(-1, 1) * x_j 138 | 139 | def update(self, aggr_out): 140 | # aggr_out has shape [N, out_channels] 141 | 142 | #第6步:返回新的node特征 143 | return aggr_out 144 | 145 | 146 | #该库海量模型可调.... 147 | #https://python.ctolib.com/rusty1s-pytorch_geometric.html 148 | -------------------------------------------------------------------------------- /Graph-Transformer/README.md: -------------------------------------------------------------------------------- 1 | # Graph Transformer 2 | 3 | 逐行源码阅读中文笔记。 4 | -------------------------------------------------------------------------------- /Graph-Transformer/graph_transformer.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Union, Tuple, Optional 3 | from torch_geometric.typing import PairTensor, Adj, OptTensor 4 | 5 | import torch 6 | from torch import Tensor 7 | import torch.nn.functional as F 8 | from torch.nn import Linear 9 | from torch_geometric.nn.conv import MessagePassing 10 | from torch_geometric.utils import softmax 11 | 12 | # Graph Transformer算是GAT的升级版,注意力部分使用自注意力,有残差跳跃等类似Transformer的结构 13 | # 这份代码提供了丰富的选择,比如是否有边信息,残差怎么做等等。 14 | 15 | class TransformerConv(MessagePassing): 16 | _alpha: OptTensor 17 | 18 | def __init__(self, in_channels: Union[int, Tuple[int, 19 | int]], out_channels: int, 20 | heads: int = 1, concat: bool = True, beta: bool = False, 21 | dropout: float = 0., edge_dim: Optional[int] = None, 22 | bias: bool = True, root_weight: bool = True, **kwargs): 23 | kwargs.setdefault('aggr', 'add') 24 | super(TransformerConv, self).__init__(node_dim=0, **kwargs) 25 | 26 | self.in_channels = in_channels # 输入维度 27 | self.out_channels = out_channels # 输出维度 28 | self.heads = heads # 注意力头数 29 | self.beta = beta and root_weight # 是否有残差跳跃,beta是合并系数 30 | self.root_weight = root_weight # 是否残差 31 | self.concat = concat # True代表多头拼,False是多头平均 32 | self.dropout = dropout # 采样邻居时的dropout 33 | self.edge_dim = edge_dim # 是否有边特征的维度,默认为None 34 | 35 | if isinstance(in_channels, int): 36 | in_channels = (in_channels, in_channels) 37 | 38 | # 得到KQV 39 |
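        # (补充示意,非原仓库注释)K/Q/V 都是线性投影,输出维度是 heads * out_channels,
        # 在 message() 里会 reshape 成 [-1, heads, out_channels] 按头分别计算;
        # 注意力与 Transformer 一致:alpha = softmax( (q_i · k_j) / sqrt(out_channels) ),再用 alpha 加权 V 聚合邻居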
self.lin_key = Linear(in_channels[0], heads * out_channels) 40 | self.lin_query = Linear(in_channels[1], heads * out_channels) 41 | self.lin_value = Linear(in_channels[0], heads * out_channels) 42 | if edge_dim is not None: # 如果有边特征,边特征要加入到K中进行运算 43 | self.lin_edge = Linear(edge_dim, heads * out_channels, bias=False) 44 | else: 45 | self.lin_edge = self.register_parameter('lin_edge', None) 46 | 47 | if concat:# 两种concat形式 48 | self.lin_skip = Linear(in_channels[1], heads * out_channels, 49 | bias=bias)# 直接拼时需要扩展head维度 50 | if self.beta:# 残差合并系数 51 | self.lin_beta = Linear(3 * heads * out_channels, 1, bias=False) 52 | else: 53 | self.lin_beta = self.register_parameter('lin_beta', None) 54 | else: 55 | self.lin_skip = Linear(in_channels[1], out_channels, bias=bias) 56 | if self.beta: 57 | self.lin_beta = Linear(3 * out_channels, 1, bias=False) 58 | else: 59 | self.lin_beta = self.register_parameter('lin_beta', None) 60 | 61 | self.reset_parameters() 62 | 63 | def reset_parameters(self):# 重置参数 64 | self.lin_key.reset_parameters() 65 | self.lin_query.reset_parameters() 66 | self.lin_value.reset_parameters() 67 | if self.edge_dim: 68 | self.lin_edge.reset_parameters() 69 | self.lin_skip.reset_parameters() 70 | if self.beta: 71 | self.lin_beta.reset_parameters() 72 | 73 | 74 | def forward(self, x: Union[Tensor, PairTensor], edge_index: Adj, 75 | edge_attr: OptTensor = None): 76 | """""" 77 | 78 | if isinstance(x, Tensor):# 保证成对输入 79 | x: PairTensor = (x, x) 80 | 81 | # propagate_type: (x: PairTensor, edge_attr: OptTensor) 82 | out = self.propagate(edge_index, x=x, edge_attr=edge_attr, size=None) 83 | 84 | if self.concat: 85 | out = out.view(-1, self.heads * self.out_channels)# 横拼 86 | else: 87 | out = out.mean(dim=1)# 平均 88 | 89 | if self.root_weight:# 残差合并 90 | x_r = self.lin_skip(x[1]) 91 | if self.lin_beta is not None: 92 | beta = self.lin_beta(torch.cat([out, x_r, out - x_r], dim=-1)) 93 | beta = beta.sigmoid() 94 | out = beta * x_r + (1 - beta) * out 95 | else: 96 | out += x_r 97 | 98 | return out 99 | 100 | 101 | def message(self, x_i: Tensor, x_j: Tensor, edge_attr: OptTensor, 102 | index: Tensor, ptr: OptTensor, 103 | size_i: Optional[int]) -> Tensor: 104 | 105 | query = self.lin_query(x_i).view(-1, self.heads, self.out_channels) 106 | key = self.lin_key(x_j).view(-1, self.heads, self.out_channels) 107 | 108 | if self.lin_edge is not None:# 如果有边特征,要加到key中 109 | assert edge_attr is not None 110 | edge_attr = self.lin_edge(edge_attr).view(-1, self.heads, 111 | self.out_channels) 112 | key += edge_attr 113 | 114 | # 算注意力 115 | alpha = (query * key).sum(dim=-1) / math.sqrt(self.out_channels) 116 | alpha = softmax(alpha, index, ptr, size_i) 117 | alpha = F.dropout(alpha, p=self.dropout, training=self.training) 118 | 119 | out = self.lin_value(x_j).view(-1, self.heads, self.out_channels) 120 | if edge_attr is not None: 121 | out += edge_attr 122 | 123 | out *= alpha.view(-1, self.heads, 1) 124 | return out 125 | 126 | def __repr__(self): 127 | return '{}({}, {}, heads={})'.format(self.__class__.__name__, 128 | self.in_channels, 129 | self.out_channels, self.heads) 130 | -------------------------------------------------------------------------------- /GraphSAGE/README.md: -------------------------------------------------------------------------------- 1 | # Representation Learning on Large Graphs(GraphSage) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读:https://blog.csdn.net/qq_39388410/article/details/103903631 6 | 7 | 建议阅读顺序: encoders.py-->aggregates.py-->models.py 8 | 9 | # 10 | 11 | 原paper: 12 | ``` 13 
| @inproceedings{hamilton2017inductive, 14 | author = {Hamilton, William L. and Ying, Rex and Leskovec, Jure}, 15 | title = {Inductive Representation Learning on Large Graphs}, 16 | booktitle = {NIPS}, 17 | year = {2017} 18 | } 19 | ``` 20 | 21 | 原code: https://github.com/williamleif/GraphSAGE 22 | -------------------------------------------------------------------------------- /GraphSAGE/aggregators.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 27, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import torch 7 | import torch.nn as nn 8 | from torch.autograd import Variable 9 | 10 | import random 11 | 12 | """ 13 | Mean 聚合邻居的aggregator。 14 | """ 15 | 16 | class MeanAggregator(nn.Module): 17 | """ 18 | GraphSage的聚合方式可改,这个py是使用MEAN的GraphSage去encode节点特征。 19 | 这里的节点特征有可能是gcn已经嵌入过的。 20 | """ 21 | def __init__(self, features, cuda=False, gcn=False): 22 | """ 23 | 为某个具体的图初始化聚合器aggregator 24 | features -- 是对node id的嵌入特征 25 | cuda -- 是否gpu加速 26 | gcn --- 用GraphSAGE-style聚合,还是用加了自环的GCN-style聚合 27 | """ 28 | 29 | super(MeanAggregator, self).__init__() 30 | 31 | self.features = features #节点特征 32 | self.cuda = cuda #gpu 33 | self.gcn = gcn #gcn聚合 34 | 35 | def forward(self, nodes, to_neighs, num_sample=10): 36 | """ 37 | nodes --- 一个batch的所有node列表 38 | to_neighs --- node的邻居节点集合 39 | num_sample --- 对邻居节点的采样数量 40 | """ 41 | # Local pointers to functions (speed hack) 42 | _set = set 43 | if not num_sample is None: #如果设置了采样数 44 | _sample = random.sample #按采样数在邻居集合中随机采样 45 | samp_neighs = [_set(_sample(to_neigh, 46 | num_sample, 47 | )) if len(to_neigh) >= num_sample else to_neigh for to_neigh in to_neighs] 48 | else:#不然就是所有的邻居 49 | samp_neighs = to_neighs 50 | 51 | if self.gcn:#如果用gcn聚合,要加入自环 52 | samp_neighs = [samp_neigh + set([nodes[i]]) for i, samp_neigh in enumerate(samp_neighs)] 53 | unique_nodes_list = list(set.union(*samp_neighs)) #去掉重复的采样 54 | unique_nodes = {n:i for i,n in enumerate(unique_nodes_list)} 55 | #一个(len(samp_neighs), len(unique_nodes)维度的mask矩阵 56 | mask = Variable(torch.zeros(len(samp_neighs), len(unique_nodes))) 57 | column_indices = [unique_nodes[n] for samp_neigh in samp_neighs for n in samp_neigh] 58 | row_indices = [i for i in range(len(samp_neighs)) for j in range(len(samp_neighs[i]))] 59 | mask[row_indices, column_indices] = 1 60 | if self.cuda: 61 | mask = mask.cuda() 62 | num_neigh = mask.sum(1, keepdim=True) 63 | mask = mask.div(num_neigh) 64 | if self.cuda: 65 | embed_matrix = self.features(torch.LongTensor(unique_nodes_list).cuda()) 66 | else: 67 | embed_matrix = self.features(torch.LongTensor(unique_nodes_list)) 68 | to_feats = mask.mm(embed_matrix) 69 | return to_feats 70 | -------------------------------------------------------------------------------- /GraphSAGE/encoders.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 27, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import torch 7 | import torch.nn as nn 8 | from torch.nn import init 9 | import torch.nn.functional as F 10 | 11 | class Encoder(nn.Module): 12 | """ 13 | GraphSage的聚合方式可改,这个py是使用"卷积"(GCN)的GraphSage去encode节点特征。 14 | 下一个py是用mean方式聚合的,会调用这个函数完成功能。 15 | """ 16 | def __init__(self, features, feature_dim, 17 | embed_dim, adj_lists, aggregator, 18 | num_sample=10, 19 | base_model=None, gcn=False, cuda=False, 20 | feature_transform=False): 21 | super(Encoder, self).__init__() 22 | 23 | self.features = features #节点特征 24 | self.feat_dim = feature_dim #特征维度 25 | self.adj_lists = adj_lists #邻接矩阵 
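        #(补充示意,非原仓库注释)adj_lists 实际是邻接表而不是矩阵:adj_lists[node_id] 是该节点邻居 id 组成的 set,
        #例如 adj_lists[0] = {1, 5, 7}(数值为假设),forward 里会按 int(node) 取出 batch 中每个节点的邻居集合交给聚合器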
26 | self.aggregator = aggregator #聚合器 27 | self.num_sample = num_sample #采样的数目 28 | if base_model != None: #base模型 29 | self.base_model = base_model 30 | 31 | self.gcn = gcn #GCN聚合 32 | self.embed_dim = embed_dim #嵌入维度 33 | self.cuda = cuda #gpu 34 | self.aggregator.cuda = cuda 35 | #如果用gcn聚合,那么维度就直接是feat_dim。 36 | #如果不是gcn,那就仍然是需要mean邻居再与自己concat,维度是2倍。 37 | self.weight = nn.Parameter( 38 | torch.FloatTensor(embed_dim, self.feat_dim if self.gcn else 2 * self.feat_dim)) 39 | init.xavier_uniform(self.weight) #Xavier初始化 40 | 41 | def forward(self, nodes): 42 | """ 43 | 为每个batch的node生成嵌入特征 44 | """ 45 | neigh_feats = self.aggregator.forward(nodes, [self.adj_lists[int(node)] for node in nodes], 46 | self.num_sample) #按采样数采样节点后,把邻接矩阵输入模型 47 | if not self.gcn: 48 | if self.cuda: 49 | self_feats = self.features(torch.LongTensor(nodes).cuda()) 50 | else: 51 | self_feats = self.features(torch.LongTensor(nodes)) 52 | #如果不是gcn聚合,需要concat,维度是两倍 53 | combined = torch.cat([self_feats, neigh_feats], dim=1) 54 | else: 55 | combined = neigh_feats #如果是gcn就不变 56 | combined = F.relu(self.weight.mm(combined.t()))#最后乘权重再激活 57 | return combined 58 | -------------------------------------------------------------------------------- /HetGNN/DeepWalk.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 27, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import string; 7 | import re; 8 | import random 9 | import math 10 | import numpy as np 11 | from gensim.models import Word2Vec 12 | from itertools import * 13 | dimen = 128 14 | window = 5 15 | 16 | #生成预训练节点嵌入 17 | #测试集,所以是提前游走好了的语料库。 18 | 19 | def read_random_walk_corpus(): 20 | #读入随机游走语料库 21 | walks=[] 22 | #生成的随机游走作为用于模型训练的节点序列 23 | #inputfile = open("../data/academic_test/meta_random_walk_APVPA_test.txt","r") 24 | inputfile = open("../data/academic_test/het_random_walk_test.txt", "r") 25 | for line in inputfile: 26 | path = [] 27 | node_list=re.split(' ',line) 28 | for i in range(len(node_list)): 29 | path.append(node_list[i]) 30 | walks.append(path) 31 | inputfile.close() 32 | return walks 33 | 34 | 35 | walk_corpus = read_random_walk_corpus() 36 | model = Word2Vec(walk_corpus, size = dimen, window = window, min_count = 0, workers = 2, sg = 1, hs = 0, negative = 5) 37 | 38 | 39 | print("Output...") 40 | #model.wv.save_word2vec_format("../data/node_embedding.txt") 41 | model.wv.save_word2vec_format("../data/academic_test/node_net_embedding.txt") 42 | -------------------------------------------------------------------------------- /HetGNN/HetGNN.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 27, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import torch 7 | import torch.optim as optim 8 | import data_generator 9 | import tools 10 | from args import read_args 11 | from torch.autograd import Variable 12 | import numpy as np 13 | import random 14 | torch.set_num_threads(2) #设置线程 15 | import os 16 | os.environ['CUDA_VISIBLE_DEVICES']='0' 17 | 18 | 19 | class model_class(object): 20 | def __init__(self, args): 21 | super(model_class, self).__init__() 22 | self.args = args 23 | self.gpu = args.cuda 24 | 25 | #导入各种数据 26 | input_data = data_generator.input_data(args = self.args) 27 | #input_data.gen_het_rand_walk() 28 | 29 | self.input_data = input_data 30 | 31 | if self.args.train_test_label == 2: #为每个node生成邻居 32 | #重启策略的随机游走,为每个节点采样固定数量的强相关的异构邻居,然后按类型分组 33 | #任意节点开始随机游走,以p概率返回。采样到固定数量后就停止。 34 | #为了采样采样的邻居包含所有类型的节点,不同类型节点的数量是受限的。 35 | #对每个类型都选出按频率的topk邻居 
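            #(补充示意,非原仓库注释)带重启的随机游走(RWR)采样的大致流程:
            # cur = start_node; neigh = []
            # while 采样数未达上限:
            #     以概率 p 跳回 start_node,否则随机走到 cur 的一个邻居,并把它加入 neigh(每种类型的数量受限)
            # 最后对 neigh 按节点类型分组,各取出现频率最高的 top-k 作为该节点的异构邻居集合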
36 | input_data.het_walk_restart() 37 | print ("neighbor set generation finish") 38 | exit(0) 39 | 40 | #p是论文,a是作者,v是地点,然后可以组成一堆特征 41 | feature_list = [input_data.p_abstract_embed, input_data.p_title_embed,\ 42 | input_data.p_v_net_embed, input_data.p_a_net_embed, input_data.p_ref_net_embed,\ 43 | input_data.p_net_embed, input_data.a_net_embed, input_data.a_text_embed,\ 44 | input_data.v_net_embed, input_data.v_text_embed] 45 | 46 | for i in range(len(feature_list)): 47 | feature_list[i] = torch.from_numpy(np.array(feature_list[i])).float() 48 | 49 | if self.gpu: 50 | for i in range(len(feature_list)): 51 | feature_list[i] = feature_list[i].cuda() 52 | #self.feature_list = feature_list 53 | 54 | #各自的邻居列表 55 | a_neigh_list_train = input_data.a_neigh_list_train 56 | p_neigh_list_train = input_data.p_neigh_list_train 57 | v_neigh_list_train = input_data.v_neigh_list_train 58 | 59 | a_train_id_list = input_data.a_train_id_list 60 | p_train_id_list = input_data.p_train_id_list 61 | v_train_id_list = input_data.v_train_id_list 62 | 63 | self.model = tools.HetAgg(args, feature_list, a_neigh_list_train, p_neigh_list_train, v_neigh_list_train,\ 64 | a_train_id_list, p_train_id_list, v_train_id_list)#实例化model,tools会对异构的信息进行聚合 65 | 66 | if self.gpu: 67 | self.model.cuda() 68 | self.parameters = filter(lambda p: p.requires_grad, self.model.parameters()) 69 | self.optim = optim.Adam(self.parameters, lr=self.args.lr, weight_decay = 0)#Adam优化器 70 | self.model.init_weights() 71 | 72 | 73 | def model_train(self): 74 | #开始训练 75 | print ('model training ...') 76 | if self.args.checkpoint != '': 77 | self.model.load_state_dict(torch.load(self.args.checkpoint)) 78 | 79 | self.model.train() #模型调到训练模式 80 | mini_batch_s = self.args.mini_batch_s #batch 81 | embed_d = self.args.embed_d #嵌入维度 82 | 83 | for iter_i in range(self.args.train_iter_n): #迭代次数 84 | print ('iteration ' + str(iter_i) + ' ...') 85 | triple_list = self.input_data.sample_het_walk_triple()#异构三元组(含正例 负例)采样 86 | min_len = 1e10 87 | for ii in range(len(triple_list)): 88 | if len(triple_list[ii]) < min_len: 89 | min_len = len(triple_list[ii]) 90 | batch_n = int(min_len / mini_batch_s) 91 | print (batch_n) 92 | for k in range(batch_n): 93 | c_out = torch.zeros([len(triple_list), mini_batch_s, embed_d]) 94 | p_out = torch.zeros([len(triple_list), mini_batch_s, embed_d])#pos,正例 95 | n_out = torch.zeros([len(triple_list), mini_batch_s, embed_d])#neg,负例 96 | 97 | for triple_index in range(len(triple_list)): 98 | triple_list_temp = triple_list[triple_index] 99 | triple_list_batch = triple_list_temp[k * mini_batch_s : (k + 1) * mini_batch_s] 100 | #得到模型的预测结果 101 | c_out_temp, p_out_temp, n_out_temp = self.model(triple_list_batch, triple_index) 102 | 103 | c_out[triple_index] = c_out_temp 104 | p_out[triple_index] = p_out_temp 105 | n_out[triple_index] = n_out_temp 106 | 107 | loss = tools.cross_entropy_loss(c_out, p_out, n_out, embed_d)#计算三元组交叉熵 108 | 109 | self.optim.zero_grad()#梯度清零 110 | loss.backward()#反向传播 111 | self.optim.step() #参数更新 112 | 113 | if k % 100 == 0: #打印结果 114 | print ("loss: " + str(loss)) 115 | 116 | if iter_i % self.args.save_model_freq == 0: 117 | torch.save(self.model.state_dict(), self.args.model_path + "HetGNN_" + str(iter_i) + ".pt") 118 | #存储参数用于评估 119 | triple_index = 9 #一共有9种case,在tools文件中定义 120 | a_out, p_out, v_out = self.model([], triple_index) 121 | print ('iteration ' + str(iter_i) + ' finish.') 122 | 123 | 124 | 125 | if __name__ == '__main__': 126 | args = read_args() 127 | print("------arguments-------") 128 | for k, v 
in vars(args).items(): 129 | print(k + ': ' + str(v)) 130 | 131 | #可复现随机种子 132 | random.seed(args.random_seed) 133 | np.random.seed(args.random_seed) 134 | torch.manual_seed(args.random_seed) 135 | torch.cuda.manual_seed_all(args.random_seed) 136 | 137 | #实例化模型 138 | model_object = model_class(args) 139 | 140 | if args.train_test_label == 0: 141 | model_object.model_train() 142 | -------------------------------------------------------------------------------- /HetGNN/README.md: -------------------------------------------------------------------------------- 1 | # Heterogeneous Graph Neural Network(HetGNN) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读:https://blog.csdn.net/qq_39388410/article/details/104344930 6 | 7 | 建议阅读顺序: DeepWalk.py-->tools.py-->HetGNN.py 8 | 9 | # 10 | 11 | 原paper:Zhang, Chuxu and Song, Dongjin and Huang, Chao and Swami, Ananthram and Chawla, Nitesh V. Heterogeneous Graph Neural Network,KDD2019 12 | 13 | 原code: https://github.com/chuxuzhang/KDD2019_HetGNN 14 | -------------------------------------------------------------------------------- /IRGAN/README.md: -------------------------------------------------------------------------------- 1 | # IRGAN 2 | 3 | 源码笔记:传说中的检索大统一,建议只看论文所有涉及motivation的部分。细看代码代码会发现实际和代码的论述有一些差别,代码实现上挺简单的。 4 | 5 | 建议阅读顺序:utils-->gen_model-->dis_model-->cf_gan-->dis_model_dns-->cf_dns 6 | 7 | # 8 | 9 | 原paper:IRGAN: A Minimax Game for Unifying Generative and Discriminative Information Retrieval Models,SIGIR2017 10 | 11 | 原code: https://github.com/geek-ai/irgan 12 | -------------------------------------------------------------------------------- /IRGAN/dis_model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 30, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import tensorflow as tf 7 | import cPickle 8 | 9 | #cPickle是python2.x,3.x改名为Pickcle。它可以序列化任何对象并保存,存网络参数很合适 10 | 11 | #判别器。作用是对生成器得到的文档二分类(相关和不相关),进一步分类。 12 | #生成器是多分类,判别器是二分类,两者对抗的结果感觉上是用GAN来找和正样本更相似的负样本(代替了随机采样)。 13 | class DIS(): 14 | def __init__(self, itemNum, userNum, emb_dim, lamda, param=None, initdelta=0.05, learning_rate=0.05): 15 | self.itemNum = itemNum 16 | self.userNum = userNum 17 | self.emb_dim = emb_dim 18 | self.lamda = lamda # 正则化参数 19 | self.param = param 20 | self.initdelta = initdelta 21 | self.learning_rate = learning_rate 22 | self.d_params = [] 23 | 24 | with tf.variable_scope('discriminator'):#参数初始化 25 | if self.param == None:#无设置就随机初始化 26 | self.user_embeddings = tf.Variable( 27 | tf.random_uniform([self.userNum, self.emb_dim], minval=-self.initdelta, maxval=self.initdelta, 28 | dtype=tf.float32)) 29 | self.item_embeddings = tf.Variable( 30 | tf.random_uniform([self.itemNum, self.emb_dim], minval=-self.initdelta, maxval=self.initdelta, 31 | dtype=tf.float32)) 32 | self.item_bias = tf.Variable(tf.zeros([self.itemNum])) 33 | else: 34 | self.user_embeddings = tf.Variable(self.param[0]) 35 | self.item_embeddings = tf.Variable(self.param[1]) 36 | self.item_bias = tf.Variable(self.param[2]) 37 | 38 | self.d_params = [self.user_embeddings, self.item_embeddings, self.item_bias]#嵌入权重和item偏置 39 | 40 | self.u = tf.placeholder(tf.int32) 41 | self.i = tf.placeholder(tf.int32) 42 | self.label = tf.placeholder(tf.float32)#文档相关或者不相关的二分类label 43 | 44 | #嵌入user和item 45 | self.u_embedding = tf.nn.embedding_lookup(self.user_embeddings, self.u) 46 | self.i_embedding = tf.nn.embedding_lookup(self.item_embeddings, self.i) 47 | self.i_bias = tf.gather(self.item_bias, self.i) 48 | 49 | #计算user和item的相似度,然后计算与label的交叉熵为判别器loss 50 | 
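        #(补充示意,非原仓库注释)即 logits = <u_embedding, i_embedding> + i_bias,打分函数取了最简单的矩阵分解形式,
        #loss = sigmoid_cross_entropy(label, logits) + lamda * L2 正则;
        #对应论文中判别器 D(d|q) = sigmoid(f_phi(q, d)),这里的 f_phi 就是上述 MF 打分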
self.pre_logits = tf.reduce_sum(tf.multiply(self.u_embedding, self.i_embedding), 1) + self.i_bias 51 | self.pre_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.label, 52 | logits=self.pre_logits) + self.lamda * ( 53 | tf.nn.l2_loss(self.u_embedding) + tf.nn.l2_loss(self.i_embedding) + tf.nn.l2_loss(self.i_bias) 54 | ) 55 | 56 | d_opt = tf.train.GradientDescentOptimizer(self.learning_rate)#梯度下降 57 | self.d_updates = d_opt.minimize(self.pre_loss, var_list=self.d_params)#参数更新 58 | 59 | #生成器的reward是判别器相似度判别分数 60 | self.reward_logits = tf.reduce_sum(tf.multiply(self.u_embedding, self.i_embedding), 61 | 1) + self.i_bias 62 | 63 | #重点要说这里!!!这个reward很paper中写的不一致!(不止这里,四个任务的reward的都不是原文的) 64 | #虽然reward这样设置不是不可以,因为Sigmoid之后处于0-1,小于0.5reward相减为负,这样避免reward都是正数。 65 | self.reward = 2 * (tf.sigmoid(self.reward_logits) - 0.5) 66 | 67 | #为了测试集算topk的rating分数, 是矩阵乘矩阵,得到所有得分,self.u: [batch_size] 68 | self.all_rating = tf.matmul(self.u_embedding, self.item_embeddings, transpose_a=False, 69 | transpose_b=True) + self.item_bias 70 | 71 | self.all_logits = tf.reduce_sum(tf.multiply(self.u_embedding, self.item_embeddings), 1) + self.item_bias 72 | self.NLL = -tf.reduce_mean(tf.log( 73 | tf.gather(tf.reshape(tf.nn.softmax(tf.reshape(self.all_logits, [1, -1])), [-1]), self.i)) 74 | ) 75 | #动态负采样算排名。公式一样喂入不同(这里所有的逻辑都在__init__里面....) 76 | self.dns_rating = tf.reduce_sum(tf.multiply(self.u_embedding, self.item_embeddings), 1) + self.item_bias 77 | 78 | def save_model(self, sess, filename):#保存参数 79 | param = sess.run(self.d_params) 80 | cPickle.dump(param, open(filename, 'w')) 81 | -------------------------------------------------------------------------------- /IRGAN/dis_model_dns.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 30, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import tensorflow as tf 7 | import cPickle 8 | 9 | #cPickle是python2.x,3.x改名为Pickcle。它可以序列化任何对象并保存,存网络参数很合适 10 | 11 | 12 | #带动态负采样的判别器。 13 | #GAN思想的另一方面就是生成负采样,在cf_dns文件中,这里是利用正负例构建判别器。 14 | class DIS(): 15 | def __init__(self, itemNum, userNum, emb_dim, lamda, param=None, initdelta=0.05, learning_rate=0.05): 16 | self.itemNum = itemNum 17 | self.userNum = userNum 18 | self.emb_dim = emb_dim 19 | self.lamda = lamda # 正则化参数 20 | self.param = param 21 | self.initdelta = initdelta 22 | self.learning_rate = learning_rate 23 | self.d_params = [] 24 | 25 | with tf.variable_scope('discriminator'):#参数初始化 26 | if self.param == None:#无设置就随机初始化 27 | self.user_embeddings = tf.Variable( 28 | tf.random_uniform([self.userNum, self.emb_dim], minval=-self.initdelta, maxval=self.initdelta, 29 | dtype=tf.float32)) 30 | self.item_embeddings = tf.Variable( 31 | tf.random_uniform([self.itemNum, self.emb_dim], minval=-self.initdelta, maxval=self.initdelta, 32 | dtype=tf.float32)) 33 | self.item_bias = tf.Variable(tf.zeros([self.itemNum])) 34 | else: 35 | self.user_embeddings = tf.Variable(self.param[0]) 36 | self.item_embeddings = tf.Variable(self.param[1]) 37 | self.item_bias = tf.Variable(self.param[2]) 38 | 39 | self.d_params = [self.user_embeddings, self.item_embeddings, self.item_bias]#嵌入权重和item偏置 40 | 41 | self.u = tf.placeholder(tf.int32) 42 | self.pos = tf.placeholder(tf.int32)#正例 43 | self.neg = tf.placeholder(tf.int32)#负例 44 | 45 | ##嵌入user和item(正负例) 46 | self.u_embedding = tf.nn.embedding_lookup(self.user_embeddings, self.u) 47 | self.pos_embedding = tf.nn.embedding_lookup(self.item_embeddings, self.pos) 48 | self.pos_bias = tf.gather(self.item_bias, self.pos) 49 | self.neg_embedding = 
tf.nn.embedding_lookup(self.item_embeddings, self.neg) 50 | self.neg_bias = tf.gather(self.item_bias, self.neg) 51 | 52 | #计算pos和neg差值和user的相似度,然后log+正则得到loss 53 | self.pre_logits = tf.sigmoid( 54 | tf.reduce_sum(tf.multiply(self.u_embedding, self.pos_embedding - self.neg_embedding), 55 | 1) + self.pos_bias - self.neg_bias) 56 | self.pre_loss = -tf.reduce_mean(tf.log(self.pre_logits)) + self.lamda * ( 57 | tf.nn.l2_loss(self.u_embedding) + 58 | tf.nn.l2_loss(self.pos_embedding) + 59 | tf.nn.l2_loss(self.pos_bias) + 60 | tf.nn.l2_loss(self.neg_embedding) + 61 | tf.nn.l2_loss(self.neg_bias) 62 | ) 63 | 64 | d_opt = tf.train.GradientDescentOptimizer(self.learning_rate)#梯度下降 65 | self.d_updates = d_opt.minimize(self.pre_loss, var_list=self.d_params)#参数更新 66 | 67 | #为了测试集算topk的rating分数, self.u: [batch_size] 68 | self.all_rating = tf.matmul(self.u_embedding, self.item_embeddings, transpose_a=False, 69 | transpose_b=True) + self.item_bias 70 | 71 | self.all_logits = tf.reduce_sum(tf.multiply(self.u_embedding, self.item_embeddings), 1) + self.item_bias 72 | #动态负采样算排名。公式一样喂入不同(这里所有的逻辑都在__init__里面....) 73 | self.dns_rating = tf.reduce_sum(tf.multiply(self.u_embedding, self.item_embeddings), 1) + self.item_bias 74 | 75 | def save_model(self, sess, filename):#保存参数 76 | param = sess.run(self.d_params) 77 | cPickle.dump(param, open(filename, 'w')) 78 | -------------------------------------------------------------------------------- /IRGAN/gen_model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 30, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import tensorflow as tf 7 | import cPickle 8 | 9 | #cPickle是python2.x,3.x改名为Pickcle。它可以序列化任何对象并保存,存网络参数很合适 10 | 11 | #生成器实际上是softmax函数,对文档做多分类得到“生成”的候选。 12 | #生成器是多分类,判别器是二分类,两者对抗的结果感觉上是用GAN来找和正样本更相似的负样本(代替了随机采样)。 13 | class GEN(): 14 | def __init__(self, itemNum, userNum, emb_dim, lamda, param=None, initdelta=0.05, learning_rate=0.05): 15 | self.itemNum = itemNum 16 | self.userNum = userNum 17 | self.emb_dim = emb_dim 18 | self.lamda = lamda # 正则化参数 19 | self.param = param 20 | self.initdelta = initdelta 21 | self.learning_rate = learning_rate 22 | self.g_params = [] 23 | 24 | with tf.variable_scope('generator'):#参数初始化 25 | if self.param == None:#无设置就随机初始化 26 | self.user_embeddings = tf.Variable( 27 | tf.random_uniform([self.userNum, self.emb_dim], minval=-self.initdelta, maxval=self.initdelta, 28 | dtype=tf.float32)) 29 | self.item_embeddings = tf.Variable( 30 | tf.random_uniform([self.itemNum, self.emb_dim], minval=-self.initdelta, maxval=self.initdelta, 31 | dtype=tf.float32)) 32 | self.item_bias = tf.Variable(tf.zeros([self.itemNum])) 33 | else: 34 | self.user_embeddings = tf.Variable(self.param[0]) 35 | self.item_embeddings = tf.Variable(self.param[1]) 36 | self.item_bias = tf.Variable(param[2]) 37 | 38 | self.g_params = [self.user_embeddings, self.item_embeddings, self.item_bias]#嵌入权重和item偏置 39 | 40 | self.u = tf.placeholder(tf.int32) 41 | self.i = tf.placeholder(tf.int32) 42 | self.reward = tf.placeholder(tf.float32)#生成器的reward 43 | 44 | #嵌入user和item 45 | self.u_embedding = tf.nn.embedding_lookup(self.user_embeddings, self.u) 46 | self.i_embedding = tf.nn.embedding_lookup(self.item_embeddings, self.i) 47 | self.i_bias = tf.gather(self.item_bias, self.i) 48 | 49 | #计算user和item的相似度,然后softmax得到相关或者不相关的概率 50 | self.all_logits = tf.reduce_sum(tf.multiply(self.u_embedding, self.item_embeddings), 1) + self.item_bias 51 | self.i_prob = tf.gather( 52 | 
tf.reshape(tf.nn.softmax(tf.reshape(self.all_logits, [1, -1])), [-1]), 53 | self.i) 54 | 55 | #相似度*reward+正则就是生成器的loss,reward来自判别器 56 | self.gan_loss = -tf.reduce_mean(tf.log(self.i_prob) * self.reward) + self.lamda * ( 57 | tf.nn.l2_loss(self.u_embedding) + tf.nn.l2_loss(self.i_embedding) + tf.nn.l2_loss(self.i_bias)) 58 | 59 | g_opt = tf.train.GradientDescentOptimizer(self.learning_rate)#梯度下降 60 | self.gan_updates = g_opt.minimize(self.gan_loss, var_list=self.g_params)#参数更新 61 | 62 | # for test stage, self.u: [batch_size] 63 | self.all_rating = tf.matmul(self.u_embedding, self.item_embeddings, transpose_a=False, 64 | transpose_b=True) + self.item_bias 65 | 66 | def save_model(self, sess, filename):#保存模型 67 | param = sess.run(self.g_params) 68 | cPickle.dump(param, open(filename, 'w')) 69 | -------------------------------------------------------------------------------- /IRGAN/utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 30, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import linecache 7 | import numpy as np 8 | 9 | #linecache读取大文件。其将文件读到内存缓存而不是每次从硬盘,提高效率。 10 | #这个py主要是得到数据+各种度量方式(precision,NDCG等) 11 | 12 | def file_len(fname): 13 | with open(fname) as f: 14 | for i, l in enumerate(f): 15 | pass 16 | return i + 1 17 | 18 | 19 | #从训练集得到批次数据 20 | def get_batch_data(file, index, size): # 1,5->1,2,3,4,5 21 | user = [] 22 | item = [] 23 | label = [] 24 | for i in range(index, index + size):#按批次大小 25 | line = linecache.getline(file, i) 26 | line = line.strip() 27 | line = line.split() 28 | user.append(int(line[0]))#append两次,为了构建正例负例对 29 | user.append(int(line[0])) 30 | item.append(int(line[1]))#正例 31 | item.append(int(line[2]))#负例 32 | label.append(1.)#正例 33 | label.append(0.)#负例 34 | return user, item, label 35 | 36 | #topk准确率 37 | def precision_at_k(r, k): 38 | """Score is precision @ k 39 | Relevance is binary (nonzero is relevant).(相关或者不相关的二分类) 40 | Returns: 41 | Precision @ k 42 | Raises: 43 | ValueError: len(r) must be >= k 44 | """ 45 | assert k >= 1 46 | r = np.asarray(r)[:k]#得到topk再mean 47 | return np.mean(r) 48 | 49 | #topk的平均准确率 50 | def average_precision(r): 51 | """Score is average precision (area under PR curve) 52 | Relevance is binary (nonzero is relevant).(相关或者不相关的二分类) 53 | Returns: 54 | Average precision 55 | """ 56 | r = np.asarray(r) 57 | #计算所有topk(从1到所有)的结果mean 58 | out = [precision_at_k(r, k + 1) for k in range(r.size) if r[k]] 59 | if not out: 60 | return 0. 61 | return np.mean(out) 62 | 63 | #平均准确率 64 | def mean_average_precision(rs): 65 | """Score is mean average precision 66 | Relevance is binary (nonzero is relevant).(相关或者不相关的二分类) 67 | Returns: 68 | Mean average precision 69 | """ 70 | return np.mean([average_precision(r) for r in rs]) 71 | 72 | 73 | #计算topk的DCG,累积折损增益 74 | def dcg_at_k(r, k, method=1): 75 | """Score is discounted cumulative gain (dcg) 76 | Relevance is positive real values. Can use binary 77 | as the previous methods. 78 | Returns: 79 | Discounted cumulative gain 80 | """ 81 | r = np.asfarray(r)[:k] 82 | if r.size: 83 | if method == 0: 84 | return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1))) 85 | elif method == 1: 86 | return np.sum(r / np.log2(np.arange(2, r.size + 2))) 87 | else: 88 | raise ValueError('method must be 0 or 1.') 89 | return 0. 90 | 91 | #计算NDCG,归一化的DCG 92 | def ndcg_at_k(r, k, method=1): 93 | """Score is normalized discounted cumulative gain (ndcg) 94 | Relevance is positive real values. Can use binary 95 | as the previous methods. 
96 | Returns: 97 | Normalized discounted cumulative gain 98 | """ 99 | dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)#得到最大DCG 100 | if not dcg_max: 101 | return 0. 102 | return dcg_at_k(r, k, method) / dcg_max #归一化 103 | 104 | #计算topk的recall结果 105 | def recall_at_k(r, k, all_pos_num): 106 | r = np.asfarray(r)[:k] 107 | return np.sum(r) / all_pos_num #正例/所有正例 108 | 109 | #F1 110 | def F1(pre, rec): 111 | if pre + rec > 0: 112 | return (2.0 * pre * rec) / (pre + rec) 113 | else: 114 | return 0. 115 | -------------------------------------------------------------------------------- /InfoGAN/README.md: -------------------------------------------------------------------------------- 1 | # Mutual Information Generative Adversarial Networks (InfoGAN) 2 | 3 | 逐行源码阅读中文笔记。InfoGAN比起GAN的随机噪声中多了十来维的控制噪音c(如Mnist的数字类别,粗细等),然后优化互信息用了一个Q网络来得到“下界”(实际上这个网络是预测了类别c)。 4 | 5 | blog解读:https://blog.csdn.net/qq_39388410/article/details/96306813 6 | 7 | 建议阅读顺序:model.py-->trainer.py-->main.py 8 | 9 | # 10 | 11 | 原paper: Mutual Information Generative Adversarial Networks 12 | 13 | 原code: https://github.com/JonathanRaiman/tensorflow-infogan 14 | 15 | # 16 | pytorch版本:https://github.com/pianomania/infoGAN-pytorch 17 | -------------------------------------------------------------------------------- /InfoGAN/main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 5, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | from model import * 7 | from trainer import Trainer 8 | 9 | 10 | fe = FrontEnd()#D和Q的前端部分 11 | #三个part 12 | d = D() 13 | q = Q() 14 | g = G() 15 | 16 | for i in [fe, d, q, g]: 17 | i.cuda() 18 | i.apply(weights_init) #权重初始化 19 | 20 | trainer = Trainer(g, fe, d, q)#开始训练 21 | trainer.train() 22 | -------------------------------------------------------------------------------- /InfoGAN/model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 5, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import torch.nn as nn 7 | 8 | 9 | class FrontEnd(nn.Module): 10 | '''discriminator and Q 的前端部分,Q通过与D共享卷积层,可以减少计算花销。''' 11 | 12 | def __init__(self): 13 | super(FrontEnd, self).__init__() 14 | 15 | self.main = nn.Sequential( 16 | nn.Conv2d(1, 64, 4, 2, 1), 17 | nn.LeakyReLU(0.1, inplace=True), 18 | nn.Conv2d(64, 128, 4, 2, 1, bias=False), 19 | nn.BatchNorm2d(128), 20 | nn.LeakyReLU(0.1, inplace=True), 21 | nn.Conv2d(128, 1024, 7, bias=False), 22 | nn.BatchNorm2d(1024), 23 | nn.LeakyReLU(0.1, inplace=True), 24 | ) 25 | 26 | def forward(self, x): 27 | output = self.main(x) 28 | return output 29 | 30 | 31 | #判别器,判断输入是真实的还是生成的 32 | class D(nn.Module): 33 | 34 | def __init__(self): 35 | super(D, self).__init__() 36 | 37 | self.main = nn.Sequential( 38 | nn.Conv2d(1024, 1, 1), 39 | nn.Sigmoid() #直接得到1维的预测结果 40 | ) 41 | 42 | 43 | def forward(self, x): 44 | output = self.main(x).view(-1, 1)#n个样本,每个样本一个维度,就是预测值 45 | return output 46 | 47 | 48 | #直接优化互信息太困难了,所有用辅助分布Q(c|x)来近似 49 | #D判断真假,Q判断类别c 50 | class Q(nn.Module): 51 | 52 | def __init__(self): 53 | super(Q, self).__init__() 54 | 55 | self.conv = nn.Conv2d(1024, 128, 1, bias=False) 56 | self.bn = nn.BatchNorm2d(128) 57 | self.lReLU = nn.LeakyReLU(0.1, inplace=True) 58 | self.conv_disc = nn.Conv2d(128, 10, 1) #10维,因为数据是mnist识别数字 59 | self.conv_mu = nn.Conv2d(128, 2, 1) 60 | self.conv_var = nn.Conv2d(128, 2, 1) 61 | 62 | def forward(self, x): 63 | #给出一个互信息的下界 64 | y = self.conv(x) 65 | 66 | disc_logits = self.conv_disc(y).squeeze()#得到10类别分数 67 | 68 | mu = 
self.conv_mu(y).squeeze() #得到高斯公式的均值mu 69 | var = self.conv_var(y).squeeze().exp() #得到高斯公式的方差var 70 | 71 | return disc_logits, mu, var 72 | 73 | #生成器,用噪声生成目标数据 74 | class G(nn.Module): 75 | 76 | def __init__(self): 77 | super(G, self).__init__() 78 | 79 | self.main = nn.Sequential( 80 | #噪音74维,62维初始噪音大小,10维控制数字类别的生成,2维的角度和字体宽度 81 | nn.ConvTranspose2d(74, 1024, 1, 1, bias=False), #反卷积 82 | nn.BatchNorm2d(1024), 83 | nn.ReLU(True), 84 | nn.ConvTranspose2d(1024, 128, 7, 1, bias=False), #7x7反卷积 85 | nn.BatchNorm2d(128), 86 | nn.ReLU(True), 87 | nn.ConvTranspose2d(128, 64, 4, 2, 1, bias=False),#步长变2加padding 88 | nn.BatchNorm2d(64), 89 | nn.ReLU(True), 90 | nn.ConvTranspose2d(64, 1, 4, 2, 1, bias=False), 91 | nn.Sigmoid() 92 | ) 93 | 94 | def forward(self, x): 95 | output = self.main(x) #得到1维的结果。 96 | return output 97 | 98 | #权重初始化 99 | def weights_init(m): 100 | classname = m.__class__.__name__ 101 | if classname.find('Conv') != -1: 102 | m.weight.data.normal_(0.0, 0.02) #卷积层用均值0,方差0.02随机初始化 103 | elif classname.find('BatchNorm') != -1: 104 | m.weight.data.normal_(1.0, 0.02)#BN用均值1,方差0.02(要正则归一) 105 | m.bias.data.fill_(0) #偏差用0填充 106 | -------------------------------------------------------------------------------- /LightGCN/README.md: -------------------------------------------------------------------------------- 1 | # LightGCN 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读:https://blog.csdn.net/qq_39388410/article/details/106970194 6 | 7 | # 8 | 原paper: Xiangnan He, Kuan Deng ,Xiang Wang, Yan Li, Yongdong Zhang, Meng Wang(2020). LightGCN: Simplifying and Powering Graph Convolution Network for Recommendatio 9 | 10 | 原code: https://github.com/kuandeng/LightGCN 11 | -------------------------------------------------------------------------------- /Louvain/README.md: -------------------------------------------------------------------------------- 1 | # Fast unfolding of communities in large networks(Louvain) 2 | 3 | email-Eu-core数据集下的python实现的低配版本 4 | 5 | # 6 | 7 | 高配版本指路:https://github.com/taynaud/python-louvain 8 | 9 | # awesome-community-detection 10 | 11 | https://github.com/benedekrozemberczki/awesome-community-detection 12 | -------------------------------------------------------------------------------- /MIL-NCE/README.md: -------------------------------------------------------------------------------- 1 | # HowTo100M: Learning a Text-Video Embedding by Watching Hundred Million Narrated Video Clips 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | 建议阅读顺序:main-->video_loader-->loss-->s3dg 6 | 7 | # 8 | 原code:https://github.com/antoine77340/MIL-NCE_HowTo100M 9 | -------------------------------------------------------------------------------- /MIL-NCE/loss.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 1, 2021 3 | @author: nakaizura 4 | ''' 5 | import torch as th 6 | 7 | 8 | # 这个就是计算text和video的MIL-NCE了 9 | class MILNCELoss(th.nn.Module): 10 | def __init__(self): 11 | super(MILNCELoss, self).__init__() 12 | 13 | def forward(self, video_embd, text_embd): 14 | x = th.matmul(video_embd, text_embd.t())# 计算相似度 15 | x = x.view(video_embd.shape[0], video_embd.shape[0], -1) 16 | # 正例对们 17 | nominator = x * th.eye(x.shape[0])[:,:,None].cuda() # 对角线即使对应的正例 18 | nominator = nominator.sum(dim=1) 19 | nominator = th.logsumexp(nominator, dim=1) # 计算log sum exp 20 | # 正例对们+负例对们 21 | denominator = th.cat((x, x.permute(1,0,2)), dim=1).view(x.shape[0], -1)# 所以这里多拼接了负例对们 22 | denominator = th.logsumexp(denominator, dim=1) 23 | return th.mean(denominator - nominator)# 相减得到定义的loss 24 
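# (补充示例,非原仓库代码,仅作用法示意)假设一个 batch 有 4 个视频片段、每个片段带 K=3 条候选文本、嵌入维度 512,
# 且张量都放在 GPU 上(上面的实现内部对 eye 调用了 .cuda()):
if __name__ == '__main__':
    loss_fn = MILNCELoss()
    video_embd = th.randn(4, 512).cuda()      # [B, D]
    text_embd = th.randn(4 * 3, 512).cuda()   # [B*K, D],第 i 个视频对应第 i*K 到 (i+1)*K-1 条文本(正例"袋")
    print(loss_fn(video_embd, text_embd))     # 标量 loss,最小化即拉近视频与其文本袋的相似度、推开其它视频的文本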
| -------------------------------------------------------------------------------- /MLP-Mixer/README.md: -------------------------------------------------------------------------------- 1 | # MLP-Mixer: An all-MLP Architecture for Vision 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读与复现:https://nakaizura.blog.csdn.net/article/details/118878384 6 | 7 | # 8 | 9 | 原paper: 10 | ``` 11 | @article{dosovitskiy2020, 12 | title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale}, 13 | author={Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil}, 14 | journal={ICLR}, 15 | year={2021} 16 | } 17 | 18 | @article{tolstikhin2021, 19 | title={MLP-Mixer: An all-MLP Architecture for Vision}, 20 | author={Tolstikhin, Ilya and Houlsby, Neil and Kolesnikov, Alexander and Beyer, Lucas and Zhai, Xiaohua and Unterthiner, Thomas and Yung, Jessica and Steiner, Andreas and Keysers, Daniel and Uszkoreit, Jakob and Lucic, Mario and Dosovitskiy, Alexey}, 21 | journal={arXiv preprint arXiv:2105.01601}, 22 | year={2021} 23 | } 24 | ``` 25 | 26 | 27 | 原code: https://github.com/dongx-duan/bpr 28 | -------------------------------------------------------------------------------- /MLP-Mixer/models_mixer.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import einops 4 | import flax.linen as nn 5 | import jax.numpy as jnp 6 | 7 | 8 | class MlpBlock(nn.Module): 9 | mlp_dim: int 10 | 11 | @nn.compact 12 | def __call__(self, x): #这部分和上图的右侧MLP一致 13 | y = nn.Dense(self.mlp_dim)(x) #MLP+GELU+MLP 14 | y = nn.gelu(y) 15 | return nn.Dense(x.shape[-1])(y) 16 | 17 | 18 | class MixerBlock(nn.Module): 19 | """Mixer block layer.""" 20 | tokens_mlp_dim: int 21 | channels_mlp_dim: int 22 | 23 | @nn.compact 24 | def __call__(self, x): #这一部分和上面的公式中一致 25 | y = nn.LayerNorm()(x) 26 | y = jnp.swapaxes(y, 1, 2) 27 | y = MlpBlock(self.tokens_mlp_dim, name='token_mixing')(y) #token MLP 28 | y = jnp.swapaxes(y, 1, 2) 29 | x = x + y #残差部分 30 | y = nn.LayerNorm()(x) 31 | return x + MlpBlock(self.channels_mlp_dim, name='channel_mixing')(y) #channel MLP 32 | 33 | 34 | class MlpMixer(nn.Module): 35 | """Mixer architecture.""" 36 | patches: Any 37 | num_classes: int 38 | num_blocks: int 39 | hidden_dim: int 40 | tokens_mlp_dim: int 41 | channels_mlp_dim: int 42 | 43 | @nn.compact 44 | def __call__(self, inputs, *, train): #把各组件搭起来 45 | del train 46 | x = nn.Conv(self.hidden_dim, self.patches.size, 47 | strides=self.patches.size, name='stem')(inputs) 48 | x = einops.rearrange(x, 'n h w c -> n (h w) c') 49 | for _ in range(self.num_blocks): 50 | x = MixerBlock(self.tokens_mlp_dim, self.channels_mlp_dim)(x) 51 | x = nn.LayerNorm(name='pre_head_layer_norm')(x) 52 | x = jnp.mean(x, axis=1) 53 | return nn.Dense(self.num_classes, kernel_init=nn.initializers.zeros, 54 | name='head')(x) 55 | -------------------------------------------------------------------------------- /MLP-Mixer/models_test.py: -------------------------------------------------------------------------------- 1 | from absl.testing import absltest 2 | from absl.testing import parameterized 3 | 4 | import jax 5 | import jax.numpy as jnp 6 | 7 | from vit_jax import models 8 | from vit_jax.configs import models as config_lib 9 | 10 | # 这个py可以用来测试一下模型的搭建 11 | 12 | # 这里提供了多种Vision Transformer配置,如ViT等 13 | MODEL_SIZES = { 14 | 
'ViT-B_16': 86_567_656, 15 | 'R50+ViT-B_16': 98_659_112, 16 | 'ViT-B_32': 88_224_232, 17 | 'R26+ViT-B_32': 101_383_976, 18 | 'ViT-L_16': 304_326_632, 19 | 'ViT-L_32': 306_535_400, 20 | 'R50+ViT-L_32': 328_994_856, 21 | 'ViT-H_14': 632_045_800, 22 | 'Mixer-B_16': 59_880_472, 23 | 'Mixer-L_16': 208_196_168, 24 | } 25 | 26 | 27 | class ModelsTest(parameterized.TestCase): 28 | 29 | @parameterized.parameters(*list(MODEL_SIZES.items())) 30 | def test_can_instantiate(self, name, size): 31 | rng = jax.random.PRNGKey(0) 32 | config = config_lib.MODEL_CONFIGS[name] #模型配置 33 | model_cls = models.VisionTransformer if 'ViT' in name else models.MlpMixer #调用模型 34 | model = model_cls(num_classes=1_000, **config) #模型得到cls,用于图片分类1000 35 | 36 | inputs = jnp.ones([2, 224, 224, 3], jnp.float32) #随便定义一个input图片 37 | variables = model.init(rng, inputs, train=False) #初始化 38 | outputs = model.apply(variables, inputs, train=False) #得到模型输出 39 | self.assertEqual((2, 1000), outputs.shape) 40 | param_count = sum(p.size for p in jax.tree_flatten(variables)[0]) #计算参数 41 | self.assertEqual( 42 | size, param_count, 43 | f'Expected {name} to have {size} params, found {param_count}.') 44 | 45 | 46 | if __name__ == '__main__': 47 | absltest.main() 48 | -------------------------------------------------------------------------------- /MoCo/README.md: -------------------------------------------------------------------------------- 1 | # MoCo: Momentum Contrast for Unsupervised Visual Representation Learning 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读:https://blog.csdn.net/qq_39388410/article/details/108941999 6 | 7 | # 8 | 9 | 原paper: 10 | ``` 11 | @Article{he2019moco, 12 | author = {Kaiming He and Haoqi Fan and Yuxin Wu and Saining Xie and Ross Girshick}, 13 | title = {Momentum Contrast for Unsupervised Visual Representation Learning}, 14 | journal = {arXiv preprint arXiv:1911.05722}, 15 | year = {2019}, 16 | } 17 | ``` 18 | 19 | 原code: https://github.com/facebookresearch/moco 20 | -------------------------------------------------------------------------------- /NCF/Dataset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 17, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import scipy.sparse as sp 7 | import numpy as np 8 | 9 | #关于import库,numpy就不说了。 10 | #scipy.sparse库中提供了多种表示稀疏矩阵的格式,同时支持稀疏矩阵的加、减、乘、除和幂等。 11 | 12 | #关于数据集,原作者使用MovieLens 1 Million (ml-1m) and Pinterest (pinterest-20). 
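# Illustrative input rows (made up for clarity, but the column layout matches the docstrings below):
#   ml-1m.train.rating / ml-1m.test.rating :  userID \t itemID \t rating \t timestamp
#       0 \t 32 \t 4 \t 978824330
#   ml-1m.test.negative : one positive (userID,itemID) pair followed by its 99 sampled negative item IDs
#       (0,32) \t 1045 \t 880 \t 67 \t ...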
13 | 14 | class Dataset(object): 15 | ''' 16 | 数据集类,用于载入数据。 17 | 主要有三个操作函数:载入rating训练集,rating测试集的正例和负例。 18 | 训练集之所以没有负例是因为在模型的训练过程中过随机负采样进行训练,而测试集测试提前负采样好了之后便于公正的验证结果。 19 | ''' 20 | 21 | def __init__(self, path): 22 | ''' 23 | Constructor 24 | ''' 25 | self.trainMatrix = self.load_rating_file_as_matrix(path + ".train.rating") 26 | self.testRatings = self.load_rating_file_as_list(path + ".test.rating") 27 | self.testNegatives = self.load_negative_file(path + ".test.negative") 28 | #需要强制测试集(positive instances正例)和负采样(negative instances负例)的大小一致 29 | assert len(self.testRatings) == len(self.testNegatives) 30 | 31 | self.num_users, self.num_items = self.trainMatrix.shape 32 | 33 | def load_rating_file_as_list(self, filename): 34 | ''' 35 | 正例,这个载入的数据集形式为userID\t itemID\t rating\t timestamp (if have) 36 | 其中没有使用时间戳这一属性。 37 | ''' 38 | ratingList = [] 39 | with open(filename, "r") as f: 40 | line = f.readline() 41 | while line != None and line != "": 42 | arr = line.split("\t") 43 | user, item = int(arr[0]), int(arr[1])#第一列和第二列分别为user和item的ID 44 | ratingList.append([user, item])#组合在一起放入到List中 45 | line = f.readline()#读下一行 46 | return ratingList 47 | 48 | def load_negative_file(self, filename): 49 | ''' 50 | 负例,一个test.rating的正例对应99个负例,形式为 (userID,itemID)\t negativeItemID1\t negativeItemID2 ... 51 | ''' 52 | negativeList = [] 53 | with open(filename, "r") as f: 54 | line = f.readline() 55 | while line != None and line != "": 56 | arr = line.split("\t") 57 | negatives = [] 58 | for x in arr[1: ]:#第一列是正例的(userID,itemID),[1:]是后面所有的负例 59 | negatives.append(int(x))#存该正例对应的所有负例 60 | negativeList.append(negatives) 61 | line = f.readline()#读下一行 62 | return negativeList 63 | 64 | def load_rating_file_as_matrix(self, filename): 65 | ''' 66 | 读训练集,返回稀疏矩阵(dok matrix),形式为userID\t itemID\t rating\t timestamp (if have) 67 | ''' 68 | #得到users和items的数目 69 | num_users, num_items = 0, 0 70 | with open(filename, "r") as f: 71 | line = f.readline() 72 | while line != None and line != "": 73 | arr = line.split("\t") 74 | u, i = int(arr[0]), int(arr[1]) 75 | #最大的id即是数目 76 | num_users = max(num_users, u) 77 | num_items = max(num_items, i) 78 | line = f.readline() 79 | #构建矩阵 80 | #使用dok(Dictionary Of Keys based sparse matrix)构建稀疏矩阵。使用字典保存非0值元素的(行,列)。 81 | mat = sp.dok_matrix((num_users+1, num_items+1), dtype=np.float32) 82 | with open(filename, "r") as f: 83 | line = f.readline() 84 | while line != None and line != "": 85 | arr = line.split("\t") 86 | user, item, rating = int(arr[0]), int(arr[1]), float(arr[2]) 87 | #评分大于0即认为是正例。ml中是(0,5,0.5)的分数,pinterest是用户'pin'或者'no pin'(即0-1)的lable。 88 | if (rating > 0): 89 | mat[user, item] = 1.0 90 | line = f.readline() 91 | return mat 92 | -------------------------------------------------------------------------------- /NCF/README.md: -------------------------------------------------------------------------------- 1 | # Neural Collaborative Filtering(NCF) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读与复现:https://blog.csdn.net/qq_39388410/article/details/85123127 6 | 7 | 建议阅读顺序:Dataset-->GMF-->evaluate-->MLP-->NeuMF 8 | 9 | # 10 | 11 | 原paper: 12 | Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu and Tat-Seng Chua (2017). Neural Collaborative Filtering. In Proceedings of WWW '17, Perth, Australia, April 03-07, 2017. 
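For readers following the Dataset-->GMF-->evaluate-->MLP-->NeuMF order above, a hedged sketch of how the final NeuMF model fuses its two branches (names and sizes are illustrative, not taken from GMF.py/MLP.py/NeuMF.py):

```python
import torch

def neumf_score(gmf_user, gmf_item, mlp_output, h):
    gmf_output = gmf_user * gmf_item                      # GMF branch: element-wise product
    fused = torch.cat([gmf_output, mlp_output], dim=-1)   # concatenate with the last MLP layer
    return torch.sigmoid(fused @ h)                       # single prediction layer h

# e.g. gmf_user, gmf_item, mlp_output: (batch, 8); h: (16, 1) -> scores in (0, 1)
```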
13 | 14 | 原code: 15 | https://github.com/hexiangnan/neural_collaborative_filtering 16 | -------------------------------------------------------------------------------- /NCF/evaluate.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 17, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import math 7 | import heapq #优先队列堆排最后的topK结果 8 | import multiprocessing 9 | import numpy as np 10 | from time import time 11 | #numba是一个用于编译Python数组和数值计算函数的编译器 12 | #from numba import jit, autojit 13 | 14 | #全局变量。main只调用evaluate_model,其他的函数都在evaluate_model中被调用。 15 | _model = None 16 | _testRatings = None 17 | _testNegatives = None 18 | _K = None 19 | 20 | def evaluate_model(model, testRatings, testNegatives, K, num_thread): 21 | """ 22 | 使用多线程处理得到 (Hit_Ratio, NDCG) 的评价指标分数。 23 | """ 24 | global _model 25 | global _testRatings 26 | global _testNegatives 27 | global _K 28 | _model = model 29 | _testRatings = testRatings 30 | _testNegatives = testNegatives 31 | _K = K 32 | 33 | hits, ndcgs = [],[] 34 | if(num_thread > 1): #多线程 35 | pool = multiprocessing.Pool(processes=num_thread) 36 | res = pool.map(eval_one_rating, range(len(_testRatings)))#map函数传入参数列表(所有的ID)给eval_one_rating 37 | pool.close() 38 | pool.join()#进程终结后要调用wait(join等同于wait,避免其成为僵尸进程) 39 | hits = [r[0] for r in res] 40 | ndcgs = [r[1] for r in res] 41 | return (hits, ndcgs) 42 | #单线程 43 | #xrange生成器,生成一个ID取出一个,不是一次生成整个列表,能优化内存。 44 | for idx in xrange(len(_testRatings)): 45 | (hr,ndcg) = eval_one_rating(idx)#对每个用户计算评价指标 46 | hits.append(hr) 47 | ndcgs.append(ndcg) 48 | return (hits, ndcgs) 49 | 50 | def eval_one_rating(idx): 51 | ''' 52 | 对单个用户用model预测,再topk计算评价指标。由于重要参数全局之后,只需要传入idx,同样是对性能的优化 53 | ''' 54 | #根据用户id,得到相应的item 55 | rating = _testRatings[idx] #正例对[user, item] 56 | items = _testNegatives[idx] #负例 57 | u = rating[0] 58 | gtItem = rating[1] 59 | items.append(gtItem)#把正例也放到负例列表中,方便计算。 60 | #利用model预测分数 61 | map_item_score = {}#存储该user对所有item的分数 62 | users = np.full(len(items), u, dtype = 'int32')#填充user矩阵,即[u,u,u,u,u,u...],一一对应user-item对放入model得到分数 63 | predictions = _model.predict([users, np.array(items)], 64 | batch_size=100, verbose=0) 65 | for i in xrange(len(items)): 66 | item = items[i] 67 | map_item_score[item] = predictions[i]#填充预测分数 68 | items.pop()#pop掉正例(感觉用不上...topk也是对map_item_score进行的) 69 | 70 | #检索topk 71 | #堆排速度快,能快速按照score得到topk的ID。 72 | ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get) 73 | #然后计算两个评估指标 74 | hr = getHitRatio(ranklist, gtItem) 75 | ndcg = getNDCG(ranklist, gtItem) 76 | return (hr, ndcg) 77 | 78 | def getHitRatio(ranklist, gtItem): 79 | #HR击中率,如果topk中有正例ID即认为正确 80 | for item in ranklist: 81 | if item == gtItem: 82 | return 1 83 | return 0 84 | 85 | def getNDCG(ranklist, gtItem): 86 | #NDCG归一化折损累计增益 87 | for i in xrange(len(ranklist)): 88 | item = ranklist[i] 89 | if item == gtItem: 90 | return math.log(2) / math.log(i+2) 91 | return 0 92 | -------------------------------------------------------------------------------- /NFM/README.md: -------------------------------------------------------------------------------- 1 | # Neural Factorization Machines 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读与复现:https://blog.csdn.net/qq_39388410/article/details/84958683 6 | 7 | 建议阅读顺序:FM-->NeuralFM 8 | 9 | # 10 | 11 | 原paper: Xiangnan He and Tat-Seng Chua (2017). Neural Factorization Machines for Sparse Predictive Analytics. In Proceedings of SIGIR '17, Shinjuku, Tokyo, Japan, August 07-11, 2017. 
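A hedged sketch of the Bi-Interaction pooling at the core of NFM (identifiers are illustrative, not from FM.py/NeuralFM.py); it reuses the FM sum-square minus square-sum identity to pool all pairwise feature interactions into one k-dimensional vector:

```python
import torch

def bi_interaction_pooling(feat_emb):
    # feat_emb: (batch, num_active_features, k) embeddings of the non-zero features
    square_of_sum = feat_emb.sum(dim=1) ** 2        # (sum_i v_i x_i)^2
    sum_of_square = (feat_emb ** 2).sum(dim=1)      # sum_i (v_i x_i)^2
    return 0.5 * (square_of_sum - sum_of_square)    # (batch, k)
```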
12 | 13 | 原code: https://github.com/hexiangnan/neural_factorization_machine 14 | -------------------------------------------------------------------------------- /NGCF/READEME.md: -------------------------------------------------------------------------------- 1 | # Neural Graph Collaborative Filtering(NGCF) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读:https://blog.csdn.net/qq_39388410/article/details/106970194 6 | 7 | # 8 | 原paper: Xiang Wang, Xiangnan He, Meng Wang, Fuli Feng, and Tat-Seng Chua (2019). Neural Graph Collaborative Filtering, Paper in ACM DL or Paper in arXiv. In SIGIR'19, Paris, France, July 21-25, 2019. 9 | 10 | 原code: https://github.com/xiangwang1223/neural_graph_collaborative_filtering 11 | -------------------------------------------------------------------------------- /NTM/GSM_run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | ''' 4 | Created on Jun 23, 2021 5 | @author: nakaizura 6 | ''' 7 | 8 | # NVDM-GSM是基于VAE的一种神经主题模型,模型代码的架构和VAE是类似的。 9 | # 输入词袋,然后建模分布,然后采样计算VAE的两大损失。 10 | 11 | import os 12 | import re 13 | import torch 14 | import pickle 15 | import argparse 16 | import logging 17 | import time 18 | from models import GSM 19 | from utils import * 20 | from dataset import DocDataset 21 | from multiprocessing import cpu_count 22 | #from torch.utils.data import Dataset,DataLoader 23 | 24 | parser = argparse.ArgumentParser('GSM topic model') 25 | parser.add_argument('--taskname',type=str,default='cnews10k',help='Taskname e.g cnews10k') 26 | parser.add_argument('--no_below',type=int,default=5,help='The lower bound of count for words to keep, e.g 10') 27 | parser.add_argument('--no_above',type=float,default=0.005,help='The ratio of upper bound of count for words to keep, e.g 0.3') 28 | parser.add_argument('--num_epochs',type=int,default=10,help='Number of iterations (set to 100 as default, but 1000+ is recommended.)') 29 | parser.add_argument('--n_topic',type=int,default=20,help='Num of topics') 30 | parser.add_argument('--bkpt_continue',type=bool,default=False,help='Whether to load a trained model as initialization and continue training.') 31 | parser.add_argument('--use_tfidf',type=bool,default=False,help='Whether to use the tfidf feature for the BOW input') 32 | parser.add_argument('--rebuild',action='store_true',help='Whether to rebuild the corpus, such as tokenization, build dict etc.(default False)') 33 | parser.add_argument('--batch_size',type=int,default=512,help='Batch size (default=512)') 34 | parser.add_argument('--criterion',type=str,default='cross_entropy',help='The criterion to calculate the loss, e.g cross_entropy, bce_softmax, bce_sigmoid') 35 | parser.add_argument('--auto_adj',action='store_true',help='To adjust the no_above ratio automatically (default:rm top 20)') 36 | parser.add_argument('--ckpt',type=str,default=None,help='Checkpoint path') 37 | 38 | args = parser.parse_args() #载入参数 39 | 40 | def main(): 41 | global args 42 | taskname = args.taskname # 数据集名字 43 | no_below = args.no_below # 文档频率小于阈值的词会被过滤掉 44 | no_above = args.no_above # 文档频率小于阈值的词将被过滤掉 45 | num_epochs = args.num_epochs # 训练周期 46 | n_topic = args.n_topic # 主题数 47 | n_cpu = cpu_count()-2 if cpu_count()>2 else 2 48 | bkpt_continue = args.bkpt_continue # 是否在之前的checkoint上继续训练 49 | use_tfidf = args.use_tfidf # 是否用tfidf作为BOW输入 50 | rebuild = args.rebuild # 是否重建语料,默认不会 51 | batch_size = args.batch_size # 批次大小 52 | criterion = args.criterion # loss的种类 53 | auto_adj = args.auto_adj # 是否自动调整频率,如去掉top20 54 
| ckpt = args.ckpt # ckpt路径 55 | 56 | device = torch.device('cpu') 57 | docSet = DocDataset(taskname,no_below=no_below,no_above=no_above,rebuild=rebuild,use_tfidf=False)# 载入数据集,并分词 58 | if auto_adj: 59 | no_above = docSet.topk_dfs(topk=20) 60 | docSet = DocDataset(taskname,no_below=no_below,no_above=no_above,rebuild=rebuild,use_tfidf=False) 61 | 62 | voc_size = docSet.vocabsize 63 | print('voc size:',voc_size) 64 | 65 | if ckpt:# 载入ckpt 66 | checkpoint=torch.load(ckpt) 67 | param.update({"device": device}) 68 | model = GSM(**param) 69 | model.train(train_data=docSet,batch_size=batch_size,test_data=docSet,num_epochs=num_epochs,log_every=10,beta=1.0,criterion=criterion,ckpt=checkpoint) 70 | else: 71 | # 初始化模型并开始执行train程序 72 | model = GSM(bow_dim=voc_size,n_topic=n_topic,taskname=taskname,device=device) 73 | model.train(train_data=docSet,batch_size=batch_size,test_data=docSet,num_epochs=num_epochs,log_every=10,beta=1.0,criterion=criterion) 74 | model.evaluate(test_data=docSet)# 用训练之后的模型做评估 75 | # 存模型,特征,统计等等结果 76 | save_name = f'./ckpt/GSM_{taskname}_tp{n_topic}_{time.strftime("%Y-%m-%d-%H-%M", time.localtime())}.ckpt' 77 | torch.save(model.vae.state_dict(),save_name) 78 | txt_lst, embeds = model.get_embed(train_data=docSet, num=1000) 79 | with open('topic_dist_gsm.txt','w',encoding='utf-8') as wfp: 80 | for t,e in zip(txt_lst,embeds): 81 | wfp.write(f'{e}:{t}\n') 82 | pickle.dump({'txts':txt_lst,'embeds':embeds},open('gsm_embeds.pkl','wb')) 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /NTM/README.md: -------------------------------------------------------------------------------- 1 | # Discovering Discrete Latent Topics with Neural Variational Inference(NVDM-GSM) 2 | 逐行源码阅读中文笔记。 3 | 4 | NVDM-GSM神经主题模型的经典实现,主要基于VAE来实现。 5 | 6 | # 7 | 原code实现了多种主题模型,而且相当轻便又好用! 
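Since NVDM-GSM is VAE-based, a hedged sketch of the two loss terms that the training script above balances with `beta` (the repo's actual criterion is selected via `--criterion`; this is not its exact code):

```python
import torch
import torch.nn.functional as F

def gsm_loss(recon_logits, bow, mu, log_var, beta=1.0):
    # reconstruction: cross-entropy between the decoded word distribution and the input BOW
    rec = -(bow * F.log_softmax(recon_logits, dim=1)).sum(1).mean()
    # closed-form KL( N(mu, var) || N(0, I) )
    kld = -0.5 * (1 + log_var - mu.pow(2) - log_var.exp()).sum(1).mean()
    return rec + beta * kld
```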
8 | 9 | 原code:https://github.com/zll17/Neural_Topic_Models 10 | -------------------------------------------------------------------------------- /NTM/vae.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | ''' 4 | Created on Jun 23, 2021 5 | @author: nakaizura 6 | ''' 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | # VAE model 14 | # 输入,建模分布的mu和var,采样得到向量,然后重建+KL约束 15 | class VAE(nn.Module): 16 | def __init__(self, encode_dims=[2000,1024,512,20],decode_dims=[20,1024,2000],dropout=0.0): 17 | 18 | super(VAE, self).__init__() 19 | self.encoder = nn.ModuleDict({ 20 | f'enc_{i}':nn.Linear(encode_dims[i],encode_dims[i+1]) 21 | for i in range(len(encode_dims)-2) 22 | }) 23 | self.fc_mu = nn.Linear(encode_dims[-2],encode_dims[-1]) # 学习mu和var 24 | self.fc_logvar = nn.Linear(encode_dims[-2],encode_dims[-1]) 25 | 26 | self.decoder = nn.ModuleDict({ 27 | f'dec_{i}':nn.Linear(decode_dims[i],decode_dims[i+1]) 28 | for i in range(len(decode_dims)-1) 29 | }) 30 | self.latent_dim = encode_dims[-1] 31 | self.dropout = nn.Dropout(p=dropout) 32 | self.fc1 = nn.Linear(encode_dims[-1],encode_dims[-1]) 33 | 34 | 35 | def encode(self, x):# 编码 36 | hid = x 37 | for i,layer in self.encoder.items():# 多层fc 38 | hid = F.relu(self.dropout(layer(hid))) 39 | mu, log_var = self.fc_mu(hid), self.fc_logvar(hid)# 得到mu和var 40 | return mu, log_var 41 | 42 | def inference(self,x):# 推断 43 | mu, log_var = self.encode(x)# 得到分布 44 | theta = torch.softmax(x,dim=1)# 得到向量 45 | return theta 46 | 47 | def reparameterize(self, mu, log_var):# 重参数技巧,使训练可微 48 | std = torch.exp(log_var/2) 49 | eps = torch.randn_like(std)# 采样 50 | z = mu + eps * std 51 | return z 52 | 53 | def decode(self, z):# 解码 54 | hid = z 55 | for i,(_,layer) in enumerate(self.decoder.items()):# 多层fc 56 | hid = layer(hid) 57 | if inon-local-embedded_gaussian.py-->non-local_concatenation.py-->non-local-dot_product.py-->non-local-gaussian.py(后面三个都是embedded_gaussian的变体,计算Attention有点不一样) 8 | 9 | # 10 | 11 | 原paper: Non-local Neural Networks 12 | 13 | 原code: https://github.com/AlexHex7/Non-local_pytorch 14 | -------------------------------------------------------------------------------- /Non-local/network.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 3, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | from torch import nn 7 | # from lib.non_local_concatenation import NONLocalBlock2D 8 | # from lib.non_local_gaussian import NONLocalBlock2D 9 | from lib.non_local_embedded_gaussian import NONLocalBlock2D 10 | # from lib.non_local_dot_product import NONLocalBlock2D 11 | 12 | 13 | #non-local的总框架 14 | class Network(nn.Module): 15 | def __init__(self): 16 | super(Network, self).__init__() 17 | 18 | self.convs = nn.Sequential( 19 | nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1), 20 | nn.BatchNorm2d(32), 21 | nn.ReLU(), 22 | nn.MaxPool2d(2), 23 | 24 | #插入了NONLocal的模块,其他的与普通分类网络差不多 25 | NONLocalBlock2D(in_channels=32), 26 | nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1), 27 | nn.BatchNorm2d(64), 28 | nn.ReLU(), 29 | nn.MaxPool2d(2), 30 | 31 | NONLocalBlock2D(in_channels=64), 32 | nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1), 33 | nn.BatchNorm2d(128), 34 | nn.ReLU(), 35 | nn.MaxPool2d(2), 36 | ) 37 | 38 | self.fc = nn.Sequential( 39 | nn.Linear(in_features=128*3*3, out_features=256), 40 | 
nn.ReLU(), 41 | nn.Dropout(0.5), 42 | 43 | nn.Linear(in_features=256, out_features=10) 44 | ) 45 | 46 | def forward(self, x): 47 | batch_size = x.size(0) 48 | output = self.convs(x).view(batch_size, -1) 49 | output = self.fc(output) 50 | return output 51 | 52 | if __name__ == '__main__': 53 | import torch 54 | 55 | img = torch.randn(3, 1, 28, 28)#输入的图像 56 | net = Network() 57 | out = net(img) 58 | print(out.size()) 59 | -------------------------------------------------------------------------------- /Non-local/non_local_dot_product.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 3, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import torch 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | #与embedded_gaussian的区别是,点乘之后没有用softmax(直接看forward) 11 | 12 | class _NonLocalBlockND(nn.Module): 13 | def __init__(self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True): 14 | super(_NonLocalBlockND, self).__init__() 15 | 16 | assert dimension in [1, 2, 3] 17 | 18 | self.dimension = dimension 19 | self.sub_sample = sub_sample 20 | 21 | self.in_channels = in_channels 22 | self.inter_channels = inter_channels 23 | 24 | if self.inter_channels is None: 25 | self.inter_channels = in_channels // 2 26 | if self.inter_channels == 0: 27 | self.inter_channels = 1 28 | 29 | if dimension == 3: 30 | conv_nd = nn.Conv3d 31 | max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2)) 32 | bn = nn.BatchNorm3d 33 | elif dimension == 2: 34 | conv_nd = nn.Conv2d 35 | max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2)) 36 | bn = nn.BatchNorm2d 37 | else: 38 | conv_nd = nn.Conv1d 39 | max_pool_layer = nn.MaxPool1d(kernel_size=(2)) 40 | bn = nn.BatchNorm1d 41 | 42 | self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, 43 | kernel_size=1, stride=1, padding=0) 44 | 45 | if bn_layer: 46 | self.W = nn.Sequential( 47 | conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, 48 | kernel_size=1, stride=1, padding=0), 49 | bn(self.in_channels) 50 | ) 51 | nn.init.constant_(self.W[1].weight, 0) 52 | nn.init.constant_(self.W[1].bias, 0) 53 | else: 54 | self.W = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, 55 | kernel_size=1, stride=1, padding=0) 56 | nn.init.constant_(self.W.weight, 0) 57 | nn.init.constant_(self.W.bias, 0) 58 | 59 | self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, 60 | kernel_size=1, stride=1, padding=0) 61 | 62 | self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, 63 | kernel_size=1, stride=1, padding=0) 64 | 65 | if sub_sample: 66 | self.g = nn.Sequential(self.g, max_pool_layer) 67 | self.phi = nn.Sequential(self.phi, max_pool_layer) 68 | 69 | def forward(self, x): 70 | ''' 71 | :param x: (b, c, t, h, w) 72 | :return: 73 | ''' 74 | 75 | batch_size = x.size(0) 76 | 77 | #代表注意力V,Q,K的g,theta,phi都要做同样的嵌入操作 78 | g_x = self.g(x).view(batch_size, self.inter_channels, -1) 79 | g_x = g_x.permute(0, 2, 1) 80 | 81 | theta_x = self.theta(x).view(batch_size, self.inter_channels, -1) 82 | theta_x = theta_x.permute(0, 2, 1) 83 | phi_x = self.phi(x).view(batch_size, self.inter_channels, -1) 84 | f = torch.matmul(theta_x, phi_x)#然后theta_x, phi_x做内积,算像素相似度 85 | N = f.size(-1) 86 | f_div_C = f / N #不是softmax,直接除维度 87 | 88 | #其他的地方一样 89 | y = torch.matmul(f_div_C, g_x) 90 | y = y.permute(0, 2, 1).contiguous() 91 | y = y.view(batch_size, self.inter_channels, *x.size()[2:]) 92 | W_y = self.W(y) 93 
| z = W_y + x 94 | 95 | return z 96 | 97 | 98 | class NONLocalBlock1D(_NonLocalBlockND): 99 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): 100 | super(NONLocalBlock1D, self).__init__(in_channels, 101 | inter_channels=inter_channels, 102 | dimension=1, sub_sample=sub_sample, 103 | bn_layer=bn_layer) 104 | 105 | 106 | class NONLocalBlock2D(_NonLocalBlockND): 107 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): 108 | super(NONLocalBlock2D, self).__init__(in_channels, 109 | inter_channels=inter_channels, 110 | dimension=2, sub_sample=sub_sample, 111 | bn_layer=bn_layer) 112 | 113 | 114 | class NONLocalBlock3D(_NonLocalBlockND): 115 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): 116 | super(NONLocalBlock3D, self).__init__(in_channels, 117 | inter_channels=inter_channels, 118 | dimension=3, sub_sample=sub_sample, 119 | bn_layer=bn_layer) 120 | 121 | 122 | if __name__ == '__main__': 123 | import torch 124 | 125 | for (sub_sample, bn_layer) in [(True, True), (False, False), (True, False), (False, True)]: 126 | img = torch.zeros(2, 3, 20) 127 | net = NONLocalBlock1D(3, sub_sample=sub_sample, bn_layer=bn_layer) 128 | out = net(img) 129 | print(out.size()) 130 | 131 | img = torch.zeros(2, 3, 20, 20) 132 | net = NONLocalBlock2D(3, sub_sample=sub_sample, bn_layer=bn_layer) 133 | out = net(img) 134 | print(out.size()) 135 | 136 | img = torch.randn(2, 3, 8, 20, 20) 137 | net = NONLocalBlock3D(3, sub_sample=sub_sample, bn_layer=bn_layer) 138 | out = net(img) 139 | print(out.size()) 140 | -------------------------------------------------------------------------------- /Non-local/non_local_gaussian.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 3, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import torch 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | 11 | #与embedded_gaussian的区别是,1x1变max pooling(直接看forward) 12 | 13 | class _NonLocalBlockND(nn.Module): 14 | def __init__(self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True): 15 | super(_NonLocalBlockND, self).__init__() 16 | 17 | assert dimension in [1, 2, 3] 18 | 19 | self.dimension = dimension 20 | self.sub_sample = sub_sample 21 | 22 | self.in_channels = in_channels 23 | self.inter_channels = inter_channels 24 | 25 | if self.inter_channels is None: 26 | self.inter_channels = in_channels // 2 27 | if self.inter_channels == 0: 28 | self.inter_channels = 1 29 | 30 | if dimension == 3: 31 | conv_nd = nn.Conv3d 32 | max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2)) 33 | bn = nn.BatchNorm3d 34 | elif dimension == 2: 35 | conv_nd = nn.Conv2d 36 | max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2)) 37 | bn = nn.BatchNorm2d 38 | else: 39 | conv_nd = nn.Conv1d 40 | max_pool_layer = nn.MaxPool1d(kernel_size=(2)) 41 | bn = nn.BatchNorm1d 42 | 43 | self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, 44 | kernel_size=1, stride=1, padding=0) 45 | 46 | if bn_layer: 47 | self.W = nn.Sequential( 48 | conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, 49 | kernel_size=1, stride=1, padding=0), 50 | bn(self.in_channels) 51 | ) 52 | nn.init.constant_(self.W[1].weight, 0) 53 | nn.init.constant_(self.W[1].bias, 0) 54 | else: 55 | self.W = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, 56 | kernel_size=1, stride=1, padding=0) 57 | nn.init.constant_(self.W.weight, 0) 
58 | nn.init.constant_(self.W.bias, 0) 59 | 60 | if sub_sample: 61 | self.g = nn.Sequential(self.g, max_pool_layer) 62 | self.phi = max_pool_layer 63 | 64 | def forward(self, x): 65 | ''' 66 | :param x: (b, c, t, h, w) 67 | :return: 68 | ''' 69 | 70 | batch_size = x.size(0) 71 | 72 | #代表注意力V,Q,K的g,theta,phi 73 | g_x = self.g(x).view(batch_size, self.inter_channels, -1) 74 | g_x = g_x.permute(0, 2, 1) 75 | 76 | #差别在这里,Q,K都没有嵌入,然后算点积 77 | theta_x = x.view(batch_size, self.in_channels, -1) 78 | theta_x = theta_x.permute(0, 2, 1) 79 | 80 | 81 | if self.sub_sample:#phi变成max_pool_layer 82 | phi_x = self.phi(x).view(batch_size, self.in_channels, -1) 83 | else: 84 | phi_x = x.view(batch_size, self.in_channels, -1) 85 | 86 | #其他一致 87 | f = torch.matmul(theta_x, phi_x) 88 | f_div_C = F.softmax(f, dim=-1) 89 | 90 | y = torch.matmul(f_div_C, g_x) 91 | y = y.permute(0, 2, 1).contiguous() 92 | y = y.view(batch_size, self.inter_channels, *x.size()[2:]) 93 | W_y = self.W(y) 94 | z = W_y + x 95 | 96 | return z 97 | 98 | 99 | class NONLocalBlock1D(_NonLocalBlockND): 100 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): 101 | super(NONLocalBlock1D, self).__init__(in_channels, 102 | inter_channels=inter_channels, 103 | dimension=1, sub_sample=sub_sample, 104 | bn_layer=bn_layer) 105 | 106 | 107 | class NONLocalBlock2D(_NonLocalBlockND): 108 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): 109 | super(NONLocalBlock2D, self).__init__(in_channels, 110 | inter_channels=inter_channels, 111 | dimension=2, sub_sample=sub_sample, 112 | bn_layer=bn_layer) 113 | 114 | 115 | class NONLocalBlock3D(_NonLocalBlockND): 116 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): 117 | super(NONLocalBlock3D, self).__init__(in_channels, 118 | inter_channels=inter_channels, 119 | dimension=3, sub_sample=sub_sample, 120 | bn_layer=bn_layer) 121 | 122 | 123 | if __name__ == '__main__': 124 | import torch 125 | 126 | for (sub_sample, bn_layer) in [(True, True), (False, False), (True, False), (False, True)]: 127 | img = torch.zeros(2, 3, 20) 128 | net = NONLocalBlock1D(3, sub_sample=sub_sample, bn_layer=bn_layer) 129 | out = net(img) 130 | print(out.size()) 131 | 132 | img = torch.zeros(2, 3, 20, 20) 133 | net = NONLocalBlock2D(3, sub_sample=sub_sample, bn_layer=bn_layer) 134 | out = net(img) 135 | print(out.size()) 136 | 137 | img = torch.randn(2, 3, 8, 20, 20) 138 | net = NONLocalBlock3D(3, sub_sample=sub_sample, bn_layer=bn_layer) 139 | out = net(img) 140 | print(out.size()) 141 | -------------------------------------------------------------------------------- /ONCF/Dataset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 19, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import scipy.sparse as sp 7 | import numpy as np 8 | from time import time 9 | 10 | #scipy.sparse库中提供了多种表示稀疏矩阵的格式,同时支持稀疏矩阵的加、减、乘、除和幂等。 11 | #和NCF处理数据的方式很像 12 | 13 | 14 | class Dataset(object): 15 | ''' 16 | 数据集类,用于载入数据。 17 | trainMatrix: load rating records as sparse matrix for class Data 18 | trianList: load rating records as list to speed up user's feature retrieval 19 | testRatings: load leave-one-out rating test for class Evaluate 20 | testNegatives: sample the items not rated by user 21 | ''' 22 | 23 | def __init__(self, path): 24 | ''' 25 | Constructor 26 | ''' 27 | self.trainMatrix = self.load_training_file_as_matrix(path + ".train.rating") 28 | self.trainList = 
self.load_training_file_as_list(path + ".train.rating") 29 | self.testRatings = self.load_rating_file_as_list(path + ".test.rating") 30 | self.testNegatives = self.load_negative_file(path + ".test.negative") 31 | #需要强制测试集(positive instances正例)和负采样(negative instances负例)的大小一致 32 | assert len(self.testRatings) == len(self.testNegatives) 33 | self.num_users, self.num_items = self.trainMatrix.shape 34 | 35 | def load_rating_file_as_list(self, filename): 36 | ''' 37 | 正例,这个载入的数据集形式为userID\t itemID\t rating\t timestamp (if have) 38 | 其中没有使用时间戳这一属性。 39 | ''' 40 | ratingList = [] 41 | with open(filename, "r") as f: 42 | line = f.readline() 43 | while line != None and line != "": 44 | arr = line.split("\t") 45 | user, item = int(arr[0]), int(arr[1])#第一列和第二列分别为user和item的ID 46 | ratingList.append([user, item])#组合在一起放入到List中 47 | line = f.readline()#读下一行 48 | return ratingList 49 | 50 | def load_negative_file(self, filename): 51 | ''' 52 | 负例,一个test.rating的正例对应999个负例,形式为 (userID,itemID)\t negativeItemID1\t negativeItemID2 ... 53 | ''' 54 | negativeList = [] 55 | with open(filename, "r") as f: 56 | line = f.readline() 57 | while line != None and line != "": 58 | arr = line.split("\t") 59 | negatives = [] 60 | for x in arr[1: ]:#第一列是正例的(userID,itemID),[1:]是后面所有的负例 61 | negatives.append(int(x))#存该正例对应的所有负例 62 | negativeList.append(negatives) 63 | line = f.readline()#读下一行 64 | return negativeList 65 | 66 | def load_training_file_as_matrix(self, filename): 67 | ''' 68 | 读训练集,返回稀疏矩阵(dok matrix),形式为userID\t itemID\t rating\t timestamp (if have) 69 | ''' 70 | #得到users和items的数目 71 | num_users, num_items = 0, 0 72 | with open(filename, "r") as f: 73 | line = f.readline() 74 | while line != None and line != "": 75 | arr = line.split("\t") 76 | u, i = int(arr[0]), int(arr[1]) 77 | #最大的id即是数目 78 | num_users = max(num_users, u) 79 | num_items = max(num_items, i) 80 | line = f.readline() 81 | #构建矩阵 82 | #使用dok(Dictionary Of Keys based sparse matrix)构建稀疏矩阵。使用字典保存非0值元素的(行,列)。 83 | mat = sp.dok_matrix((num_users+1, num_items+1), dtype=np.float32) 84 | with open(filename, "r") as f: 85 | line = f.readline() 86 | while line != None and line != "": 87 | arr = line.split("\t") 88 | user, item, rating = int(arr[0]), int(arr[1]), float(arr[2]) 89 | #评分大于0即认为是正例。 90 | if (rating > 0): 91 | mat[user, item] = 1.0 92 | line = f.readline() 93 | print ("already load the trainMatrix...") 94 | return mat 95 | 96 | def load_training_file_as_list(self, filename): 97 | #得到users和items的数量 98 | u_ = 0 99 | lists, items = [], [] 100 | with open(filename, "r") as f: 101 | line = f.readline() 102 | index = 0 103 | while line != None and line != "": 104 | arr = line.split("\t") 105 | u, i = int(arr[0]), int(arr[1]) 106 | if u_ < u: 107 | index = 0 108 | lists.append(items) 109 | items = [] 110 | u_ += 1 111 | index += 1 112 | #if index<300: 113 | items.append(i) 114 | line = f.readline() 115 | lists.append(items) 116 | print ("already load the trainList...") 117 | return lists 118 | -------------------------------------------------------------------------------- /ONCF/README.md: -------------------------------------------------------------------------------- 1 | # Outer Product-based Neural Collaborative Filtering(ONCF) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读与复现:https://blog.csdn.net/qq_39388410/article/details/85209033 6 | 7 | 建议阅读顺序:Dataset-->MF_BPR-->saver-->ConvNCF 8 | 9 | # 10 | 11 | 原paper: Xiangnan He, Xiaoyu Du, Xiang Wang, Feng Tian, Jinhui Tang, Tat-Seng Chua, Outer Product-based Neural Collaborative Filtering, In Proceedings of IJCAI'18. 
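A hedged sketch of the outer-product step that gives ONCF/ConvNCF its name (illustrative code, not taken from ConvNCF.py): the k-dimensional user and item embeddings form a k x k interaction map, which a small CNN then processes instead of a plain inner product:

```python
import torch

def interaction_map(user_emb, item_emb):
    # user_emb, item_emb: (batch, k)  ->  (batch, 1, k, k) single-channel "image" for the CNN
    return torch.einsum('bi,bj->bij', user_emb, item_emb).unsqueeze(1)
```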
12 | 13 | 原code: https://github.com/duxy-me/ConvNCF 14 | -------------------------------------------------------------------------------- /ONCF/saver.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 19, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | #saver用于保存嵌入参数P和Q。 7 | 8 | import numpy as np 9 | 10 | class Saver(object): 11 | def save(self, model, sess): 12 | Null 13 | 14 | class MFSaver(Saver): 15 | def __init__(self): 16 | self.prefix = None 17 | 18 | def setPrefix(self, prefix = None): 19 | self.prefix = prefix 20 | 21 | def save(self, model, sess): 22 | if self.prefix == None: 23 | print ("prefix should be set by GMFSaver.setPrefix(prefix)") 24 | return 25 | 26 | params = sess.run([model.embedding_P, model.embedding_Q]) 27 | print ('saving model.embedding_P', params[0].shape, ', model.embedding_Q', params[1].shape,\ 28 | ' to', self.prefix, "_*.txt") 29 | 30 | f = open(self.prefix + "_P.txt", 'w') 31 | np.savetxt(f, params[0]) 32 | f.close() 33 | 34 | f = open(self.prefix + "_Q.txt", 'w') 35 | np.savetxt(f, params[1]) 36 | f.close() 37 | 38 | 39 | class GMFSaver(Saver): 40 | def __init__(self): 41 | self.prefix = None 42 | 43 | def setPrefix(self, prefix = None): 44 | self.prefix = prefix 45 | -------------------------------------------------------------------------------- /OpenPrompt/README.md: -------------------------------------------------------------------------------- 1 | # OpenPrompt 2 | 清华推出了prompt-tuning工具包,相当好用。 3 | 4 | 博主自己在学习的时候做了一些注释,放在这里。 5 | 6 | # 7 | 8 | prompt相关论文和前沿进展在欢迎访问博客:https://nakaizura.blog.csdn.net/article/details/107898237 9 | 10 | # 11 | 原项目:https://github.com/thunlp/OpenPrompt 12 | -------------------------------------------------------------------------------- /RippleNet/README.md: -------------------------------------------------------------------------------- 1 | # RippleNet: Propagating User Preferences on the Knowledge Graph for Recommender Systems 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读:https://blog.csdn.net/qq_39388410/article/details/106430685 6 | 7 | 建议阅读顺序:preprocess-->data_loader-->model-->train-->main 8 | 9 | # 10 | 11 | 原paper: Hongwei Wang, Fuzheng Zhang, Jialin Wang, Miao Zhao, Wenjie Li, Xing Xie, Minyi Guo 12 | 13 | 原code: https://github.com/hwwang55/RippleNet 14 | -------------------------------------------------------------------------------- /RippleNet/data_loader.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 29, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import collections 7 | import os 8 | import numpy as np 9 | 10 | #按照preprocess的处理结果得到两个文件kg_final.txt和rating_final.txt 11 | #KG的数据集格式是三元组:h,r,t 12 | #推荐的评分数据集格式是:userid,itemid,rating(只有0和1,正例or负例) 13 | 14 | def load_data(args): 15 | train_data, eval_data, test_data, user_history_dict = load_rating(args) 16 | n_entity, n_relation, kg = load_kg(args) 17 | ripple_set = get_ripple_set(args, kg, user_history_dict) 18 | return train_data, eval_data, test_data, n_entity, n_relation, ripple_set 19 | 20 | 21 | def load_rating(args): 22 | print('reading rating file ...') 23 | 24 | #载入评分数据集 25 | rating_file = '../data/' + args.dataset + '/ratings_final' 26 | if os.path.exists(rating_file + '.npy'): 27 | rating_np = np.load(rating_file + '.npy') 28 | else: 29 | rating_np = np.loadtxt(rating_file + '.txt', dtype=np.int32) 30 | np.save(rating_file + '.npy', rating_np) 31 | 32 | # n_user = len(set(rating_np[:, 0])) 33 | # n_item = len(set(rating_np[:, 1])) 34 | return 
dataset_split(rating_np)#分割数据集6:2:2 35 | 36 | 37 | def dataset_split(rating_np): 38 | print('splitting dataset ...') 39 | 40 | # train:eval:test = 6:2:2 41 | eval_ratio = 0.2 42 | test_ratio = 0.2 43 | n_ratings = rating_np.shape[0] 44 | 45 | # 6:2:2是随机选择切分的 46 | eval_indices = np.random.choice(n_ratings, size=int(n_ratings * eval_ratio), replace=False) 47 | left = set(range(n_ratings)) - set(eval_indices) 48 | test_indices = np.random.choice(list(left), size=int(n_ratings * test_ratio), replace=False) 49 | train_indices = list(left - set(test_indices)) 50 | # print(len(train_indices), len(eval_indices), len(test_indices)) 51 | 52 | #只保留训练集中有历史评分的用户 53 | user_history_dict = dict() 54 | for i in train_indices:#遍历训练集 55 | #数据集的格式是userid,itemid,rating 56 | user = rating_np[i][0] 57 | item = rating_np[i][1] 58 | rating = rating_np[i][2] 59 | if rating == 1: 60 | if user not in user_history_dict: 61 | user_history_dict[user] = [] 62 | user_history_dict[user].append(item)#把item放入该用户的历史记录中 63 | 64 | #按照user_history_dict对其他两个数据也进行处理 65 | train_indices = [i for i in train_indices if rating_np[i][0] in user_history_dict] 66 | eval_indices = [i for i in eval_indices if rating_np[i][0] in user_history_dict] 67 | test_indices = [i for i in test_indices if rating_np[i][0] in user_history_dict] 68 | # print(len(train_indices), len(eval_indices), len(test_indices)) 69 | 70 | train_data = rating_np[train_indices] 71 | eval_data = rating_np[eval_indices] 72 | test_data = rating_np[test_indices] 73 | 74 | return train_data, eval_data, test_data, user_history_dict 75 | 76 | 77 | def load_kg(args): 78 | print('reading KG file ...') 79 | 80 | #载入KG数据集 81 | kg_file = '../data/' + args.dataset + '/kg_final' 82 | if os.path.exists(kg_file + '.npy'): 83 | kg_np = np.load(kg_file + '.npy') 84 | else: 85 | kg_np = np.loadtxt(kg_file + '.txt', dtype=np.int32) 86 | np.save(kg_file + '.npy', kg_np) 87 | 88 | n_entity = len(set(kg_np[:, 0]) | set(kg_np[:, 2])) 89 | n_relation = len(set(kg_np[:, 1])) 90 | 91 | kg = construct_kg(kg_np)#构建KG 92 | 93 | return n_entity, n_relation, kg 94 | 95 | 96 | def construct_kg(kg_np): 97 | print('constructing knowledge graph ...') 98 | kg = collections.defaultdict(list) 99 | for head, relation, tail in kg_np: 100 | kg[head].append((tail, relation))#就是按照head建立字典,将尾节点和关系放入到字典中 101 | return kg 102 | 103 | 104 | #ripple多跳时,每跳的结果集 105 | def get_ripple_set(args, kg, user_history_dict): 106 | print('constructing ripple set ...') 107 | 108 | # user -> [(hop_0_heads, hop_0_relations, hop_0_tails), (hop_1_heads, hop_1_relations, hop_1_tails), ...] 
109 | ripple_set = collections.defaultdict(list) 110 | 111 | for user in user_history_dict:#对于每个用户 112 | for h in range(args.n_hop):#该用户的兴趣在KG多跳hop中 113 | memories_h = [] 114 | memories_r = [] 115 | memories_t = [] 116 | 117 | if h == 0:#如果不传播,上一跳的结果就直接是该用户的历史记录 118 | tails_of_last_hop = user_history_dict[user] 119 | else:#去除上一跳的记录 120 | tails_of_last_hop = ripple_set[user][-1][2] 121 | 122 | #去除上一跳的三元组特征 123 | for entity in tails_of_last_hop: 124 | for tail_and_relation in kg[entity]: 125 | memories_h.append(entity) 126 | memories_r.append(tail_and_relation[1]) 127 | memories_t.append(tail_and_relation[0]) 128 | 129 | # if the current ripple set of the given user is empty, we simply copy the ripple set of the last hop here 130 | # this won't happen for h = 0, because only the items that appear in the KG have been selected 131 | # this only happens on 154 users in Book-Crossing dataset (since both BX dataset and the KG are sparse) 132 | if len(memories_h) == 0: 133 | ripple_set[user].append(ripple_set[user][-1]) 134 | else: 135 | #为每个用户采样固定大小的邻居 136 | replace = len(memories_h) < args.n_memory 137 | indices = np.random.choice(len(memories_h), size=args.n_memory, replace=replace) 138 | memories_h = [memories_h[i] for i in indices] 139 | memories_r = [memories_r[i] for i in indices] 140 | memories_t = [memories_t[i] for i in indices] 141 | ripple_set[user].append((memories_h, memories_r, memories_t)) 142 | 143 | return ripple_set 144 | -------------------------------------------------------------------------------- /RippleNet/main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 29, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import argparse 7 | import numpy as np 8 | from data_loader import load_data 9 | from train import train 10 | 11 | np.random.seed(555)#可复现随机种子 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--dataset', type=str, default='movie', help='which dataset to use')#数据集 15 | parser.add_argument('--dim', type=int, default=16, help='dimension of entity and relation embeddings')#嵌入维度 16 | parser.add_argument('--n_hop', type=int, default=2, help='maximum hops')#跳数 17 | parser.add_argument('--kge_weight', type=float, default=0.01, help='weight of the KGE term')#kge权重 18 | parser.add_argument('--l2_weight', type=float, default=1e-7, help='weight of the l2 regularization term')#L2 19 | parser.add_argument('--lr', type=float, default=0.02, help='learning rate')#学习率 20 | parser.add_argument('--batch_size', type=int, default=1024, help='batch size')#批次大小 21 | parser.add_argument('--n_epoch', type=int, default=10, help='the number of epochs')#周期 22 | parser.add_argument('--n_memory', type=int, default=32, help='size of ripple set for each hop')#每一跳的记忆大小 23 | parser.add_argument('--item_update_mode', type=str, default='plus_transform', 24 | help='how to update item at the end of each hop')#更新每一跳结果的方式 25 | parser.add_argument('--using_all_hops', type=bool, default=True, 26 | help='whether using outputs of all hops or just the last hop when making prediction')#得到user向量的方式 27 | 28 | #后两维的参数都在model函数中发挥作用。 29 | ''' 30 | # default settings for Book-Crossing 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--dataset', type=str, default='book', help='which dataset to use') 33 | parser.add_argument('--dim', type=int, default=4, help='dimension of entity and relation embeddings') 34 | parser.add_argument('--n_hop', type=int, default=2, help='maximum hops') 35 | parser.add_argument('--kge_weight', 
type=float, default=1e-2, help='weight of the KGE term') 36 | parser.add_argument('--l2_weight', type=float, default=1e-5, help='weight of the l2 regularization term') 37 | parser.add_argument('--lr', type=float, default=1e-3, help='learning rate') 38 | parser.add_argument('--batch_size', type=int, default=1024, help='batch size') 39 | parser.add_argument('--n_epoch', type=int, default=10, help='the number of epochs') 40 | parser.add_argument('--n_memory', type=int, default=32, help='size of ripple set for each hop') 41 | parser.add_argument('--item_update_mode', type=str, default='plus_transform', 42 | help='how to update item at the end of each hop') 43 | parser.add_argument('--using_all_hops', type=bool, default=True, 44 | help='whether using outputs of all hops or just the last hop when making prediction') 45 | ''' 46 | 47 | args = parser.parse_args() 48 | 49 | show_loss = False 50 | data_info = load_data(args) 51 | train(args, data_info, show_loss) 52 | -------------------------------------------------------------------------------- /RippleNet/train.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 29, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import tensorflow as tf 7 | import numpy as np 8 | from model import RippleNet 9 | 10 | 11 | def train(args, data_info, show_loss): 12 | train_data = data_info[0] 13 | eval_data = data_info[1] 14 | test_data = data_info[2] 15 | n_entity = data_info[3] 16 | n_relation = data_info[4] 17 | ripple_set = data_info[5] 18 | 19 | model = RippleNet(args, n_entity, n_relation)#实例化模型 20 | 21 | with tf.Session() as sess: 22 | sess.run(tf.global_variables_initializer()) 23 | for step in range(args.n_epoch): 24 | # training,开始训练 25 | np.random.shuffle(train_data)#打乱训练数据 26 | start = 0 27 | while start < train_data.shape[0]:#计算loss 28 | _, loss = model.train( 29 | sess, get_feed_dict(args, model, train_data, ripple_set, start, start + args.batch_size)) 30 | start += args.batch_size 31 | if show_loss: 32 | print('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss)) 33 | 34 | # evaluation,开始在三个数据集上进行评估 35 | train_auc, train_acc = evaluation(sess, args, model, train_data, ripple_set, args.batch_size) 36 | eval_auc, eval_acc = evaluation(sess, args, model, eval_data, ripple_set, args.batch_size) 37 | test_auc, test_acc = evaluation(sess, args, model, test_data, ripple_set, args.batch_size) 38 | 39 | print('epoch %d train auc: %.4f acc: %.4f eval auc: %.4f acc: %.4f test auc: %.4f acc: %.4f' 40 | % (step, train_auc, train_acc, eval_auc, eval_acc, test_auc, test_acc)) 41 | 42 | 43 | def get_feed_dict(args, model, data, ripple_set, start, end): 44 | feed_dict = dict() 45 | feed_dict[model.items] = data[start:end, 1] 46 | feed_dict[model.labels] = data[start:end, 2] 47 | for i in range(args.n_hop):#喂入ripple_set每一跳的结果 48 | feed_dict[model.memories_h[i]] = [ripple_set[user][i][0] for user in data[start:end, 0]] 49 | feed_dict[model.memories_r[i]] = [ripple_set[user][i][1] for user in data[start:end, 0]] 50 | feed_dict[model.memories_t[i]] = [ripple_set[user][i][2] for user in data[start:end, 0]] 51 | return feed_dict 52 | 53 | 54 | def evaluation(sess, args, model, data, ripple_set, batch_size): 55 | start = 0 56 | auc_list = [] 57 | acc_list = [] 58 | while start < data.shape[0]:#只对测试集进行评估 59 | auc, acc = model.eval(sess, get_feed_dict(args, model, data, ripple_set, start, start + batch_size)) 60 | auc_list.append(auc) 61 | acc_list.append(acc) 62 | start += batch_size 63 | return 
float(np.mean(auc_list)), float(np.mean(acc_list)) 64 | -------------------------------------------------------------------------------- /S2VT/Attention.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 14, 2021 3 | @author: nakaizura 4 | ''' 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | 10 | class Attention(nn.Module): 11 | """ 12 | 注意力机制用到从decoder出来的特征上,即为了更好的学习上下文特征 13 | """ 14 | 15 | def __init__(self, dim): 16 | super(Attention, self).__init__() 17 | self.dim = dim 18 | self.linear1 = nn.Linear(dim * 2, dim) 19 | self.linear2 = nn.Linear(dim, 1, bias=False) 20 | #self._init_hidden() 21 | 22 | def _init_hidden(self): 23 | #xavier初始化 24 | nn.init.xavier_normal_(self.linear1.weight) 25 | nn.init.xavier_normal_(self.linear2.weight) 26 | 27 | def forward(self, hidden_state, encoder_outputs): 28 | """ 29 | 输入的参数: 30 | hidden_state {Variable} -- batch_size x dim 31 | encoder_outputs {Variable} -- batch_size x seq_len x dim 32 | 返回的结果: 33 | Variable -- context vector of size batch_size x dim 34 | """ 35 | batch_size, seq_len, _ = encoder_outputs.size() #得到Encoder出来的中间层维度 36 | hidden_state = hidden_state.unsqueeze(1).repeat(1, seq_len, 1)#多增加seq_len的维度以便后面concat 37 | #拼接Encoder和hidden向量并展平 38 | inputs = torch.cat((encoder_outputs, hidden_state), 39 | 2).view(-1, self.dim * 2) 40 | #两层FC 41 | o = self.linear2(F.tanh(self.linear1(inputs))) 42 | e = o.view(batch_size, seq_len) 43 | alpha = F.softmax(e, dim=1) 44 | context = torch.bmm(alpha.unsqueeze(1), encoder_outputs).squeeze(1) #返回上下文 45 | return context 46 | -------------------------------------------------------------------------------- /S2VT/EncoderRNN.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 14, 2021 3 | @author: nakaizura 4 | ''' 5 | import torch.nn as nn 6 | 7 | #编码器负责表示输入的视频特征 8 | 9 | class EncoderRNN(nn.Module): 10 | def __init__(self, dim_vid, dim_hidden, input_dropout_p=0.2, rnn_dropout_p=0.5, 11 | n_layers=1, bidirectional=False, rnn_cell='gru'): 12 | super(EncoderRNN, self).__init__() 13 | self.dim_vid = dim_vid #视觉维度 14 | self.dim_hidden = dim_hidden #RNN隐层维度 15 | self.input_dropout_p = input_dropout_p #输出序列的dropout 16 | self.rnn_dropout_p = rnn_dropout_p #隐层之后的dropout 17 | self.n_layers = n_layers #rnn层数 18 | self.bidirectional = bidirectional #是否双向 19 | self.rnn_cell = rnn_cell #rnn种类,有LSTM和GRU可选 20 | 21 | self.vid2hid = nn.Linear(dim_vid, dim_hidden) #从视觉到隐层 22 | self.input_dropout = nn.Dropout(input_dropout_p) #dropout 23 | 24 | if rnn_cell.lower() == 'lstm': 25 | self.rnn_cell = nn.LSTM 26 | elif rnn_cell.lower() == 'gru': 27 | self.rnn_cell = nn.GRU 28 | 29 | self.rnn = self.rnn_cell(dim_hidden, dim_hidden, n_layers, batch_first=True, 30 | bidirectional=bidirectional, dropout=self.rnn_dropout_p) 31 | 32 | self._init_hidden() 33 | 34 | def _init_hidden(self): 35 | nn.init.xavier_normal_(self.vid2hid.weight) 36 | 37 | def forward(self, vid_feats): 38 | """ 39 | Applies a multi-layer RNN to an input sequence. 40 | Args: 41 | input_var (batch, seq_len): tensor containing the features of the input sequence. 
42 | input_lengths (list of int, optional): A list that contains the lengths of sequences 43 | in the mini-batch 44 | Returns: output, hidden 45 | - **output** (batch, seq_len, hidden_size): variable containing the encoded features of the input sequence 46 | - **hidden** (num_layers * num_directions, batch, hidden_size): variable containing the features in the hidden state h 47 | """ 48 | batch_size, seq_len, dim_vid = vid_feats.size() 49 | vid_feats = self.vid2hid(vid_feats.view(-1, dim_vid))#embedding 50 | vid_feats = self.input_dropout(vid_feats) #dropout 51 | vid_feats = vid_feats.view(batch_size, seq_len, self.dim_hidden)#维度变换 52 | self.rnn.flatten_parameters() #优化内存 53 | output, hidden = self.rnn(vid_feats) #输到rnn,得到中间结果 54 | return output, hidden 55 | -------------------------------------------------------------------------------- /S2VT/README.md: -------------------------------------------------------------------------------- 1 | # Sequence to sequence: video to text (S2VT) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读与复现:https://nakaizura.blog.csdn.net/article/details/114760719 6 | 7 | 建议阅读顺序:S2VTModel-->S2VTAttModel-->EncoderRNN-->Attention-->DecoderRNN-->S2VTAttModel 8 | 9 | # 10 | 11 | 原paper:sequence to sequence: video to text, 2015. 12 | 13 | 原code: https://github.com/xiadingZ/video-caption.pytorch 14 | -------------------------------------------------------------------------------- /S2VT/S2VTAttModel.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 14, 2021 3 | @author: nakaizura 4 | ''' 5 | import torch.nn as nn 6 | 7 | #加了att版本的S2VT模型,基本就是调用S2VT的各函数 8 | 9 | class S2VTAttModel(nn.Module): 10 | def __init__(self, encoder, decoder): 11 | """ 12 | 参数: 13 | encoder (nn.Module): Encoder rnn 14 | decoder (nn.Module): Decoder rnn 15 | """ 16 | super(S2VTAttModel, self).__init__() 17 | self.encoder = encoder 18 | self.decoder = decoder 19 | 20 | def forward(self, vid_feats, target_variable=None, 21 | mode='train', opt={}): 22 | """ 23 | Args: 24 | vid_feats (Variable): video feats of shape [batch_size, seq_len, dim_vid] 25 | target_variable (None, optional): groung truth labels 26 | Returns: 27 | seq_prob: Variable of shape [batch_size, max_len-1, vocab_size] 28 | seq_preds: [] or Variable of shape [batch_size, max_len-1] 29 | """ 30 | encoder_outputs, encoder_hidden = self.encoder(vid_feats) 31 | seq_prob, seq_preds = self.decoder(encoder_outputs, encoder_hidden, target_variable, mode, opt) 32 | return seq_prob, seq_preds 33 | -------------------------------------------------------------------------------- /S2VT/S2VTModel.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 14, 2021 3 | @author: nakaizura 4 | ''' 5 | import torch 6 | from torch import nn 7 | import torch.nn.functional as F 8 | import random 9 | from torch.autograd import Variable 10 | 11 | #按照S2VT的结构是2层LSTM,第一层是编码帧序列 12 | #第二层是接受第一层的隐层状态+与0填充之后再编码(很多对应位置没有值,直接pad为0来填充) 13 | 14 | class S2VTModel(nn.Module): 15 | def __init__(self, vocab_size, max_len, dim_hidden, dim_word, dim_vid=2048, sos_id=1, eos_id=0, 16 | n_layers=1, rnn_cell='gru', rnn_dropout_p=0.2): 17 | super(S2VTModel, self).__init__() 18 | #可选择是LSTM或GRU两种 19 | if rnn_cell.lower() == 'lstm': 20 | self.rnn_cell = nn.LSTM 21 | elif rnn_cell.lower() == 'gru': 22 | self.rnn_cell = nn.GRU 23 | self.rnn1 = self.rnn_cell(dim_vid, dim_hidden, n_layers, 24 | batch_first=True, dropout=rnn_dropout_p) 25 | self.rnn2 = self.rnn_cell(dim_hidden + dim_word, 
dim_hidden, n_layers, 26 | batch_first=True, dropout=rnn_dropout_p) 27 | 28 | self.dim_vid = dim_vid #视频维度 29 | self.dim_output = vocab_size #词表大小 30 | self.dim_hidden = dim_hidden #隐层维度 31 | self.dim_word = dim_word #词维度 32 | self.max_length = max_len #最大长度 33 | self.sos_id = sos_id #开始符 34 | self.eos_id = eos_id #结束符 35 | self.embedding = nn.Embedding(self.dim_output, self.dim_word) #编码词表的词,因为是one-hot所以直接是词表大小 36 | 37 | self.out = nn.Linear(self.dim_hidden, self.dim_output) #用于输出的fc 38 | 39 | def forward(self, vid_feats, target_variable=None, 40 | mode='train', opt={}): 41 | batch_size, n_frames, _ = vid_feats.shape #视觉特征维度 42 | #两种pad填充,frame和word 43 | padding_words = Variable(vid_feats.data.new(batch_size, n_frames, self.dim_word)).zero_() 44 | padding_frames = Variable(vid_feats.data.new(batch_size, 1, self.dim_vid)).zero_() 45 | #两种开始状态,frame和word都为none 46 | state1 = None 47 | state2 = None 48 | #self.rnn1.flatten_parameters() 49 | #self.rnn2.flatten_parameters() 50 | output1, state1 = self.rnn1(vid_feats, state1) #第一层LSTM 51 | input2 = torch.cat((output1, padding_words), dim=2) #凭借输出的隐层和0填充的pad 52 | output2, state2 = self.rnn2(input2, state2) #然后输到第二层 53 | 54 | seq_probs = [] 55 | seq_preds = [] 56 | if mode == 'train': #训练模式 57 | for i in range(self.max_length - 1): 58 | # doesn't input to the network 59 | current_words = self.embedding(target_variable[:, i]) #嵌入当前词 60 | #重置参数的数据指针,使内存更contiguous(连续性),利用率高 61 | self.rnn1.flatten_parameters() 62 | self.rnn2.flatten_parameters() 63 | #逐词的输出都要通过pad,过两层LSTM得到结果 64 | output1, state1 = self.rnn1(padding_frames, state1) 65 | input2 = torch.cat( 66 | (output1, current_words.unsqueeze(1)), dim=2) 67 | output2, state2 = self.rnn2(input2, state2) 68 | logits = self.out(output2.squeeze(1)) #预测概率 69 | logits = F.log_softmax(logits, dim=1) 70 | seq_probs.append(logits.unsqueeze(1)) 71 | seq_probs = torch.cat(seq_probs, 1) 72 | 73 | else: #测试模式 74 | current_words = self.embedding( 75 | Variable(torch.LongTensor([self.sos_id] * batch_size)).cuda())#嵌入当前词 76 | for i in range(self.max_length - 1): 77 | #重置参数的数据指针,使内存更contiguous(连续性),利用率高 78 | self.rnn1.flatten_parameters() 79 | self.rnn2.flatten_parameters() 80 | #逐词的输出都要通过pad,过两层LSTM得到结果 81 | output1, state1 = self.rnn1(padding_frames, state1) 82 | input2 = torch.cat( 83 | (output1, current_words.unsqueeze(1)), dim=2) 84 | output2, state2 = self.rnn2(input2, state2) 85 | logits = self.out(output2.squeeze(1)) 86 | logits = F.log_softmax(logits, dim=1) 87 | seq_probs.append(logits.unsqueeze(1)) 88 | _, preds = torch.max(logits, 1) 89 | current_words = self.embedding(preds) #得到结果词 90 | seq_preds.append(preds.unsqueeze(1)) 91 | seq_probs = torch.cat(seq_probs, 1) 92 | seq_preds = torch.cat(seq_preds, 1) 93 | return seq_probs, seq_preds 94 | -------------------------------------------------------------------------------- /SR-GNN/README.md: -------------------------------------------------------------------------------- 1 | # Session-based Recommendation with Graph Neural Networks(SR-GNN) 2 | 逐行源码阅读中文笔记。 3 | 4 | * 建议阅读顺序:utils-->model-->main 5 | 6 | * blog解读:https://blog.csdn.net/qq_39388410/article/details/106413118 7 | 8 | # 9 | 原paper: https://arxiv.org/abs/1811.00855 10 | 11 | 原code: https://github.com/CRIPAC-DIG/SR-GNN 12 | -------------------------------------------------------------------------------- /SR-GNN/main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 22, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import argparse 7 | import 
pickle 8 | import time 9 | from utils import build_graph, Data, split_validation 10 | from model import * 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--dataset', default='sample', help='dataset name: diginetica/yoochoose1_4/yoochoose1_64/sample') 14 | parser.add_argument('--batchSize', type=int, default=100, help='input batch size')#批次大小 15 | parser.add_argument('--hiddenSize', type=int, default=100, help='hidden state size')#隐层size 16 | parser.add_argument('--epoch', type=int, default=30, help='the number of epochs to train for')#训练周期 17 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') # 学习率,[0.001, 0.0005, 0.0001] 18 | parser.add_argument('--lr_dc', type=float, default=0.1, help='learning rate decay rate')#学习率衰减 19 | parser.add_argument('--lr_dc_step', type=int, default=3, help='the number of steps after which the learning rate decay')#衰减步 20 | parser.add_argument('--l2', type=float, default=1e-5, help='l2 penalty') # L2,[0.001, 0.0005, 0.0001, 0.00005, 0.00001] 21 | parser.add_argument('--step', type=int, default=1, help='gnn propogation steps')#GNN的传播步数 22 | parser.add_argument('--patience', type=int, default=10, help='the number of epoch to wait before early stop ')#早停 23 | parser.add_argument('--nonhybrid', action='store_true', help='only use the global preference to predict')#只使用global,没有local 24 | parser.add_argument('--validation', action='store_true', help='validation')#是否设置验证集 25 | parser.add_argument('--valid_portion', type=float, default=0.1, help='split the portion of training set as validation set')#比率 26 | opt = parser.parse_args() 27 | print(opt) 28 | 29 | 30 | def main(): 31 | train_data = pickle.load(open('../datasets/' + opt.dataset + '/train.txt', 'rb'))#载入数据集 32 | if opt.validation: 33 | train_data, valid_data = split_validation(train_data, opt.valid_portion)#切分数据集 34 | test_data = valid_data #此时测试集和验证集一样 35 | else: 36 | test_data = pickle.load(open('../datasets/' + opt.dataset + '/test.txt', 'rb')) 37 | # all_train_seq = pickle.load(open('../datasets/' + opt.dataset + '/all_train_seq.txt', 'rb')) 38 | # g = build_graph(all_train_seq) 39 | train_data = Data(train_data, shuffle=True)#打乱训练集 40 | test_data = Data(test_data, shuffle=False) 41 | # del all_train_seq, g 42 | if opt.dataset == 'diginetica': 43 | n_node = 43098 44 | elif opt.dataset == 'yoochoose1_64' or opt.dataset == 'yoochoose1_4': 45 | n_node = 37484 46 | else: 47 | n_node = 310 48 | 49 | model = trans_to_cuda(SessionGraph(opt, n_node)) #实例化模型 50 | 51 | start = time.time() 52 | best_result = [0, 0] 53 | best_epoch = [0, 0] 54 | bad_counter = 0 55 | for epoch in range(opt.epoch): #开始训练 56 | print('-------------------------------------------------------') 57 | print('epoch: ', epoch) 58 | hit, mrr = train_test(model, train_data, test_data) 59 | flag = 0 60 | if hit >= best_result[0]: #计算hit和mrr 61 | best_result[0] = hit 62 | best_epoch[0] = epoch 63 | flag = 1 64 | if mrr >= best_result[1]: 65 | best_result[1] = mrr 66 | best_epoch[1] = epoch 67 | flag = 1 68 | print('Best Result:') 69 | print('\tRecall@20:\t%.4f\tMMR@20:\t%.4f\tEpoch:\t%d,\t%d'% (best_result[0], best_result[1], best_epoch[0], best_epoch[1])) 70 | bad_counter += 1 - flag 71 | if bad_counter >= opt.patience: #早停 72 | break 73 | print('-------------------------------------------------------') 74 | end = time.time() 75 | print("Run time: %f s" % (end - start)) 76 | 77 | 78 | if __name__ == '__main__': 79 | main() 80 | 
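The hit/mrr values logged above are Recall@20 and MRR@20; the actual computation lives in model.py's train_test (not shown here). A hedged sketch of the metric definitions, assuming per-item prediction scores are available:

```python
import numpy as np

def recall_mrr_at_20(scores, targets):
    # scores: (batch, n_items) predicted scores; targets: (batch,) ground-truth item indices
    top20 = np.argsort(-scores, axis=1)[:, :20]
    hit, mrr = [], []
    for ranked, t in zip(top20, targets):
        pos = np.where(ranked == t)[0]
        hit.append(1.0 if pos.size else 0.0)
        mrr.append(1.0 / (pos[0] + 1) if pos.size else 0.0)
    return 100 * np.mean(hit), 100 * np.mean(mrr)
```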
-------------------------------------------------------------------------------- /SR-GNN/utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 22, 2020 3 | @author: nakaizura 4 | ''' 5 | import networkx as nx 6 | import numpy as np 7 | 8 | #networkx是专门用来存储图,构建图和分析图的库,操作真的超级方便。 9 | 10 | def build_graph(train_data): 11 | #构图 12 | graph = nx.DiGraph()#Digraph是有向图的基类 13 | for seq in train_data: #对于一个session序列 14 | for i in range(len(seq) - 1): 15 | if graph.get_edge_data(seq[i], seq[i + 1]) is None: #遍历相邻的节点 16 | weight = 1 #如果两个节点之间没有边,那么设置为1 17 | else: #如果存在,那么给边权重增加1 18 | weight = graph.get_edge_data(seq[i], seq[i + 1])['weight'] + 1 19 | graph.add_edge(seq[i], seq[i + 1], weight=weight)#添加修改后的边 20 | for node in graph.nodes: #遍历所有的节点 21 | sum = 0 22 | for j, i in graph.in_edges(node): 23 | sum += graph.get_edge_data(j, i)['weight'] 24 | if sum != 0: 25 | for j, i in graph.in_edges(i): 26 | graph.add_edge(j, i, weight=graph.get_edge_data(j, i)['weight'] / sum) 27 | return graph 28 | 29 | 30 | def data_masks(all_usr_pois, item_tail): 31 | #统一user的session的长度为最长的那个,其他的地方补item_tail,但是同时需要mask来标记0 32 | us_lens = [len(upois) for upois in all_usr_pois]#所有的长度 33 | len_max = max(us_lens) #选最大 34 | us_pois = [upois + item_tail * (len_max - le) for upois, le in zip(all_usr_pois, us_lens)] 35 | us_msks = [[1] * le + [0] * (len_max - le) for le in us_lens] 36 | return us_pois, us_msks, len_max 37 | 38 | 39 | def split_validation(train_set, valid_portion): 40 | #切分数据集 41 | train_set_x, train_set_y = train_set 42 | n_samples = len(train_set_x) 43 | sidx = np.arange(n_samples, dtype='int32') #item编号 44 | np.random.shuffle(sidx) #打乱 45 | n_train = int(np.round(n_samples * (1. - valid_portion)))#采样比率 46 | #切分为验证集和训练集 47 | valid_set_x = [train_set_x[s] for s in sidx[n_train:]] 48 | valid_set_y = [train_set_y[s] for s in sidx[n_train:]] 49 | train_set_x = [train_set_x[s] for s in sidx[:n_train]] 50 | train_set_y = [train_set_y[s] for s in sidx[:n_train]] 51 | 52 | return (train_set_x, train_set_y), (valid_set_x, valid_set_y) 53 | 54 | 55 | class Data(): #处理数据,主要是得到batch和建立邻接矩阵 56 | def __init__(self, data, shuffle=False, graph=None): 57 | inputs = data[0] 58 | inputs, mask, len_max = data_masks(inputs, [0]) 59 | self.inputs = np.asarray(inputs) 60 | self.mask = np.asarray(mask) 61 | self.len_max = len_max 62 | self.targets = np.asarray(data[1]) 63 | self.length = len(inputs) 64 | self.shuffle = shuffle 65 | self.graph = graph 66 | 67 | def generate_batch(self, batch_size): 68 | if self.shuffle: 69 | shuffled_arg = np.arange(self.length)#编号 70 | np.random.shuffle(shuffled_arg)#打乱顺序 71 | self.inputs = self.inputs[shuffled_arg]#input 72 | self.mask = self.mask[shuffled_arg]#mask 73 | self.targets = self.targets[shuffled_arg]#target 74 | n_batch = int(self.length / batch_size)#切分batch 75 | if self.length % batch_size != 0: 76 | n_batch += 1 #不能完全除尽的其余部分也要算作一个batch 77 | slices = np.split(np.arange(n_batch * batch_size), n_batch) 78 | slices[-1] = slices[-1][:(self.length - batch_size * (n_batch - 1))] 79 | return slices 80 | 81 | def get_slice(self, i): 82 | inputs, mask, targets = self.inputs[i], self.mask[i], self.targets[i] 83 | items, n_node, A, alias_inputs = [], [], [], [] 84 | for u_input in inputs: 85 | n_node.append(len(np.unique(u_input))) 86 | max_n_node = np.max(n_node)#最大的session的item数目 87 | for u_input in inputs: 88 | node = np.unique(u_input)#unique的item 89 | items.append(node.tolist() + (max_n_node - len(node)) * [0])#不够的补0 90 | u_A = 
np.zeros((max_n_node, max_n_node))#user的邻接矩阵 91 | for i in np.arange(len(u_input) - 1): 92 | if u_input[i + 1] == 0: #为0说明这个session已经结束了 93 | break 94 | u = np.where(node == u_input[i])[0][0] 95 | v = np.where(node == u_input[i + 1])[0][0] 96 | u_A[u][v] = 1 97 | #最终想要的邻接矩阵A是入度和出度A(in)和A(out)矩阵拼接而成的(n, 2n)的矩阵 98 | u_sum_in = np.sum(u_A, 0) #按0维度sum,即入度总数 99 | u_sum_in[np.where(u_sum_in == 0)] = 1 #防止某节点没有入度而除以0 100 | u_A_in = np.divide(u_A, u_sum_in) #按入度归一化 101 | u_sum_out = np.sum(u_A, 1) #同理按1sum,算一下出度 102 | u_sum_out[np.where(u_sum_out == 0)] = 1 103 | u_A_out = np.divide(u_A.transpose(), u_sum_out) #需要转置一下 104 | u_A = np.concatenate([u_A_in, u_A_out]).transpose() #最后拼接两者 105 | A.append(u_A) 106 | alias_inputs.append([np.where(node == i)[0][0] for i in u_input]) 107 | return alias_inputs, A, items, mask, targets 108 |
-------------------------------------------------------------------------------- /Skip-Thought Vectors/Evaluate.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 30, 2020 3 | @author: nakaizura 4 | ''' 5 | from model import UniSkip, Encoder 6 | from data_loader import DataLoader 7 | from vocab import load_dictionary 8 | from config import * 9 | from torch import nn 10 | 11 | from torch.autograd import Variable 12 | import torch 13 | import numpy as np #下面的np.array会用到 14 | class UsableEncoder: 15 | 16 | def __init__(self, loc="./saved_models/skip-best"):#导入之前训练得到的最好模型 17 | print("Preparing the DataLoader. Loading the word dictionary") 18 | self.d = DataLoader(sentences=[''], word_dict=load_dictionary('./data/dummy_corpus.txt.pkl')) 19 | self.encoder = None 20 | 21 | print("Loading encoder from the saved model at {}".format(loc)) 22 | model = UniSkip()#载入模型 23 | model.load_state_dict(torch.load(loc, map_location=lambda storage, loc: storage)) 24 | self.encoder = model.encoder 25 | if USE_CUDA: 26 | self.encoder.cuda(CUDA_DEVICE)#gpu 27 | 28 | def encode(self, text): 29 | def chunks(l, n): 30 | """Yield successive n-sized chunks from l.""" 31 | for i in range(0, len(l), n): 32 | yield l[i:i + n] 33 | 34 | ret = [] 35 | 36 | for chunk in chunks(text, 100):#每次取100个句子来编码 37 | print("encoding chunk of size {}".format(len(chunk))) 38 | indices = [self.d.convert_sentence_to_indices(sentence) for sentence in chunk] 39 | indices = torch.stack(indices) 40 | indices, _ = self.encoder(indices)#编码 41 | indices = indices.view(-1, self.encoder.thought_size) 42 | indices = indices.data.cpu().numpy() 43 | 44 | ret.extend(indices) 45 | ret = np.array(ret) 46 | 47 | return ret 48 | 49 | usable_encoder = UsableEncoder() 50 | 51 | from tasks.eval_classification import * 52 | #载入数据进行模型评估 53 | eval_nested_kfold(usable_encoder, "MR", loc='./tasks/mr_data/', k=3, seed=1234, use_nb=False) 54 |
-------------------------------------------------------------------------------- /Skip-Thought Vectors/README.md: -------------------------------------------------------------------------------- 1 | # Skip-Thought Vectors 2 | 3 | 源码笔记:思路和word2vec的skip类似,一个句子预测它的前一个句子和后一个句子,句子之间用lstm编码和解码就行了。 4 | 5 | 建议阅读顺序:data_loader-->vocab-->model-->train-->evaluate 6 | 7 | # 8 | 9 | 原paper: 10 | ``` 11 | @article{kiros2015skip, 12 | title={Skip-Thought Vectors}, 13 | author={Kiros, Ryan and Zhu, Yukun and Salakhutdinov, Ruslan and Zemel, Richard S and Torralba, Antonio and Urtasun, Raquel and Fidler, Sanja}, 14 | journal={arXiv preprint arXiv:1506.06726}, 15 | year={2015} 16 | } 17 | ``` 18 | 19 | 原code: https://github.com/ryankiros/skip-thoughts 20 | 21 | # 22 |
原code是Tensorflow,按照作者的教程很容易训练,这是Pytorch版的实现: 23 | 24 | http://sanyam5.github.io/my-thoughts-on-skip-thoughts/ 25 | -------------------------------------------------------------------------------- /Skip-Thought Vectors/data_loader.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 30, 2020 3 | @author: nakaizura 4 | ''' 5 | 6 | import torch 7 | from torch.autograd import Variable 8 | from vocab import * 9 | from config import * 10 | import numpy as np 11 | import random 12 | 13 | np.random.seed(0)#可复现随机种子 14 | 15 | #载入数据 16 | 17 | class DataLoader: 18 | EOS = 0 #end of sentence,句子的结尾 19 | UNK = 1 #unknown token,未知的词 20 | 21 | maxlen = MAXLEN #最大长度 22 | 23 | def __init__(self, text_file=None, sentences=None, word_dict=None): 24 | 25 | if text_file:#读句子文件生成单词字典 26 | print("Loading text file at {}".format(text_file)) 27 | with open(text_file, "rt") as f: 28 | sentences = f.readlines() 29 | print("Making dictionary for these words") 30 | word_dict = build_and_save_dictionary(sentences, source=text_file)#单词字典 31 | 32 | assert sentences and word_dict, "Please provide the file to extract from or give sentences and word_dict" 33 | 34 | self.sentences = sentences 35 | self.word_dict = word_dict 36 | print("Making reverse dictionary") 37 | self.revmap = list(self.word_dict.items()) 38 | 39 | self.lengths = [len(sent) for sent in self.sentences] 40 | 41 | def convert_sentence_to_indices(self, sentence): 42 | #句子编码 43 | indices = [ 44 | #编码int给每个词,如果词太稀疏了就设置为UNK(unknown token) 45 | self.word_dict.get(w) if self.word_dict.get(w, VOCAB_SIZE + 1) < VOCAB_SIZE else self.UNK 46 | 47 | for w in sentence.split() #用空格分出所有词,然后得到词的数量 48 | ][: self.maxlen - 1] #最多maxlen的长度 49 | 50 | #最后一个单词设置为EOS 51 | indices += [self.EOS] * (self.maxlen - len(indices)) 52 | 53 | indices = np.array(indices) 54 | indices = Variable(torch.from_numpy(indices)) 55 | if USE_CUDA: 56 | indices = indices.cuda(CUDA_DEVICE) #gpu 57 | 58 | return indices 59 | 60 | def convert_indices_to_sentences(self, indices): 61 | #由编码到句子 62 | def convert_index_to_word(idx): 63 | 64 | idx = idx.data[0] 65 | if idx == 0:#根据idx返回对应的词 66 | return "EOS" 67 | elif idx == 1: 68 | return "UNK" 69 | 70 | search_idx = idx - 2 #然后按revmap出word 71 | if search_idx >= len(self.revmap): 72 | return "NA" 73 | 74 | word, idx_ = self.revmap[search_idx] 75 | 76 | assert idx_ == idx 77 | return word 78 | 79 | words = [convert_index_to_word(idx) for idx in indices] 80 | 81 | return " ".join(words)#用空格拼接所有的词成句子 82 | 83 | def fetch_batch(self, batch_size): 84 | #得到batch。先随机选开头 85 | first_index = random.randint(0, len(self.sentences) - batch_size) 86 | batch = [] 87 | lengths = [] 88 | 89 | for i in range(first_index, first_index + batch_size):#再挑够数目 90 | sent = self.sentences[i]#一整个句子 91 | ind = self.convert_sentence_to_indices(sent) 92 | batch.append(ind) 93 | lengths.append(min(len(sent.split()), MAXLEN)) 94 | 95 | batch = torch.stack(batch)#shape一下变堆叠 96 | lengths = np.array(lengths) 97 | 98 | return batch, lengths 99 | -------------------------------------------------------------------------------- /Skip-Thought Vectors/train.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 30, 2020 3 | @author: nakaizura 4 | ''' 5 | import torch 6 | from torch import nn 7 | from torch.autograd import Variable 8 | 9 | from data_loader import DataLoader 10 | from model import UniSkip 11 | from config import * 12 | from datetime import datetime, timedelta 13 | 14 | 15 
| #载入数据 16 | d = DataLoader("./data/dummy_corpus.txt") 17 | mod = UniSkip() 18 | if USE_CUDA:#gpu 19 | mod.cuda(CUDA_DEVICE) 20 | 21 | #定义优化器 22 | lr = 3e-4 23 | optimizer = torch.optim.Adam(params=mod.parameters(), lr=lr) 24 | 25 | loss_trail = [] #记录loss 26 | last_best_loss = None 27 | current_time = datetime.utcnow()#获取时间对象 28 | 29 | #定期打印结果用于debug 30 | def debug(i, loss, prev, nex, prev_pred, next_pred): 31 | global loss_trail 32 | global last_best_loss 33 | global current_time 34 | 35 | this_loss = loss.data[0] 36 | loss_trail.append(this_loss) 37 | loss_trail = loss_trail[-20:]#最后20次 38 | new_current_time = datetime.utcnow() 39 | time_elapsed = str(new_current_time - current_time)#计算时间 40 | current_time = new_current_time 41 | print("Iteration {}: time = {} last_best_loss = {}, this_loss = {}".format( 42 | i, time_elapsed, last_best_loss, this_loss)) 43 | 44 | print("prev = {}\nnext = {}\npred_prev = {}\npred_next = {}".format( 45 | d.convert_indices_to_sentences(prev), 46 | d.convert_indices_to_sentences(nex), 47 | d.convert_indices_to_sentences(prev_pred), 48 | d.convert_indices_to_sentences(next_pred), 49 | ))#把结果变成句子打印出来方便看 50 | 51 | try: 52 | trail_loss = sum(loss_trail)/len(loss_trail) 53 | if last_best_loss is None or last_best_loss > trail_loss: 54 | print("Loss improved from {} to {}".format(last_best_loss, trail_loss)) 55 | #存模型 56 | save_loc = "./saved_models/skip-best".format(lr, VOCAB_SIZE) 57 | print("saving model at {}".format(save_loc)) 58 | torch.save(mod.state_dict(), save_loc) 59 | 60 | last_best_loss = trail_loss 61 | except Exception as e: 62 | print("Couldn't save model because {}".format(e)) 63 | 64 | print("Starting training...") 65 | 66 | # a million iterations 67 | for i in range(0, 1000000): 68 | sentences, lengths = d.fetch_batch(32 * 8)#生成batch 69 | #得到预测 70 | loss, prev, nex, prev_pred, next_pred = mod(sentences, lengths) 71 | 72 | 73 | if i % 10 == 0:#定期debug 74 | debug(i, loss, prev, nex, prev_pred, next_pred) 75 | 76 | optimizer.zero_grad()#梯度清零 77 | loss.backward()#反向传播 78 | optimizer.step()#参数更新 79 |
-------------------------------------------------------------------------------- /Skip-Thought Vectors/vocab.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 30, 2020 3 | @author: nakaizura 4 | ''' 5 | import _pickle as pkl 6 | from collections import OrderedDict 7 | import argparse 8 | 9 | 10 | def build_dictionary(text): 11 | """建立一个词典 12 | Build a dictionary 13 | text: list of sentences (pre-tokenized) 14 | """ 15 | wordcount = {} 16 | for cc in text: 17 | words = cc.split() 18 | for w in words: 19 | if w not in wordcount:#不在词典就新建 20 | wordcount[w] = 0 21 | wordcount[w] += 1#每出现一次就加1计数 22 | 23 | #按词出现的频率排序 24 | sorted_words = sorted(list(wordcount.keys()), key=lambda x: wordcount[x], reverse=True) 25 | 26 | worddict = OrderedDict() 27 | for idx, word in enumerate(sorted_words):#必须加2,因为0和1这两个id已经被预先定义了。 28 | worddict[word] = idx+2 # 0: 句子结尾, 1: 未知词 29 | 30 | return worddict, wordcount 31 | 32 | 33 | def load_dictionary(loc='./data/book_dictionary_large.pkl'): 34 | """载入字典 35 | Load a dictionary 36 | """ 37 | with open(loc, 'rb') as f: 38 | worddict = pkl.load(f) 39 | return worddict 40 | 41 | 42 | def save_dictionary(worddict, wordcount, loc='./data/book_dictionary_large.pkl'): 43 | """保存字典 44 | Save a dictionary to the specified location 45 | """ 46 | with open(loc, 'wb') as f: 47 | pkl.dump(worddict, f) 48 | pkl.dump(wordcount, f) 49 | 50 | 51 | def build_and_save_dictionary(text, source): 52 |
save_loc = source+".pkl" 53 | try: 54 | cached = load_dictionary(save_loc) 55 | print("Using cached dictionary at {}".format(save_loc)) 56 | return cached 57 | except: 58 | pass 59 | # build again and save 60 | print("unable to load from cached, building fresh") 61 | worddict, wordcount = build_dictionary(text) 62 | print("Got {} unique words".format(len(worddict))) 63 | print("Saveing dictionary at {}".format(save_loc)) 64 | save_dictionary(worddict, wordcount, save_loc) 65 | return worddict 66 | 67 | 68 | if __name__ == "__main__": 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument("text_file", type=str) 71 | args = parser.parse_args() 72 | 73 | print("Extracting text from {}".format(args.text_file)) 74 | text = open(args.text_file, "rt").readlines() 75 | print("Extracting dictionary..") 76 | worddict, wordcount = build_dictionary(text) 77 | 78 | out_file = args.text_file+".pkl" 79 | print("Got {} unique words. Saving to file {}".format(len(worddict), out_file)) 80 | save_dictionary(worddict, wordcount, out_file) 81 | print("Done.") 82 | -------------------------------------------------------------------------------- /SlowFast/README.md: -------------------------------------------------------------------------------- 1 | # SlowFast Networks for Video Recognition(SlowFast) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读:https://blog.csdn.net/qq_39388410/article/details/104951012 6 | 7 | # 8 | 9 | 原paper: SlowFast Networks for Video Recognition 10 | 11 | 原code: https://github.com/facebookresearch/SlowFast 12 | 13 | # 14 | 15 | keras版本:https://github.com/xuzheyuan624/slowfast-keras 16 | 17 | pytorch版本:https://github.com/Sense-X/X-Temporal 18 | -------------------------------------------------------------------------------- /Transformer/README.md: -------------------------------------------------------------------------------- 1 | # Attention is All You Need(Transformer) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | blog解读与复现:https://blog.csdn.net/qq_39388410/article/details/102081253 6 | 7 | 建议阅读顺序:Transformer.py-->nn.Transformer.py 8 | 9 | # 10 | 11 | 原paper: 12 | ``` 13 | @inproceedings{opennmt, 14 | author = {Guillaume Klein and 15 | Yoon Kim and 16 | Yuntian Deng and 17 | Jean Senellart and 18 | Alexander M. Rush}, 19 | title = {OpenNMT: Open-Source Toolkit for Neural Machine Translation}, 20 | booktitle = {Proc. ACL}, 21 | year = {2017}, 22 | url = {https://doi.org/10.18653/v1/P17-4012}, 23 | doi = {10.18653/v1/P17-4012} 24 | } 25 | ``` 26 | 27 | 原code(harvardnlp的pytorch版): http://nlp.seas.harvard.edu/2018/04/03/attention.html 28 | 29 | # 30 | 31 | nn.Transformer已经已经出框架了... 32 | 33 | 所以nn.Transformer.py这个是pytorch官方实例的代码,搭起来很方便。 34 | -------------------------------------------------------------------------------- /UIE/README.md: -------------------------------------------------------------------------------- 1 | # Unified Structure Generation for Universal Information Extraction (UIE) 2 | * 逐行源码阅读中文笔记。 3 | * paddle转pytorch过程。 4 | 5 | ### 逐行源码阅读中文笔记 6 | 模型结构上UIE并没有特别的地方,论文中UIE使用T5做backbone,再百度开源的模型中使用ERNIE3.0。 7 | 8 | 关键在于使用prompt改装各个抽取任务,做预训练。详细细节请参考原文。 9 | 10 | 整份代码最重要的是以下: 11 | 12 | ```[CLS]+ prompt + [SEP] + Content + [SEP]``` 13 | 14 | 改写完成后输入到T5/ERNIE3.0即可。 15 | 16 | ### paddle转pytorch 17 | 由于百度开源的模型是基于paddlepaddle的,所以博主自己写了个程序来转换。 18 | 19 | 见convert.py 20 | 21 | 有任何问题,欢迎私聊博主:https://nakaizura.blog.csdn.net/ 22 | 23 | 24 | 25 | # 26 | 原paper: 27 | Title: 28 | Author:Yaojie Lu, Qing Liu, Dai Dai, Xinyan Xiao, Hongyu Lin, Xianpei Han, Le Sun, Hua Wu. 
Unified Structure Generation for Universal Information Extraction. ACL 2022. 29 | 30 | 31 | 原Demo: 32 | https://universal-ie.github.io/ 33 | -------------------------------------------------------------------------------- /UIE/model.py: -------------------------------------------------------------------------------- 1 | # nakaizuta 25.AUG.2022 2 | 3 | import paddle 4 | import paddle.nn as nn 5 | from paddlenlp.transformers import ErniePretrainedModel 6 | 7 | #这是UIE的模型文件,较为简单是大模型跟两个mlp来预测span。 8 | 9 | class UIE(ErniePretrainedModel): 10 | def __init__(self, encoding_model): 11 | super(UIE, self).__init__() 12 | self.encoder = encoding_model #此处制定模型为ERNIE或T5都可以 13 | hidden_size = self.encoder.config["hidden_size"] 14 | self.linear_start = paddle.nn.Linear(hidden_size, 1) #预测抽取词的开头位置 15 | self.linear_end = paddle.nn.Linear(hidden_size, 1) #预测抽取词的结尾位置 16 | self.sigmoid = nn.Sigmoid() 17 | 18 | def forward(self, input_ids, token_type_ids, pos_ids, att_mask): 19 | sequence_output, pooled_output = self.encoder( 20 | input_ids=input_ids, 21 | token_type_ids=token_type_ids, #用于标记是prompt还是sentence,其他参数同bert一致 22 | position_ids=pos_ids, 23 | attention_mask=att_mask) 24 | start_logits = self.linear_start(sequence_output) #模型的结果通过linear预测位置概率 25 | start_logits = paddle.squeeze(start_logits, -1) 26 | start_prob = self.sigmoid(start_logits) 27 | end_logits = self.linear_end(sequence_output) #预测结尾概率 28 | end_logits = paddle.squeeze(end_logits, -1) 29 | end_prob = self.sigmoid(end_logits) 30 | return start_prob, end_prob #返回位置概率 31 | 32 | 33 | -------------------------------------------------------------------------------- /VMT/README.md: -------------------------------------------------------------------------------- 1 | # Video-guided Machine Translation(VMT) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | 建议阅读顺序:train--utils--dataloader--model 6 | 7 | blog解读:https://nakaizura.blog.csdn.net/article/details/114760719 8 | 9 | # 10 | 11 | 原paper: VATEX: A Large-Scale, High-Quality Multilingual Dataset for Video-and-Language Research 12 | 13 | 原code: https://github.com/eric-xw/Video-guided-Machine-Translation 14 | -------------------------------------------------------------------------------- /VMT/dataloader.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 23, 2021 3 | @author: nakaizura 4 | ''' 5 | 6 | import json 7 | import numpy as np 8 | import os 9 | 10 | import torch 11 | from torch.utils.data import Dataset, DataLoader 12 | 13 | 14 | # 载入视频和句子数据 15 | 16 | 17 | def load_video_features(fpath, max_length): 18 | feats = np.load(fpath, encoding='latin1')[0] # encoding='latin1' to handle the inconsistency between python 2 and 3 19 | # 载入视频特征。帧数少了要补0,多了要采样 20 | if feats.shape[0] < max_length: 21 | dis = max_length - feats.shape[0] 22 | feats = np.lib.pad(feats, ((0, dis), (0, 0)), 'constant', constant_values=0) 23 | elif feats.shape[0] > max_length: 24 | inds = sorted(random.sample(range(feats.shape[0]), max_length)) 25 | feats = feats[inds] 26 | assert feats.shape[0] == max_length #保证一致 27 | return np.float32(feats) 28 | 29 | class vatex_dataset(Dataset): 30 | #载入文本的特征 31 | def __init__(self, data_dir, file_path, img_dir, split_type, tokenizers, max_vid_len, pair): 32 | src, tgt = pair 33 | maps = {'en':'enCap', 'zh':'chCap'} 34 | self.data_dir = data_dir 35 | self.img_dir = img_dir 36 | # tokenizer类,在utils.py中 37 | self.tok_src, self.tok_tgt = tokenizers 38 | self.max_vid_len = max_vid_len 39 | self.split_type = split_type 40 | 41 | with 
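拿到 forward 返回的 start_prob / end_prob 之后,还需要把逐 token 的概率解码成具体的 span。下面是一个简化的贪心解码示意(非原仓库实现,阈值 0.5 为假设值,实际以官方后处理为准):

```python
import numpy as np

def decode_spans(start_prob, end_prob, threshold=0.5):
    """start_prob / end_prob:单条样本的逐token概率(一维)。
    返回 [(start, end), ...]:对每个超过阈值的开始位置,取其后第一个超过阈值的结束位置。"""
    start_prob, end_prob = np.asarray(start_prob), np.asarray(end_prob)
    starts = np.where(start_prob > threshold)[0]
    ends = np.where(end_prob > threshold)[0]
    spans = []
    for s in starts:
        candidates = ends[ends >= s]      # 结束位置不能在开始位置之前
        if len(candidates) > 0:
            spans.append((int(s), int(candidates[0])))
    return spans

# 例:第3~5个token是要抽取的片段
print(decode_spans([0, 0, 0, 0.9, 0, 0], [0, 0, 0, 0, 0, 0.8]))  # [(3, 5)]
```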
-------------------------------------------------------------------------------- /VMT/README.md: -------------------------------------------------------------------------------- 1 | # Video-guided Machine Translation(VMT) 2 | 3 | 逐行源码阅读中文笔记。 4 | 5 | 建议阅读顺序:train--utils--dataloader--model 6 | 7 | blog解读:https://nakaizura.blog.csdn.net/article/details/114760719 8 | 9 | # 10 | 11 | 原paper: VATEX: A Large-Scale, High-Quality Multilingual Dataset for Video-and-Language Research 12 | 13 | 原code: https://github.com/eric-xw/Video-guided-Machine-Translation 14 |
-------------------------------------------------------------------------------- /VMT/dataloader.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 23, 2021 3 | @author: nakaizura 4 | ''' 5 | 6 | import json 7 | import numpy as np 8 | import os 9 | import random #下面采样帧时的random.sample会用到 10 | import torch 11 | from torch.utils.data import Dataset, DataLoader 12 | 13 | 14 | # 载入视频和句子数据 15 | 16 | 17 | def load_video_features(fpath, max_length): 18 | feats = np.load(fpath, encoding='latin1')[0] # encoding='latin1' to handle the inconsistency between python 2 and 3 19 | # 载入视频特征。帧数少了要补0,多了要采样 20 | if feats.shape[0] < max_length: 21 | dis = max_length - feats.shape[0] 22 | feats = np.lib.pad(feats, ((0, dis), (0, 0)), 'constant', constant_values=0) 23 | elif feats.shape[0] > max_length: 24 | inds = sorted(random.sample(range(feats.shape[0]), max_length)) 25 | feats = feats[inds] 26 | assert feats.shape[0] == max_length #保证一致 27 | return np.float32(feats) 28 | 29 | class vatex_dataset(Dataset): 30 | #载入文本的特征 31 | def __init__(self, data_dir, file_path, img_dir, split_type, tokenizers, max_vid_len, pair): 32 | src, tgt = pair 33 | maps = {'en':'enCap', 'zh':'chCap'} 34 | self.data_dir = data_dir 35 | self.img_dir = img_dir 36 | # tokenizer类,在utils.py中 37 | self.tok_src, self.tok_tgt = tokenizers 38 | self.max_vid_len = max_vid_len 39 | self.split_type = split_type 40 | 41 | with open(self.data_dir+file_path, 'r') as file: #打开数据文件 42 | data = json.load(file) 43 | self.srccaps, self.tgtcaps = [], [] 44 | self.sent_ids = [] 45 | for d in data: 46 | srccap = d[maps[src]][5:] #取每个视频的后5条caption 47 | self.srccaps.extend(srccap) 48 | sent_id = [''.join((d['videoID'],'&',str(i))) for i in range(len(srccap))] #句子的id 49 | self.sent_ids.extend(sent_id) 50 | if split_type != 'test': 51 | tgtcap = d[maps[tgt]][5:] 52 | self.tgtcaps.extend(tgtcap) 53 | 54 | def __len__(self): 55 | return len(self.srccaps) 56 | 57 | def __getitem__(self, idx): 58 | str_srccap, sent_id = self.srccaps[idx], self.sent_ids[idx] 59 | vid = sent_id[:-2] 60 | srccap, caplen_src = self.tok_src.encode_sentence(str_srccap) #句子变id 61 | srcref = self.tok_src.encode_sentence_nopad_2str(str_srccap) 62 | img = load_video_features(os.path.join(self.data_dir,'vatex_features/',self.img_dir,vid+'.npy'), self.max_vid_len) 63 | if self.split_type != 'test': #会要计算loss,所以会多一些参数 64 | str_tgtcap = self.tgtcaps[idx] 65 | tgtcap, caplen_tgt = self.tok_tgt.encode_sentence(str_tgtcap) 66 | tgtref = self.tok_tgt.encode_sentence_nopad_2str(str_tgtcap) 67 | return srccap, tgtcap, img, caplen_src, caplen_tgt, srcref, tgtref 68 | else: 69 | return srccap, img, caplen_src, sent_id 70 | 71 | def get_loader(data_dir, tokenizers, split_type, batch_size, max_vid_len, pair, num_workers, pin_memory): 72 | maps = {'train':['vatex_training_v1.0.json', 'trainval'], 'val': ['vatex_validation_v1.0.json', 'trainval'], 73 | 'test': ['vatex_public_test_english_v1.1.json', 'public_test']} 74 | file_path, img_dir = maps[split_type] 75 | mydata = vatex_dataset(data_dir, file_path, img_dir, split_type, tokenizers, max_vid_len, pair) 76 | if split_type in ['train']: #打乱训练 77 | shuffle = True 78 | elif split_type in ['val', 'test']: 79 | shuffle = False 80 | #载入loader 81 | myloader = DataLoader(dataset=mydata, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, pin_memory=pin_memory) 82 | return myloader 83 | 84 | def create_split_loaders(data_dir, tokenizers, batch_size, max_vid_len, pair, num_workers=0, pin_memory=False): 85 | #分别载入三者 86 | train_loader = get_loader(data_dir, tokenizers, 'train', batch_size, max_vid_len, pair, num_workers, pin_memory) 87 | val_loader = get_loader(data_dir, tokenizers, 'val', batch_size, max_vid_len, pair, num_workers, pin_memory) 88 | test_loader = get_loader(data_dir, tokenizers, 'test', batch_size, max_vid_len, pair, num_workers, pin_memory) 89 | # test_loader = [0] 90 | 91 | return train_loader, val_loader, test_loader 92 |
-------------------------------------------------------------------------------- /extractive_summarization/README.md: -------------------------------------------------------------------------------- 1 | # extractive_summarization 2 | 这个中文文本摘要工具特别好用,为大家介绍一下。 3 | 4 | 主要基于规则做抽取式摘要, 5 | * 词权重和句权重。1)tfidf词权重、词性权重(如虚词打压);2)LDA主题权重;3)标题加权。 6 | * 摘要句选择。1)MMR:综合分数排序,语义差异最大化,直到满足限制条件(字数达到或分数过低)。 7 | 8 | 十分简单好用。 9 | 10 | # 11 | 原code地址:https://github.com/dongrixinyu/extractive_summary 12 |
-------------------------------------------------------------------------------- /node2vec/README.md: -------------------------------------------------------------------------------- 1 | # node2vec: Scalable Feature Learning for Networks(node2vec) 2 | 逐行源码阅读中文笔记。 3 | 4 | blog解读:https://blog.csdn.net/qq_39388410/article/details/103859078 5 | 6 | # 7 | 8 | 原paper: Grover A, Leskovec J.
node2vec: Scalable feature learning for networks (KDD 2016) 9 | 10 | 浅梦大佬即插即玩的开源:https://github.com/shenweichen/GraphEmbedding/ 11 | -------------------------------------------------------------------------------- /node2vec/classify.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 15, 2020 3 | @author: nakaizura 4 | ''' 5 | from __future__ import print_function 6 | 7 | 8 | import numpy 9 | from sklearn.metrics import f1_score, accuracy_score 10 | from sklearn.multiclass import OneVsRestClassifier 11 | from sklearn.preprocessing import MultiLabelBinarizer 12 | 13 | # 分类器是使用sklearn的OneVsRestClassifier处理多分类任务。 14 | # 对n类,会建立n个二分类器,每个分类器针对其中一个类别和剩余类别进行分类。 15 | 16 | class TopKRanker(OneVsRestClassifier): 17 | #注意这里OneVsRestClassifier 18 | def predict(self, X, top_k_list): 19 | #预测分类概率 20 | probs = numpy.asarray(super(TopKRanker, self).predict_proba(X)) 21 | all_labels = [] 22 | for i, k in enumerate(top_k_list): #对所有Y选择概率最大的类别 23 | probs_ = probs[i, :] 24 | labels = self.classes_[probs_.argsort()[-k:]].tolist()#排序得到label 25 | probs_[:] = 0 #one-hot操作,只有label处为1,其他地方都为0 26 | probs_[labels] = 1 27 | all_labels.append(probs_) 28 | return numpy.asarray(all_labels) 29 | 30 | 31 | class Classifier(object): 32 | 33 | def __init__(self, embeddings, clf): 34 | self.embeddings = embeddings 35 | self.clf = TopKRanker(clf) 36 | self.binarizer = MultiLabelBinarizer(sparse_output=True) 37 | 38 | def train(self, X, Y, Y_all): 39 | self.binarizer.fit(Y_all) 40 | X_train = [self.embeddings[x] for x in X] 41 | Y = self.binarizer.transform(Y) #多标签二值化 42 | self.clf.fit(X_train, Y) #训练分类器 43 | 44 | def evaluate(self, X, Y): 45 | top_k_list = [len(l) for l in Y] 46 | Y_ = self.predict(X, top_k_list)#预测一个类别 47 | Y = self.binarizer.transform(Y) #多标签二值化 48 | averages = ["micro", "macro", "samples", "weighted"] 49 | results = {} 50 | for average in averages: #算F1 51 | results[average] = f1_score(Y, Y_, average=average) 52 | results['acc'] = accuracy_score(Y,Y_) 53 | print('-------------------') 54 | print(results) 55 | return results 56 | print('-------------------') 57 | 58 | def predict(self, X, top_k_list): 59 | X_ = numpy.asarray([self.embeddings[x] for x in X]) 60 | Y = self.clf.predict(X_, top_k_list=top_k_list) 61 | return Y 62 | 63 | def split_train_evaluate(self, X, Y, train_precent, seed=0): 64 | #设定状态,记录下数组被打乱的操作,以使打乱前后实例与标签的一一对应 65 | state = numpy.random.get_state() 66 | 67 | training_size = int(train_precent * len(X)) 68 | numpy.random.seed(seed) #固定随机种子便于复现结果 69 | shuffle_indices = numpy.random.permutation(numpy.arange(len(X))) 70 | X_train = [X[shuffle_indices[i]] for i in range(training_size)] 71 | Y_train = [Y[shuffle_indices[i]] for i in range(training_size)] 72 | #前80训练,后20测试 73 | X_test = [X[shuffle_indices[i]] for i in range(training_size, len(X))] 74 | Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))] 75 | 76 | self.train(X_train, Y_train, Y) 77 | numpy.random.set_state(state)#恢复打乱前的状态 78 | return self.evaluate(X_test, Y_test) 79 | 80 | 81 | def read_node_label(filename, skip_head=False): 82 | fin = open(filename, 'r') 83 | X = [] 84 | Y = [] 85 | while 1: 86 | if skip_head: 87 | fin.readline() 88 | l = fin.readline() 89 | if l == '': 90 | break 91 | vec = l.strip().split(' ') 92 | X.append(vec[0]) 93 | Y.append(vec[1:]) 94 | fin.close() 95 | return X, Y 96 | -------------------------------------------------------------------------------- /node2vec/node2vec.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 15, 2020 3 | @author: nakaizura 4 | ''' 5 | from ..walker import RandomWalker 6 | from gensim.models import Word2Vec 7 | import pandas as pd 8 | 9 | #Node2vec 可以看作是对 DeepWalk 的广义抽象,主要是改进DeepWalk的随机游走策略。 10 | #逻辑也为先随机游走得到一个“句子”(P和Q控制),然后直接拿句子,gensim训练向量。 11 | 12 | #参数Q控制选择其他的新顶点的概率,偏广度优先,重视局部,即节点重要性 13 | #参数P控制返回原来顶点的概率,偏深度优先,重视全局,即群体重要性。 14 | 15 | class Node2Vec: 16 | 17 | def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, workers=1, use_rejection_sampling=0): 18 | 19 | self.graph = graph 20 | self._embeddings = {} 21 | #由p,q控制的游走 22 | self.walker = RandomWalker( 23 | graph, p=p, q=q, use_rejection_sampling=use_rejection_sampling) 24 | 25 | print("Preprocess transition probs...") 26 | self.walker.preprocess_transition_probs() 27 | 28 | self.sentences = self.walker.simulate_walks( 29 | num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1) 30 | 31 | def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs): 32 | #设定一些关于gensim的参数 33 | kwargs["sentences"] = self.sentences 34 | kwargs["min_count"] = kwargs.get("min_count", 0) #词频阈值,这里句子量很少设为0 35 | kwargs["size"] = embed_size #最后得到128维的节点向量 36 | kwargs["sg"] = 1 # skip gram的模式来训练 37 | kwargs["hs"] = 0 # node2vec not use Hierarchical Softmax 38 | kwargs["workers"] = workers 39 | kwargs["window"] = window_size 40 | kwargs["iter"] = iter 41 | 42 | print("Learning embedding vectors...") 43 | model = Word2Vec(**kwargs) #直接用gensim的模型 44 | print("Learning embedding vectors done!") 45 | 46 | self.w2v_model = model 47 | 48 | return model 49 | 50 | def get_embeddings(self,): 51 | #得到训练好后的向量 52 | if self.w2v_model is None: 53 | print("model not train") 54 | return {} 55 | 56 | self._embeddings = {} 57 | for word in self.graph.nodes():#建立一个所有节点的向量索引表 58 | self._embeddings[word] = self.w2v_model.wv[word] 59 | 60 | return self._embeddings 61 |
-------------------------------------------------------------------------------- /node2vec/node2vec_wiki.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 15, 2020 3 | @author: nakaizura 4 | ''' 5 | import numpy as np 6 | 7 | from ge.classify import read_node_label, Classifier 8 | from ge import Node2Vec #下面实例化的是Node2Vec模型 9 | from sklearn.linear_model import LogisticRegression 10 | 11 | import matplotlib.pyplot as plt 12 | import networkx as nx 13 | from sklearn.manifold import TSNE 14 | 15 | #networkx是专门用来存储图,构建图和分析图的库,操作真的超级方便。 16 | 17 | def evaluate_embeddings(embeddings): 18 | #读入真实的分类label 19 | X, Y = read_node_label('../data/wiki/wiki_labels.txt') 20 | tr_frac = 0.8 #80%的节点用于训练分类器,其余的用于测试 21 | print("Training classifier using {:.2f}% nodes...".format( 22 | tr_frac * 100)) 23 | #应用分类器对节点进行分类以评估向量的质量 24 | clf = Classifier(embeddings=embeddings, clf=LogisticRegression()) 25 | clf.split_train_evaluate(X, Y, tr_frac) 26 | 27 | 28 | def plot_embeddings(embeddings,): 29 | X, Y = read_node_label('../data/wiki/wiki_labels.txt') 30 | 31 | emb_list = [] 32 | for k in X: 33 | emb_list.append(embeddings[k]) 34 | emb_list = np.array(emb_list) 35 | 36 | model = TSNE(n_components=2)#用TSNE进行降维 37 | node_pos = model.fit_transform(emb_list) 38 | 39 | color_idx = {} 40 | for i in range(len(X)): 41 | color_idx.setdefault(Y[i][0], []) #类别 42 | color_idx[Y[i][0]].append(i) #id 43 | 44 | for c, idx in color_idx.items(): #不同类别不同颜色 45 | plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c) 46 | plt.legend() 47 | plt.show() 48 | 49 | 50 |
if __name__ == "__main__": 51 | #读入边列表,文件中的每一行有两个节点,表示连接这两个节点的边。 52 | #直接用networkx读入就行,很方便的操作。 53 | G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt', 54 | create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) 55 | 56 | #实例化模型,“句子”长度为10,80次游走等。重要的参数是p=0.25, q=4 57 | model = Node2Vec(G, walk_length=10, num_walks=80, 58 | p=0.25, q=4, workers=1, use_rejection_sampling=0) 59 | model.train(window_size=5, iter=3) #训练模型,关于gensim w2v的参数都默认在train里面 60 | embeddings = model.get_embeddings() #得到Embedding向量 61 | 62 | evaluate_embeddings(embeddings) #应用节点分类来评估嵌入向量的质量 63 | plot_embeddings(embeddings) #降成二维画在图中可视化 64 | --------------------------------------------------------------------------------