├── tests ├── __init__.py ├── models │ ├── __init__.py │ ├── NCF_test.py │ ├── SDM_test.py │ ├── MIND_test.py │ ├── COMIREC_test.py │ ├── YoutubeDNN_test.py │ ├── FM_test.py │ └── DSSM_test.py └── utils.py ├── docs ├── requirements.readthedocs.txt ├── pics │ ├── SDM.jpg │ ├── code.png │ ├── dssm.jpg │ ├── mind.jpg │ ├── ncf.jpg │ ├── code2.jpg │ ├── neucf.jpg │ ├── comirec.jpg │ ├── deepctrbot.png │ ├── weichennote.png │ ├── youtubednn.jpg │ ├── planet_github.png │ └── movielens_sample.png ├── source │ ├── modules.rst │ ├── deepmatch.utils.rst │ ├── deepmatch.inputs.rst │ ├── deepmatch.models.fm.rst │ ├── deepmatch.layers.core.rst │ ├── deepmatch.models.ncf.rst │ ├── deepmatch.models.sdm.rst │ ├── deepmatch.models.dssm.rst │ ├── deepmatch.models.mind.rst │ ├── deepmatch.models.comirec.rst │ ├── deepmatch.layers.sequence.rst │ ├── deepmatch.layers.interaction.rst │ ├── deepmatch.models.youtubednn.rst │ ├── deepmatch.layers.rst │ ├── Models.rst │ ├── deepmatch.rst │ ├── deepmatch.models.rst │ ├── Quick-Start.md │ ├── History.md │ ├── FAQ.md │ ├── index.rst │ ├── Features.md │ └── conf.py ├── Makefile └── make.bat ├── deepmatch ├── __init__.py ├── models │ ├── __init__.py │ ├── fm.py │ ├── youtubednn.py │ ├── dssm.py │ ├── ncf.py │ ├── sdm.py │ ├── mind.py │ └── comirec.py ├── layers │ ├── __init__.py │ ├── sequence.py │ ├── core.py │ └── interaction.py ├── inputs.py └── utils.py ├── CONTRIBUTING.md ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── question.md │ └── feature_request.md └── workflows │ └── ci.yml ├── setup.py ├── .gitignore ├── examples ├── run_ncf.py ├── run_dssm_negsampling.py ├── run_dssm_inbatchsoftmax.py ├── run_youtubednn.py ├── preprocess.py ├── run_sdm.py └── colab_MovieLen1M_YoutubeDNN.ipynb ├── README.md └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/requirements.readthedocs.txt: -------------------------------------------------------------------------------- 1 | tensorflow==2.6.2 2 | recommonmark==0.7.1 -------------------------------------------------------------------------------- /docs/pics/SDM.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenweichen/DeepMatch/HEAD/docs/pics/SDM.jpg -------------------------------------------------------------------------------- /docs/pics/code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenweichen/DeepMatch/HEAD/docs/pics/code.png -------------------------------------------------------------------------------- /docs/pics/dssm.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenweichen/DeepMatch/HEAD/docs/pics/dssm.jpg -------------------------------------------------------------------------------- /docs/pics/mind.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenweichen/DeepMatch/HEAD/docs/pics/mind.jpg -------------------------------------------------------------------------------- /docs/pics/ncf.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/shenweichen/DeepMatch/HEAD/docs/pics/ncf.jpg -------------------------------------------------------------------------------- /docs/pics/code2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenweichen/DeepMatch/HEAD/docs/pics/code2.jpg -------------------------------------------------------------------------------- /docs/pics/neucf.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenweichen/DeepMatch/HEAD/docs/pics/neucf.jpg -------------------------------------------------------------------------------- /docs/pics/comirec.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenweichen/DeepMatch/HEAD/docs/pics/comirec.jpg -------------------------------------------------------------------------------- /docs/pics/deepctrbot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenweichen/DeepMatch/HEAD/docs/pics/deepctrbot.png -------------------------------------------------------------------------------- /docs/pics/weichennote.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenweichen/DeepMatch/HEAD/docs/pics/weichennote.png -------------------------------------------------------------------------------- /docs/pics/youtubednn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenweichen/DeepMatch/HEAD/docs/pics/youtubednn.jpg -------------------------------------------------------------------------------- /docs/pics/planet_github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenweichen/DeepMatch/HEAD/docs/pics/planet_github.png -------------------------------------------------------------------------------- /docs/pics/movielens_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenweichen/DeepMatch/HEAD/docs/pics/movielens_sample.png -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | deepmatch 2 | ========= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | deepmatch 8 | -------------------------------------------------------------------------------- /deepmatch/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import check_version 2 | 3 | __version__ = '0.3.1' 4 | check_version(__version__) 5 | -------------------------------------------------------------------------------- /docs/source/deepmatch.utils.rst: -------------------------------------------------------------------------------- 1 | deepmatch.utils module 2 | ====================== 3 | 4 | .. automodule:: deepmatch.utils 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/deepmatch.inputs.rst: -------------------------------------------------------------------------------- 1 | deepmatch.inputs module 2 | ======================= 3 | 4 | .. 
automodule:: deepmatch.inputs 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/deepmatch.models.fm.rst: -------------------------------------------------------------------------------- 1 | deepmatch.models.fm module 2 | ========================== 3 | 4 | .. automodule:: deepmatch.models.fm 5 | :members: 6 | :no-undoc-members: 7 | :no-show-inheritance: 8 | -------------------------------------------------------------------------------- /deepmatch/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .fm import FM 2 | from .dssm import DSSM 3 | from .youtubednn import YoutubeDNN 4 | from .ncf import NCF 5 | from .mind import MIND 6 | from .sdm import SDM 7 | from .comirec import ComiRec -------------------------------------------------------------------------------- /docs/source/deepmatch.layers.core.rst: -------------------------------------------------------------------------------- 1 | deepmatch.layers.core module 2 | ============================ 3 | 4 | .. automodule:: deepmatch.layers.core 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/deepmatch.models.ncf.rst: -------------------------------------------------------------------------------- 1 | deepmatch.models.ncf module 2 | =========================== 3 | 4 | .. automodule:: deepmatch.models.ncf 5 | :members: 6 | :no-undoc-members: 7 | :no-show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/deepmatch.models.sdm.rst: -------------------------------------------------------------------------------- 1 | deepmatch.models.sdm module 2 | ============================ 3 | 4 | .. automodule:: deepmatch.models.sdm 5 | :members: 6 | :no-undoc-members: 7 | :no-show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/deepmatch.models.dssm.rst: -------------------------------------------------------------------------------- 1 | deepmatch.models.dssm module 2 | ============================ 3 | 4 | .. automodule:: deepmatch.models.dssm 5 | :members: 6 | :no-undoc-members: 7 | :no-show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/deepmatch.models.mind.rst: -------------------------------------------------------------------------------- 1 | deepmatch.models.mind module 2 | ============================ 3 | 4 | .. automodule:: deepmatch.models.mind 5 | :members: 6 | :no-undoc-members: 7 | :no-show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/deepmatch.models.comirec.rst: -------------------------------------------------------------------------------- 1 | deepmatch.models.comirec module 2 | ============================ 3 | 4 | .. automodule:: deepmatch.models.comirec 5 | :members: 6 | :no-undoc-members: 7 | :no-show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/deepmatch.layers.sequence.rst: -------------------------------------------------------------------------------- 1 | deepmatch.layers.sequence module 2 | ================================ 3 | 4 | .. 
automodule:: deepmatch.layers.sequence 5 | :members: 6 | :no-undoc-members: 7 | :no-show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/deepmatch.layers.interaction.rst: -------------------------------------------------------------------------------- 1 | deepmatch.layers.interaction module 2 | =================================== 3 | 4 | .. automodule:: deepmatch.layers.interaction 5 | :members: 6 | :no-undoc-members: 7 | :no-show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/deepmatch.models.youtubednn.rst: -------------------------------------------------------------------------------- 1 | deepmatch.models.youtubednn module 2 | ================================== 3 | 4 | .. automodule:: deepmatch.models.youtubednn 5 | :members: 6 | :no-undoc-members: 7 | :no-show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/deepmatch.layers.rst: -------------------------------------------------------------------------------- 1 | deepmatch.layers package 2 | ======================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | deepmatch.layers.core 10 | 11 | Module contents 12 | --------------- 13 | 14 | .. automodule:: deepmatch.layers 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | This project is under development and we need developers to participate in. 2 | 3 | If you 4 | 5 | - familiar with and interested in matching algorithms 6 | - familiar with tensorflow 7 | - have spare time to learn and develop 8 | - familiar with git 9 | 10 | please send a brief introduction of your background and experience to wcshen1994@163.com, welcome to join us! -------------------------------------------------------------------------------- /docs/source/Models.rst: -------------------------------------------------------------------------------- 1 | DeepMatch Models API 2 | ====================== 3 | 4 | .. toctree:: 5 | Model Methods 6 | FM 7 | DSSM 8 | YoutubeDNN 9 | NCF 10 | SDM 11 | MIND 12 | COMIREC 13 | -------------------------------------------------------------------------------- /docs/source/deepmatch.rst: -------------------------------------------------------------------------------- 1 | deepmatch package 2 | ================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | deepmatch.layers 10 | deepmatch.models 11 | 12 | Submodules 13 | ---------- 14 | 15 | .. toctree:: 16 | 17 | deepmatch.inputs 18 | deepmatch.utils 19 | 20 | Module contents 21 | --------------- 22 | 23 | .. 
automodule:: deepmatch 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | -------------------------------------------------------------------------------- /tests/models/NCF_test.py: -------------------------------------------------------------------------------- 1 | from deepmatch.models import NCF 2 | from ..utils import get_xy_fd_ncf 3 | 4 | 5 | def test_NCF(): 6 | model_name = "NCF" 7 | 8 | x, y, user_feature_columns, item_feature_columns = get_xy_fd_ncf(False) 9 | model = NCF(user_feature_columns, item_feature_columns, ) 10 | 11 | model.compile('adam', "binary_crossentropy") 12 | model.fit(x, y, batch_size=10, epochs=2, validation_split=0.5) 13 | 14 | 15 | if __name__ == "__main__": 16 | pass 17 | -------------------------------------------------------------------------------- /docs/source/deepmatch.models.rst: -------------------------------------------------------------------------------- 1 | deepmatch.models package 2 | ======================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | deepmatch.models.dssm 10 | deepmatch.models.fm 11 | deepmatch.models.mind 12 | deepmatch.models.comirec 13 | deepmatch.models.ncf 14 | deepmatch.models.sdm 15 | deepmatch.models.youtubednn 16 | 17 | Module contents 18 | --------------- 19 | 20 | .. automodule:: deepmatch.models 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug(问题描述)** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce(复现步骤)** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Operating environment(运行环境):** 21 | - python version [e.g. 3.6, 3.7, 3.8] 22 | - tensorflow version [e.g. 1.9.0, 1.14.0, 2.5.0] 23 | - deepmatch version [e.g. 0.3.1,] 24 | 25 | **Additional context** 26 | Add any other context about the problem here. 27 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = DeepMatch 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask any question ~ 4 | title: '' 5 | labels: question 6 | assignees: '' 7 | 8 | --- 9 | Please refer to the [FAQ](https://deepmatch.readthedocs.io/en/latest/FAQ.html) in doc and search for the [related issues](https://github.com/shenweichen/DeepMatch/issues) before you ask the question. 10 | 11 | **Describe the question(问题描述)** 12 | A clear and concise description of what the question is. 13 | 14 | **Additional context** 15 | Add any other context about the problem here. 16 | 17 | **Operating environment(运行环境):** 18 | - python version [e.g. 3.6, 3.7, 3.8] 19 | - tensorflow version [e.g. 1.9.0, 1.14.0, 2.5.0] 20 | - deepmatch version [e.g. 0.3.1,] 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement&feature request 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /docs/source/Quick-Start.md: -------------------------------------------------------------------------------- 1 | # Quick-Start 2 | 3 | ## Installation Guide 4 | Now `deepmatch` is available for python `2.7 `and `3.6, 3.7, 3.8`. 5 | `deepmatch` depends on tensorflow, you can specify to install the cpu version or gpu version through `pip`. 6 | 7 | ### CPU version 8 | 9 | ```bash 10 | $ pip install deepmatch[cpu] 11 | ``` 12 | ### GPU version 13 | 14 | ```bash 15 | $ pip install deepmatch[gpu] 16 | ``` 17 | ## Run examples !! 18 | 19 | - [Run models on MovieLen1M in Google colab](./Examples.html#run-models-on-movielen1m-in-google-colab) 20 | 21 | - [YoutubeDNN/MIND with sampled softmax](./Examples.html#youtubednn-mind-with-sampled-softmax) 22 | - [SDM with sampled softmax](./Examples.html#sdm-with-sampled-softmax) 23 | - [DSSM with in batch softmax](./Examples.html#dssm-with-in-batch-softmax) 24 | - [DSSM with negative sampling](./Examples.html#dssm-with-negative-sampling) 25 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=DeepMatch 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/source/History.md: -------------------------------------------------------------------------------- 1 | # History 2 | - 10/31/2022 : [v0.3.1](https://github.com/shenweichen/DeepMatch/releases/tag/v0.3.1) released.Add `ComiRec` model . 3 | - 07/04/2022 : [v0.3.0](https://github.com/shenweichen/DeepMatch/releases/tag/v0.3.0) released.Support different negative sampling strategies, including `inbatch`, `uniform`, `frequency`, `adaptive`. 4 | - 06/17/2022 : [v0.2.1](https://github.com/shenweichen/DeepMatch/releases/tag/v0.2.1) released.Fix some bugs. 5 | - 10/12/2020 : [v0.2.0](https://github.com/shenweichen/DeepMatch/releases/tag/v0.2.0) released.Support different initializers for different embedding weights and loading pretrained embeddings. 6 | - 05/17/2020 : [v0.1.3](https://github.com/shenweichen/DeepMatch/releases/tag/v0.1.3) released.Add `SDM` model . 7 | - 04/10/2020 : [v0.1.2](https://github.com/shenweichen/DeepMatch/releases/tag/v0.1.2) released.Support [saving and loading model](./FAQ.html#save-or-load-weights-models). 8 | - 04/06/2020 : DeepMatch first version is released on [PyPi](https://pypi.org/project/deepmatch/) -------------------------------------------------------------------------------- /tests/models/SDM_test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from deepmatch.models import SDM 3 | from deepmatch.utils import sampledsoftmaxloss, NegativeSampler 4 | from tensorflow.python.keras import backend as K 5 | 6 | from ..utils import check_model, get_xy_fd_sdm 7 | 8 | 9 | def test_SDM(): 10 | model_name = "SDM" 11 | x, y, user_feature_columns, item_feature_columns, history_feature_list = get_xy_fd_sdm(False) 12 | 13 | if tf.__version__ >= '2.0.0': 14 | tf.compat.v1.disable_eager_execution() 15 | #tf.compat.v1.disable_v2_behavior() 16 | else: 17 | K.set_learning_phase(True) 18 | 19 | sampler_config = NegativeSampler(sampler='uniform', num_sampled=2, item_name='item') 20 | model = SDM(user_feature_columns, item_feature_columns, history_feature_list, units=8, 21 | sampler_config=sampler_config) 22 | # model.summary() 23 | 24 | model.compile('adam', sampledsoftmaxloss) 25 | check_model(model, model_name, x, y) 26 | 27 | 28 | if __name__ == "__main__": 29 | pass 30 | -------------------------------------------------------------------------------- /tests/models/MIND_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | from deepmatch.models import MIND 4 | from deepmatch.utils import sampledsoftmaxloss, NegativeSampler 5 | from tensorflow.python.keras import backend as K 6 | 7 | from ..utils import check_model, get_xy_fd 8 | 9 | 10 | @pytest.mark.parametrize( 11 | 'dynamic_k,p', 12 | [(False, 1), (True, 100) 13 | ] 14 | ) 15 | def test_MIND(dynamic_k, p): 16 | model_name = "MIND" 17 | 18 | x, y, 
user_feature_columns, item_feature_columns = get_xy_fd(False) 19 | 20 | if tf.__version__ >= '2.0.0': 21 | tf.compat.v1.disable_eager_execution() 22 | else: 23 | K.set_learning_phase(True) 24 | sampler_config = NegativeSampler(sampler='uniform', num_sampled=2, item_name='item') 25 | model = MIND(user_feature_columns, item_feature_columns, p=p, dynamic_k=dynamic_k, 26 | user_dnn_hidden_units=(16, 4), sampler_config=sampler_config) 27 | 28 | model.compile('adam', sampledsoftmaxloss) 29 | check_model(model, model_name, x, y) 30 | 31 | 32 | if __name__ == "__main__": 33 | pass 34 | -------------------------------------------------------------------------------- /tests/models/COMIREC_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | from deepmatch.models import ComiRec 4 | from deepmatch.utils import sampledsoftmaxloss, NegativeSampler 5 | from tensorflow.python.keras import backend as K 6 | 7 | from tests.utils import check_model, get_xy_fd 8 | 9 | 10 | @pytest.mark.parametrize( 11 | 'k_max,p,interest_extractor,add_pos', 12 | [(2, 1, 'sa', True), (1, 100, 'dr', False), (3, 50, 'dr', True), 13 | ] 14 | ) 15 | def test_COMIREC(k_max, p, interest_extractor, add_pos): 16 | model_name = "COMIREC" 17 | 18 | x, y, user_feature_columns, item_feature_columns = get_xy_fd(False) 19 | 20 | if tf.__version__ >= '2.0.0': 21 | tf.compat.v1.disable_eager_execution() 22 | else: 23 | K.set_learning_phase(True) 24 | sampler_config = NegativeSampler(sampler='uniform', num_sampled=2, item_name='item') 25 | model = ComiRec(user_feature_columns, item_feature_columns, k_max=k_max, p=p, interest_extractor=interest_extractor, 26 | add_pos=add_pos, sampler_config=sampler_config) 27 | model.compile('adam', sampledsoftmaxloss) 28 | check_model(model, model_name, x, y) 29 | 30 | 31 | if __name__ == "__main__": 32 | pass 33 | -------------------------------------------------------------------------------- /tests/models/YoutubeDNN_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | from deepmatch.models import YoutubeDNN 4 | from deepmatch.utils import sampledsoftmaxloss, NegativeSampler 5 | from tensorflow.python.keras import backend as K 6 | from tests.utils import check_model, get_xy_fd 7 | 8 | 9 | @pytest.mark.parametrize( 10 | 'sampler', 11 | ['inbatch', 'uniform', 'frequency', 'adaptive', 12 | ] 13 | ) 14 | def test_YoutubeDNN(sampler): 15 | model_name = "YoutubeDNN" 16 | 17 | x, y, user_feature_columns, item_feature_columns = get_xy_fd(False) 18 | 19 | if tf.__version__ >= '2.0.0': 20 | tf.compat.v1.disable_eager_execution() 21 | else: 22 | K.set_learning_phase(True) 23 | from collections import Counter 24 | train_counter = Counter(x['item']) 25 | item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)] 26 | sampler_config = NegativeSampler(sampler, num_sampled=2, item_name='item', item_count=item_count, distortion=1.0) 27 | model = YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(16, 4), 28 | sampler_config=sampler_config) 29 | model.compile('adam', sampledsoftmaxloss) 30 | 31 | check_model(model, model_name, x, y, check_model_io=True) 32 | 33 | 34 | if __name__ == "__main__": 35 | pass 36 | -------------------------------------------------------------------------------- /tests/models/FM_test.py: -------------------------------------------------------------------------------- 
1 | import pytest 2 | import tensorflow as tf 3 | from deepmatch.models import FM 4 | from deepmatch.utils import sampledsoftmaxloss, NegativeSampler 5 | from tensorflow.python.keras import backend as K 6 | 7 | from ..utils import check_model, get_xy_fd 8 | 9 | 10 | @pytest.mark.parametrize( 11 | 'loss_type', 12 | ['logistic', 'softmax' 13 | ] 14 | ) 15 | def test_FM(loss_type): 16 | model_name = "FM" 17 | 18 | x, y, user_feature_columns, item_feature_columns = get_xy_fd(False) 19 | if tf.__version__ >= '2.0.0': 20 | tf.compat.v1.disable_eager_execution() 21 | else: 22 | K.set_learning_phase(True) 23 | if loss_type == "logistic": 24 | model = FM(user_feature_columns, item_feature_columns, loss_type=loss_type) 25 | model.compile('adam', "binary_crossentropy") 26 | else: 27 | from collections import Counter 28 | item_name = 'item' 29 | train_counter = Counter(x[item_name]) 30 | item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)] 31 | sampler_config = NegativeSampler(sampler='inbatch', num_sampled=2, item_name=item_name, item_count=item_count) 32 | model = FM(user_feature_columns, item_feature_columns, loss_type=loss_type, sampler_config=sampler_config) 33 | model.compile('adam', sampledsoftmaxloss) 34 | check_model(model, model_name, x, y) 35 | 36 | 37 | if __name__ == "__main__": 38 | pass 39 | -------------------------------------------------------------------------------- /docs/source/FAQ.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | 4 | ## 1. Save or load weights/models 5 | ---------------------------------------- 6 | To save/load weights, you can write code just like with any other Keras model. 7 | 8 | ```python 9 | model = YoutubeDNN() 10 | model.save_weights('YoutubeDNN_w.h5') 11 | model.load_weights('YoutubeDNN_w.h5') 12 | ``` 13 | 14 | To save/load whole models, it is just a little different. 15 | 16 | ```python 17 | from tensorflow.python.keras.models import save_model,load_model 18 | model = YoutubeDNN() 19 | save_model(model, 'YoutubeDNN.h5')# save_model, same as before 20 | 21 | from deepmatch.layers import custom_objects 22 | model = load_model('YoutubeDNN.h5',custom_objects)# load_model, just add a parameter 23 | ``` 24 | 25 | ## 2. Set learning rate and use earlystopping 26 | --------------------------------------------------- 27 | You can use any model in DeepMatch like a Keras model object. 
28 | Here is a example of how to set learning rate and earlystopping: 29 | 30 | ```python 31 | import deepmatch 32 | from tensorflow.python.keras.optimizers import Adam,Adagrad 33 | from tensorflow.python.keras.callbacks import EarlyStopping 34 | 35 | model = deepmatch.models.FM(user_feature_columns,item_feature_columns) 36 | model.compile(Adagrad(0.01),'binary_crossentropy',metrics=['binary_crossentropy']) 37 | 38 | es = EarlyStopping(monitor='val_binary_crossentropy') 39 | history = model.fit(model_input, data[target].values,batch_size=256, epochs=10, verbose=2, validation_split=0.2,callbacks=[es] ) 40 | ``` 41 | -------------------------------------------------------------------------------- /deepmatch/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from deepctr.layers import custom_objects 2 | from deepctr.layers.utils import reduce_sum 3 | 4 | from .core import PoolingLayer, LabelAwareAttention, CapsuleLayer, SampledSoftmaxLayer, EmbeddingIndex, \ 5 | MaskUserEmbedding, InBatchSoftmaxLayer 6 | from .interaction import DotAttention, ConcatAttention, SoftmaxWeightedSum, AttentionSequencePoolingLayer, \ 7 | SelfAttention, \ 8 | SelfMultiHeadAttention, UserAttention 9 | from .sequence import DynamicMultiRNN 10 | from ..utils import sampledsoftmaxloss 11 | 12 | _custom_objects = {'PoolingLayer': PoolingLayer, 13 | 'LabelAwareAttention': LabelAwareAttention, 14 | 'CapsuleLayer': CapsuleLayer, 15 | 'reduce_sum': reduce_sum, 16 | 'SampledSoftmaxLayer': SampledSoftmaxLayer, 17 | 'InBatchSoftmaxLayer': InBatchSoftmaxLayer, 18 | 'sampledsoftmaxloss': sampledsoftmaxloss, 19 | 'EmbeddingIndex': EmbeddingIndex, 20 | 'DotAttention': DotAttention, 21 | 'ConcatAttention': ConcatAttention, 22 | 'SoftmaxWeightedSum': SoftmaxWeightedSum, 23 | 'AttentionSequencePoolingLayer': AttentionSequencePoolingLayer, 24 | 'SelfAttention': SelfAttention, 25 | 'SelfMultiHeadAttention': SelfMultiHeadAttention, 26 | 'UserAttention': UserAttention, 27 | 'DynamicMultiRNN': DynamicMultiRNN, 28 | 'MaskUserEmbedding': MaskUserEmbedding 29 | } 30 | 31 | custom_objects = dict(custom_objects, **_custom_objects) 32 | -------------------------------------------------------------------------------- /tests/models/DSSM_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | from deepmatch.models import DSSM 4 | from deepmatch.utils import sampledsoftmaxloss, NegativeSampler 5 | from tensorflow.python.keras import backend as K 6 | 7 | from ..utils import check_model, get_xy_fd 8 | 9 | 10 | @pytest.mark.parametrize( 11 | 'loss_type,user_dnn_hidden_units,item_dnn_hidden_units', 12 | [('logistic', [32, 4], []), ('softmax', [64, 32], [32]) 13 | ] 14 | ) 15 | def test_DSSM(loss_type, user_dnn_hidden_units, item_dnn_hidden_units): 16 | model_name = "DSSM" 17 | 18 | x, y, user_feature_columns, item_feature_columns = get_xy_fd(False) 19 | if tf.__version__ >= '2.0.0': 20 | tf.compat.v1.disable_eager_execution() 21 | else: 22 | K.set_learning_phase(True) 23 | if loss_type == "logistic": 24 | model = DSSM(user_feature_columns, item_feature_columns, user_dnn_hidden_units=user_dnn_hidden_units, 25 | item_dnn_hidden_units=item_dnn_hidden_units, 26 | loss_type=loss_type) 27 | model.compile('adam', "binary_crossentropy") 28 | else: 29 | from collections import Counter 30 | item_name = 'item' 31 | train_counter = Counter(x[item_name]) 32 | item_count = [train_counter.get(i, 0) for i in 
range(item_feature_columns[0].vocabulary_size)] 33 | sampler_config = NegativeSampler(sampler='inbatch', num_sampled=2, item_name=item_name, item_count=item_count) 34 | model = DSSM(user_feature_columns, item_feature_columns, user_dnn_hidden_units=user_dnn_hidden_units, 35 | item_dnn_hidden_units=item_dnn_hidden_units, 36 | loss_type=loss_type, sampler_config=sampler_config) 37 | model.compile('adam', sampledsoftmaxloss) 38 | check_model(model, model_name, x, y) 39 | 40 | 41 | if __name__ == "__main__": 42 | pass 43 | -------------------------------------------------------------------------------- /deepmatch/inputs.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | 3 | from deepctr.feature_column import SparseFeat, VarLenSparseFeat, create_embedding_matrix, embedding_lookup, \ 4 | get_dense_input, varlen_embedding_lookup, get_varlen_pooling_list, mergeDict 5 | 6 | 7 | def input_from_feature_columns(features, feature_columns, l2_reg, seed, prefix='', seq_mask_zero=True, 8 | support_dense=True, support_group=False, embedding_matrix_dict=None): 9 | sparse_feature_columns = list( 10 | filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else [] 11 | varlen_sparse_feature_columns = list( 12 | filter(lambda x: isinstance(x, VarLenSparseFeat), feature_columns)) if feature_columns else [] 13 | if embedding_matrix_dict is None: 14 | embedding_matrix_dict = create_embedding_matrix(feature_columns, l2_reg, seed, prefix=prefix, 15 | seq_mask_zero=seq_mask_zero) 16 | 17 | group_sparse_embedding_dict = embedding_lookup(embedding_matrix_dict, features, sparse_feature_columns) 18 | dense_value_list = get_dense_input(features, feature_columns) 19 | if not support_dense and len(dense_value_list) > 0: 20 | raise ValueError("DenseFeat is not supported in dnn_feature_columns") 21 | 22 | sequence_embed_dict = varlen_embedding_lookup(embedding_matrix_dict, features, varlen_sparse_feature_columns) 23 | group_varlen_sparse_embedding_dict = get_varlen_pooling_list(sequence_embed_dict, features, 24 | varlen_sparse_feature_columns) 25 | group_embedding_dict = mergeDict(group_sparse_embedding_dict, group_varlen_sparse_embedding_dict) 26 | if not support_group: 27 | group_embedding_dict = list(chain.from_iterable(group_embedding_dict.values())) 28 | return group_embedding_dict, dense_value_list 29 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | REQUIRED_PACKAGES = [ 7 | 'requests', "deepctr~=0.9.2" 8 | ] 9 | 10 | setuptools.setup( 11 | name="deepmatch", 12 | version="0.3.1", 13 | author="Weichen Shen", 14 | author_email="weichenswc@163.com", 15 | description="Deep matching model library for recommendations, advertising. 
It's easy to train models and to **export representation vectors** for user and item which can be used for **ANN search**.", 16 | long_description=long_description, 17 | long_description_content_type="text/markdown", 18 | url="https://github.com/shenweichen/deepmatch", 19 | download_url='https://github.com/shenweichen/deepmatch/tags', 20 | packages=setuptools.find_packages( 21 | exclude=["tests", "tests.models", "tests.layers"]), 22 | python_requires=">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*", # '>=3.4', # 3.4.6 23 | install_requires=REQUIRED_PACKAGES, 24 | extras_require={ 25 | "cpu": ["tensorflow>=1.9.0"], 26 | "gpu": ["tensorflow-gpu>=1.9.0"], 27 | }, 28 | entry_points={ 29 | }, 30 | classifiers=( 31 | "License :: OSI Approved :: Apache Software License", 32 | "Operating System :: OS Independent", 33 | 'Intended Audience :: Developers', 34 | 'Intended Audience :: Education', 35 | 'Intended Audience :: Science/Research', 36 | 'Programming Language :: Python :: 3', 37 | 'Programming Language :: Python :: 3.6', 38 | 'Programming Language :: Python :: 3.7', 39 | 'Programming Language :: Python :: 3.8', 40 | 'Programming Language :: Python :: 3.9', 41 | 'Programming Language :: Python :: 3.10', 42 | 'Topic :: Scientific/Engineering', 43 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 44 | 'Topic :: Software Development', 45 | 'Topic :: Software Development :: Libraries', 46 | 'Topic :: Software Development :: Libraries :: Python Modules', 47 | ), 48 | license="Apache-2.0", 49 | keywords=['match', 'matching', 'recommendation' 50 | 'deep learning', 'tensorflow', 'tensor', 'keras'], 51 | ) 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /examples/run_ncf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from deepmatch.models import NCF 3 | from preprocess import gen_data_set, gen_model_input 4 | from sklearn.preprocessing import LabelEncoder 5 | 6 | if __name__ == "__main__": 7 | data = pd.read_csv("./movielens_sample.txt") 8 | sparse_features = ["movie_id", "user_id", 9 | "gender", "age", "occupation", "zip", ] 10 | SEQ_LEN = 50 11 | negsample = 3 12 | 13 | # 1.Label Encoding for sparse features, and process sequence features with `gen_data_set` and `gen_model_input` 14 | 15 | features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip'] 16 | feature_max_idx = {} 17 | for feature in features: 18 | lbe = LabelEncoder() 19 | data[feature] = lbe.fit_transform(data[feature]) + 1 20 | feature_max_idx[feature] = data[feature].max() + 1 21 | 22 | user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id') 23 | 24 | item_profile = data[["movie_id"]].drop_duplicates('movie_id') 25 | 26 | user_profile.set_index("user_id", inplace=True) 27 | 28 | user_item_list = data.groupby("user_id")['movie_id'].apply(list) 29 | 30 | train_set, test_set = gen_data_set(data, SEQ_LEN, negsample) 31 | 32 | train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN) 33 | test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN) 34 | 35 | # 2.count #unique features for each sparse field and generate feature config for sequence feature 36 | 37 | user_feature_columns = {"user_id": feature_max_idx['user_id'], 'gender': feature_max_idx['gender'], 38 | "age": feature_max_idx['age'], 39 | "occupation": feature_max_idx["occupation"], "zip": feature_max_idx["zip"]} 40 | 41 | item_feature_columns = {"movie_id": feature_max_idx['movie_id']} 42 | 43 | # 3.Define Model, train, predict and evaluate 44 | model = NCF(user_feature_columns, item_feature_columns, user_gmf_embedding_dim=20, 45 | item_gmf_embedding_dim=20, user_mlp_embedding_dim=32, item_mlp_embedding_dim=32, 46 | dnn_hidden_units=[128, 64, 32], ) 47 | model.summary() 48 | model.compile("adam", "binary_crossentropy", 49 | metrics=['binary_crossentropy'], ) 50 | 51 | history = model.fit(train_model_input, train_label, 52 | batch_size=64, epochs=20, verbose=2, validation_split=0.2, ) 53 | pred_ans = model.predict(test_model_input, batch_size=64) 54 | # print("test LogLoss", round(log_loss(test_label, pred_ans), 4)) 55 | # print("test AUC", round(roc_auc_score(test_label, pred_ans), 4)) 56 | 
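# A minimal evaluation sketch, assuming scikit-learn is installed; the metric imports below
# are not part of the original script, and `roc_auc_score` is only defined when `test_label`
# contains both classes:
# from sklearn.metrics import log_loss, roc_auc_score
# print("test LogLoss", round(log_loss(test_label, pred_ans), 4))
# print("test AUC", round(roc_auc_score(test_label, pred_ans), 4))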
-------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. DeepMatch documentation master file, created by 2 | sphinx-quickstart on Sun Apr 5 20:44:18 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to DeepMatch's documentation! 7 | ===================================== 8 | 9 | |Downloads|_ |Stars|_ |Forks|_ |PyPii|_ |Issues|_ |Chat|_ 10 | 11 | .. |Downloads| image:: https://pepy.tech/badge/deepmatch 12 | .. _Downloads: https://pepy.tech/project/deepmatch 13 | 14 | .. |Stars| image:: https://img.shields.io/github/stars/shenweichen/deepmatch.svg 15 | .. _Stars: https://github.com/shenweichen/DeepMatch 16 | 17 | .. |Forks| image:: https://img.shields.io/github/forks/shenweichen/deepmatch.svg 18 | .. _Forks: https://github.com/shenweichen/DeepMatch/fork 19 | 20 | .. |PyPii| image:: https://img.shields.io/pypi/v/deepmatch.svg 21 | .. _PyPii: https://pypi.org/project/deepmatch 22 | 23 | .. |Issues| image:: https://img.shields.io/github/issues/shenweichen/deepmatch.svg 24 | .. _Issues: https://github.com/shenweichen/deepmatch/issues 25 | 26 | .. |Chat| image:: https://img.shields.io/badge/chat-wechat-brightgreen?style=flat 27 | .. _Chat: ./#disscussiongroup 28 | 29 | 30 | DeepMatch is a deep matching model library for recommendations, advertising, and search. It's easy to **train models** and to **export representation vectors** for user and item which can be used for **ANN search**.You can use any complex model with ``model.fit()`` and ``model.predict()`` . 31 | 32 | 33 | Let's `Get Started! <./Quick-Start.html>`_ or `Run examples! `_ 34 | 35 | You can read the latest code at https://github.com/shenweichen/DeepMatch 36 | 37 | News 38 | ----- 39 | 40 | 10/31/2022 : Add `ComiRec` . `Changelog `_ 41 | 42 | 07/04/2022 : Support different negative sampling strategies, including `inbatch` , `uniform` , `frequency` , `adaptive` . `Changelog `_ 43 | 44 | 06/17/2022 : Fix some bugs. `Changelog `_ 45 | 46 | DisscussionGroup 47 | ----------------------- 48 | 49 | 50 | 公众号:**浅梦学习笔记** wechat ID: **deepctrbot** 51 | 52 | `Discussions `_ `学习小组主题集合 `_ 53 | 54 | .. image:: ../pics/code2.jpg 55 | 56 | 57 | .. toctree:: 58 | :maxdepth: 2 59 | :caption: Home: 60 | 61 | Quick-Start 62 | Features 63 | Examples 64 | FAQ 65 | History 66 | 67 | .. 
toctree:: 68 | :maxdepth: 3 69 | :caption: API: 70 | 71 | Models 72 | 73 | 74 | Indices and tables 75 | ================== 76 | 77 | * :ref:`genindex` 78 | * :ref:`modindex` 79 | * :ref:`search` 80 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | path: 6 | - 'deepmatch/*' 7 | - 'tests/*' 8 | pull_request: 9 | path: 10 | - 'deepmatch/*' 11 | - 'tests/*' 12 | 13 | jobs: 14 | build: 15 | 16 | runs-on: ubuntu-latest 17 | timeout-minutes: 120 18 | strategy: 19 | matrix: 20 | python-version: [3.6,3.7,3.8,3.9,3.10.7] 21 | tf-version: [1.9.0,1.14.0,2.5.0] 22 | 23 | exclude: 24 | - python-version: 3.7 25 | tf-version: 1.4.0 26 | - python-version: 3.7 27 | tf-version: 1.9.0 28 | - python-version: 3.7 29 | tf-version: 1.10.0 30 | - python-version: 3.7 31 | tf-version: 1.11.0 32 | - python-version: 3.7 33 | tf-version: 1.12.0 34 | - python-version: 3.7 35 | tf-version: 1.13.0 36 | - python-version: 3.7 37 | tf-version: 1.15.0 38 | - python-version: 3.8 39 | tf-version: 1.4.0 40 | - python-version: 3.8 41 | tf-version: 1.9.0 42 | - python-version: 3.8 43 | tf-version: 1.10.0 44 | - python-version: 3.8 45 | tf-version: 1.11.0 46 | - python-version: 3.8 47 | tf-version: 1.12.0 48 | - python-version: 3.8 49 | tf-version: 1.13.0 50 | - python-version: 3.8 51 | tf-version: 1.14.0 52 | - python-version: 3.8 53 | tf-version: 1.15.0 54 | - python-version: 3.6 55 | tf-version: 2.7.0 56 | - python-version: 3.6 57 | tf-version: 2.8.0 58 | - python-version: 3.6 59 | tf-version: 2.9.0 60 | - python-version: 3.6 61 | tf-version: 2.10.0 62 | - python-version: 3.9 63 | tf-version: 1.4.0 64 | - python-version: 3.9 65 | tf-version: 1.9.0 66 | - python-version: 3.9 67 | tf-version: 1.15.0 68 | - python-version: 3.9 69 | tf-version: 1.14.0 70 | - python-version: 3.10.7 71 | tf-version: 1.4.0 72 | - python-version: 3.10.7 73 | tf-version: 1.9.0 74 | - python-version: 3.10.7 75 | tf-version: 1.15.0 76 | - python-version: 3.10.7 77 | tf-version: 1.14.0 78 | - python-version: 3.10.7 79 | tf-version: 2.5.0 80 | - python-version: 3.10.7 81 | tf-version: 2.6.0 82 | 83 | steps: 84 | 85 | - uses: actions/checkout@v3 86 | 87 | - name: Setup python environment 88 | uses: actions/setup-python@v4 89 | with: 90 | python-version: ${{ matrix.python-version }} 91 | 92 | - name: Install dependencies 93 | run: | 94 | sudo apt update && sudo apt install -y pkg-config libhdf5-dev 95 | pip3 install -q tensorflow==${{ matrix.tf-version }} 96 | pip install -q protobuf==3.19.0 97 | pip install -q requests 98 | pip install -e . 
99 | - name: Test with pytest 100 | timeout-minutes: 120 101 | run: | 102 | pip install -q pytest 103 | pip install -q pytest-cov 104 | pip install -q python-coveralls 105 | pytest --cov=deepmatch --cov-report=xml 106 | - name: Upload coverage to Codecov 107 | uses: codecov/codecov-action@v3.1.0 108 | with: 109 | token: ${{secrets.CODECOV_TOKEN}} 110 | file: ./coverage.xml 111 | flags: pytest 112 | name: py${{ matrix.python-version }}-tf${{ matrix.tf-version }} 113 | -------------------------------------------------------------------------------- /deepmatch/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | 4 | Author: 5 | Weichen Shen, wcshenswc@163.com 6 | 7 | """ 8 | 9 | import json 10 | import logging 11 | import requests 12 | from collections import namedtuple 13 | from threading import Thread 14 | 15 | try: 16 | from packaging.version import parse 17 | except ImportError: 18 | from pip._vendor.packaging.version import parse 19 | 20 | import tensorflow as tf 21 | 22 | from tensorflow.python.keras import backend as K 23 | from tensorflow.python.keras.layers import Lambda 24 | 25 | 26 | class NegativeSampler( 27 | namedtuple('NegativeSampler', ['sampler', 'num_sampled', 'item_name', 'item_count', 'distortion'])): 28 | """ NegativeSampler 29 | Args: 30 | sampler: sampler name,['inbatch', 'uniform', 'frequency' 'adaptive',] . 31 | num_sampled: negative samples number per one positive sample. 32 | item_name: pkey of item features . 33 | item_count: global frequency of item . 34 | distortion: skew factor of the unigram probability distribution. 35 | """ 36 | __slots__ = () 37 | 38 | def __new__(cls, sampler, num_sampled, item_name, item_count=None, distortion=1.0, ): 39 | if sampler not in ['inbatch', 'uniform', 'frequency', 'adaptive']: 40 | raise ValueError(' `%s` sampler is not supported ' % sampler) 41 | if sampler in ['inbatch', 'frequency'] and item_count is None: 42 | raise ValueError(' `item_count` must not be `None` when using `inbatch` or `frequency` sampler') 43 | return super(NegativeSampler, cls).__new__(cls, sampler, num_sampled, item_name, item_count, distortion) 44 | 45 | # def __hash__(self): 46 | # return self.sampler.__hash__() 47 | 48 | 49 | def l2_normalize(x, axis=-1): 50 | return Lambda(lambda x: tf.nn.l2_normalize(x, axis))(x) 51 | 52 | 53 | def inner_product(x, y, temperature=1.0): 54 | return Lambda(lambda x: tf.reduce_sum(tf.multiply(x[0], x[1])) / temperature)([x, y]) 55 | 56 | 57 | def recall_N(y_true, y_pred, N=50): 58 | return len(set(y_pred[:N]) & set(y_true)) * 1.0 / len(y_true) 59 | 60 | 61 | def sampledsoftmaxloss(y_true, y_pred): 62 | return K.mean(y_pred) 63 | 64 | 65 | def get_item_embedding(item_embedding, item_input_layer): 66 | return Lambda(lambda x: tf.squeeze(tf.gather(item_embedding, x), axis=1))( 67 | item_input_layer) 68 | 69 | 70 | def check_version(version): 71 | """Return version of package on pypi.python.org using json.""" 72 | 73 | def check(version): 74 | try: 75 | url_pattern = 'https://pypi.python.org/pypi/deepmatch/json' 76 | req = requests.get(url_pattern) 77 | latest_version = parse('0') 78 | version = parse(version) 79 | if req.status_code == requests.codes.ok: 80 | j = json.loads(req.text.encode('utf-8')) 81 | releases = j.get('releases', []) 82 | for release in releases: 83 | ver = parse(release) 84 | if ver.is_prerelease or ver.is_postrelease: 85 | continue 86 | latest_version = max(latest_version, ver) 87 | if latest_version > version: 88 | 
logging.warning( 89 | '\nDeepMatch version {0} detected. Your version is {1}.\nUse `pip install -U deepmatch` to upgrade. Changelog: https://github.com/shenweichen/DeepMatch/releases/tag/v{0}'.format( 90 | latest_version, version)) 91 | except: 92 | print("Please check the latest version manually on https://pypi.org/project/deepmatch/#history") 93 | return 94 | 95 | Thread(target=check, args=(version,)).start() 96 | -------------------------------------------------------------------------------- /docs/source/Features.md: -------------------------------------------------------------------------------- 1 | # Features 2 | 3 | ## Feature Columns 4 | ### SparseFeat 5 | ``SparseFeat`` is a namedtuple with signature ``SparseFeat(name, vocabulary_size, embedding_dim, use_hash, dtype, embeddings_initializer, embedding_name, group_name, trainable)`` 6 | 7 | - name : feature name 8 | - vocabulary_size : number of unique feature values for sparse feature or hashing space when `use_hash=True` 9 | - embedding_dim : embedding dimension 10 | - use_hash : default `False`. If `True`, the input will be hashed to space of size `vocabulary_size`. 11 | - dtype : default `int32`. dtype of input tensor. 12 | - embeddings_initializer : initializer for the `embeddings` matrix. 13 | - embedding_name : default `None`. If None, the embedding_name will be the same as `name`. 14 | - group_name : feature group of this feature. 15 | - trainable : default `True`. Whether or not the embedding is trainable. 16 | 17 | ### DenseFeat 18 | ``DenseFeat`` is a namedtuple with signature ``DenseFeat(name, dimension, dtype)`` 19 | 20 | - name : feature name 21 | - dimension : dimension of dense feature vector. 22 | - dtype : default `float32`. dtype of input tensor. 23 | 24 | ### VarLenSparseFeat 25 | 26 | ``VarLenSparseFeat`` is a namedtuple with signature ``VarLenSparseFeat(sparsefeat, maxlen, combiner, length_name, weight_name, weight_norm)`` 27 | 28 | - sparsefeat : an instance of `SparseFeat` 29 | - maxlen : maximum length of this feature for all samples 30 | - combiner : pooling method, can be ``sum``, ``mean`` or ``max`` 31 | - length_name : feature length name, if `None`, value 0 in feature is for padding. 32 | - weight_name : default `None`. If not None, the sequence feature will be multiplied by the feature whose name is `weight_name`. 33 | - weight_norm : default `True`. Whether to normalize the weight score or not. 
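For example, a minimal sketch of how these feature columns might be declared (the field names, vocabulary sizes and embedding dimensions below are made up for illustration):

```python
from deepctr.feature_column import SparseFeat, DenseFeat, VarLenSparseFeat

# hypothetical user-side features: an id, a dense value and a padded click-history sequence
user_feature_columns = [
    SparseFeat('user_id', vocabulary_size=1000, embedding_dim=16),
    DenseFeat('age', 1),
    VarLenSparseFeat(SparseFeat('hist_movie_id', vocabulary_size=3000, embedding_dim=16,
                                embedding_name='movie_id'),  # reuse the item embedding table
                     maxlen=50, combiner='mean', length_name='hist_len'),
]

# hypothetical item-side feature: the item id only
item_feature_columns = [SparseFeat('movie_id', vocabulary_size=3000, embedding_dim=16)]
```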
34 | 35 | ## Models 36 | 37 | 38 | ### FM (Factorization Machines) 39 | 40 | 41 | [**FM Model API**](./deepmatch.models.fm.html) 42 | 43 | 44 | [Factorization Machines](https://www.researchgate.net/publication/220766482_Factorization_Machines) 45 | 46 | 47 | ### DSSM (Deep Structured Semantic Model) 48 | 49 | 50 | [**DSSM Model API**](./deepmatch.models.dssm.html) 51 | 52 | ![DSSM](../pics/dssm.jpg) 53 | 54 | 55 | [Deep Structured Semantic Models for Web Search using Clickthrough Data](https://www.microsoft.com/en-us/research/publication/learning-deep-structured-semantic-models-for-web-search-using-clickthrough-data/) 56 | 57 | ### YoutubeDNN 58 | 59 | 60 | [**YoutubeDNN Model API**](./deepmatch.models.youtubednn.html) 61 | 62 | ![YoutubeDNN](../pics/youtubednn.jpg) 63 | 64 | [Deep Neural Networks for YouTube Recommendations](https://www.researchgate.net/publication/307573656_Deep_Neural_Networks_for_YouTube_Recommendations) 65 | 66 | ### NCF (Neural Collaborative Filtering) 67 | 68 | [**NCF Model API**](./deepmatch.models.ncf.html) 69 | 70 | ![NCF](../pics/ncf.jpg) 71 | 72 | [Neural Collaborative Filtering](https://arxiv.org/abs/1708.05031) 73 | 74 | ### SDM (Sequential Deep Matching Model) 75 | 76 | 77 | 78 | [**SDM Model API**](./deepmatch.models.sdm.html) 79 | 80 | ![SDM](../pics/SDM.jpg) 81 | 82 | [SDM example](https://github.com/shenweichen/DeepMatch/tree/master/examples/run_sdm.py) 83 | 84 | [SDM: Sequential Deep Matching Model for Online Large-scale Recommender System](https://arxiv.org/abs/1909.00385) 85 | 86 | 87 | 88 | ### MIND (Multi-Interest Network with Dynamic routing) 89 | 90 | 91 | 92 | [**MIND Model API**](./deepmatch.models.mind.html) 93 | 94 | ![MIND](../pics/mind.jpg) 95 | 96 | [Multi-interest network with dynamic routing for recommendation at Tmall](https://arxiv.org/pdf/1904.08030) 97 | 98 | ### COMIREC (Controllable Multi-Interest Framework for Recommendation) 99 | 100 | [**COMIREC Model API**](./deepmatch.models.comirec.html) 101 | 102 | ![COMIREC](../pics/comirec.jpg) 103 | 104 | [Controllable Multi-Interest Framework for Recommendation](https://arxiv.org/pdf/2005.09347) -------------------------------------------------------------------------------- /deepmatch/models/fm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: 3 | Weichen Shen, weichenswc@163.com 4 | 5 | """ 6 | from deepctr.feature_column import build_input_features 7 | from deepctr.layers.core import PredictionLayer 8 | from deepctr.layers.utils import concat_func, reduce_sum 9 | from tensorflow.python.keras.layers import Lambda 10 | from tensorflow.python.keras.models import Model 11 | 12 | from ..inputs import create_embedding_matrix, input_from_feature_columns 13 | from ..layers.core import InBatchSoftmaxLayer 14 | from ..utils import l2_normalize, inner_product 15 | 16 | 17 | def FM(user_feature_columns, item_feature_columns, l2_reg_embedding=1e-6, loss_type='softmax', temperature=0.05, 18 | sampler_config=None, seed=1024, 19 | ): 20 | """Instantiates the FM architecture. 21 | 22 | :param user_feature_columns: An iterable containing user's features used by the model. 23 | :param item_feature_columns: An iterable containing item's features used by the model. 24 | :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector 25 | :param loss_type: string. Loss type. 26 | :param temperature: float. Scaling factor. 27 | :param sampler_config: negative sample config. 
28 | :param seed: integer ,to use as random seed. 29 | :return: A Keras model instance. 30 | 31 | """ 32 | 33 | embedding_matrix_dict = create_embedding_matrix(user_feature_columns + item_feature_columns, l2_reg_embedding, 34 | seed=seed, 35 | seq_mask_zero=True) 36 | 37 | user_features = build_input_features(user_feature_columns) 38 | user_inputs_list = list(user_features.values()) 39 | user_sparse_embedding_list, _ = input_from_feature_columns(user_features, 40 | user_feature_columns, 41 | l2_reg_embedding, seed=seed, 42 | support_dense=False, 43 | embedding_matrix_dict=embedding_matrix_dict) 44 | 45 | item_features = build_input_features(item_feature_columns) 46 | item_inputs_list = list(item_features.values()) 47 | item_sparse_embedding_list, _ = input_from_feature_columns(item_features, 48 | item_feature_columns, 49 | l2_reg_embedding, seed=seed, 50 | support_dense=False, 51 | embedding_matrix_dict=embedding_matrix_dict) 52 | 53 | user_dnn_input = concat_func(user_sparse_embedding_list, axis=1) 54 | user_vector_sum = Lambda(lambda x: reduce_sum(x, axis=1, keep_dims=False))(user_dnn_input) 55 | user_vector_sum = l2_normalize(user_vector_sum) 56 | 57 | item_dnn_input = concat_func(item_sparse_embedding_list, axis=1) 58 | item_vector_sum = Lambda(lambda x: reduce_sum(x, axis=1, keep_dims=False))(item_dnn_input) 59 | item_vector_sum = l2_normalize(item_vector_sum) 60 | 61 | if loss_type == "logistic": 62 | score = inner_product(user_vector_sum, item_vector_sum, temperature) 63 | output = PredictionLayer("binary", False)(score) 64 | 65 | elif loss_type == "softmax": 66 | output = InBatchSoftmaxLayer(sampler_config._asdict(), temperature)( 67 | [user_vector_sum, item_vector_sum, item_features[sampler_config.item_name]]) 68 | else: 69 | raise ValueError(' `loss_type` must be `logistic` or `softmax` ') 70 | 71 | model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output) 72 | 73 | model.__setattr__("user_input", user_inputs_list) 74 | model.__setattr__("user_embedding", user_vector_sum) 75 | 76 | model.__setattr__("item_input", item_inputs_list) 77 | model.__setattr__("item_embedding", item_vector_sum) 78 | 79 | return model 80 | -------------------------------------------------------------------------------- /deepmatch/models/youtubednn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: 3 | Weichen Shen, weichenswc@163.com 4 | Reference: 5 | Covington P, Adams J, Sargin E. Deep neural networks for youtube recommendations[C]//Proceedings of the 10th ACM conference on recommender systems. 2016: 191-198. 6 | """ 7 | from deepctr.feature_column import build_input_features 8 | from deepctr.layers import DNN 9 | from deepctr.layers.utils import NoMask, combined_dnn_input 10 | from tensorflow.python.keras.models import Model 11 | 12 | from ..inputs import input_from_feature_columns, create_embedding_matrix 13 | from ..layers.core import SampledSoftmaxLayer, EmbeddingIndex, PoolingLayer 14 | from ..utils import get_item_embedding, l2_normalize 15 | 16 | 17 | def YoutubeDNN(user_feature_columns, item_feature_columns, 18 | user_dnn_hidden_units=(64, 32), 19 | dnn_activation='relu', dnn_use_bn=False, 20 | l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, output_activation='linear', temperature=0.05, 21 | sampler_config=None, seed=1024): 22 | """Instantiates the YoutubeDNN Model architecture. 23 | 24 | :param user_feature_columns: An iterable containing user's features used by the model. 
25 | :param item_feature_columns: An iterable containing item's features used by the model. 26 | :param user_dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of user tower 27 | :param dnn_activation: Activation function to use in deep net 28 | :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in deep net 29 | :param l2_reg_dnn: float. L2 regularizer strength applied to DNN 30 | :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector 31 | :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate. 32 | :param output_activation: Activation function to use in output layer 33 | :param temperature: float. Scaling factor. 34 | :param sampler_config: negative sample config. 35 | :param seed: integer ,to use as random seed. 36 | :return: A Keras model instance. 37 | 38 | """ 39 | 40 | if len(item_feature_columns) > 1: 41 | raise ValueError("Now YoutubeNN only support 1 item feature like item_id") 42 | item_feature_name = item_feature_columns[0].name 43 | item_vocabulary_size = item_feature_columns[0].vocabulary_size 44 | 45 | embedding_matrix_dict = create_embedding_matrix(user_feature_columns + item_feature_columns, l2_reg_embedding, 46 | seed=seed) 47 | 48 | user_features = build_input_features(user_feature_columns) 49 | user_inputs_list = list(user_features.values()) 50 | user_sparse_embedding_list, user_dense_value_list = input_from_feature_columns(user_features, user_feature_columns, 51 | l2_reg_embedding, seed=seed, 52 | embedding_matrix_dict=embedding_matrix_dict) 53 | user_dnn_input = combined_dnn_input(user_sparse_embedding_list, user_dense_value_list) 54 | 55 | item_features = build_input_features(item_feature_columns) 56 | item_inputs_list = list(item_features.values()) 57 | user_dnn_out = DNN(user_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, 58 | dnn_use_bn, output_activation=output_activation, seed=seed)(user_dnn_input) 59 | user_dnn_out = l2_normalize(user_dnn_out) 60 | 61 | item_index = EmbeddingIndex(list(range(item_vocabulary_size)))(item_features[item_feature_name]) 62 | 63 | item_embedding_matrix = embedding_matrix_dict[ 64 | item_feature_name] 65 | item_embedding_weight = NoMask()(item_embedding_matrix(item_index)) 66 | 67 | pooling_item_embedding_weight = PoolingLayer()([item_embedding_weight]) 68 | 69 | pooling_item_embedding_weight = l2_normalize(pooling_item_embedding_weight) 70 | output = SampledSoftmaxLayer(sampler_config._asdict(), temperature)( 71 | [pooling_item_embedding_weight, user_dnn_out, item_features[item_feature_name]]) 72 | model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output) 73 | 74 | model.__setattr__("user_input", user_inputs_list) 75 | model.__setattr__("user_embedding", user_dnn_out) 76 | 77 | model.__setattr__("item_input", item_inputs_list) 78 | model.__setattr__("item_embedding", 79 | get_item_embedding(pooling_item_embedding_weight, item_features[item_feature_name])) 80 | 81 | return model 82 | -------------------------------------------------------------------------------- /deepmatch/layers/sequence.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Author: 4 | Weichen Shen,weichenswc@163.com 5 | 6 | """ 7 | 8 | import tensorflow as tf 9 | from tensorflow.python.keras.layers import Layer 10 | 11 | 12 | class DynamicMultiRNN(Layer): 13 | def __init__(self, num_units=None, rnn_type='LSTM', return_sequence=True, num_layers=2, 
num_residual_layers=1, 14 | dropout_rate=0.2, 15 | forget_bias=1.0, **kwargs): 16 | 17 | self.num_units = num_units 18 | self.return_sequence = return_sequence 19 | self.rnn_type = rnn_type 20 | self.num_layers = num_layers 21 | self.num_residual_layers = num_residual_layers 22 | self.dropout = dropout_rate 23 | self.forget_bias = forget_bias 24 | super(DynamicMultiRNN, self).__init__(**kwargs) 25 | 26 | def build(self, input_shape): 27 | # Create a trainable weight variable for this layer. 28 | input_seq_shape = input_shape[0] 29 | if self.num_units is None: 30 | self.num_units = input_seq_shape.as_list()[-1] 31 | if self.rnn_type == "LSTM": 32 | try: 33 | single_cell = tf.nn.rnn_cell.BasicLSTMCell(self.num_units, forget_bias=self.forget_bias) 34 | except AttributeError: 35 | single_cell = tf.compat.v1.nn.rnn_cell.BasicLSTMCell(self.num_units, forget_bias=self.forget_bias) 36 | elif self.rnn_type == "GRU": 37 | try: 38 | single_cell = tf.nn.rnn_cell.GRUCell(self.num_units, forget_bias=self.forget_bias) 39 | except AttributeError: 40 | single_cell = tf.compat.v1.nn.rnn_cell.GRUCell(self.num_units, forget_bias=self.forget_bias) 41 | else: 42 | raise ValueError("Unknown unit type %s!" % self.rnn_type) 43 | dropout = self.dropout if tf.keras.backend.learning_phase() == 1 else 0 44 | try: 45 | single_cell = tf.nn.rnn_cell.DropoutWrapper(cell=single_cell, input_keep_prob=(1.0 - dropout)) 46 | except AttributeError: 47 | single_cell = tf.compat.v1.nn.rnn_cell.DropoutWrapper(cell=single_cell, input_keep_prob=(1.0 - dropout)) 48 | cell_list = [] 49 | for i in range(self.num_layers): 50 | residual = (i >= self.num_layers - self.num_residual_layers) 51 | if residual: 52 | try: 53 | single_cell_residual = tf.nn.rnn_cell.ResidualWrapper(single_cell) 54 | except AttributeError: 55 | single_cell_residual = tf.compat.v1.nn.rnn_cell.ResidualWrapper(single_cell) 56 | cell_list.append(single_cell_residual) 57 | else: 58 | cell_list.append(single_cell) 59 | if len(cell_list) == 1: 60 | self.final_cell = cell_list[0] 61 | else: 62 | try: 63 | self.final_cell = tf.nn.rnn_cell.MultiRNNCell(cell_list) 64 | except AttributeError: 65 | self.final_cell = tf.compat.v1.nn.rnn_cell.MultiRNNCell(cell_list) 66 | super(DynamicMultiRNN, self).build(input_shape) 67 | 68 | def call(self, input_list, mask=None, training=None): 69 | rnn_input, sequence_length = input_list 70 | 71 | try: 72 | with tf.name_scope("rnn"), tf.variable_scope("rnn", reuse=tf.AUTO_REUSE): 73 | rnn_output, hidden_state = tf.nn.dynamic_rnn(self.final_cell, inputs=rnn_input, 74 | sequence_length=tf.squeeze(sequence_length), 75 | dtype=tf.float32, scope=self.name) 76 | except AttributeError: 77 | with tf.name_scope("rnn"), tf.compat.v1.variable_scope("rnn", reuse=tf.compat.v1.AUTO_REUSE): 78 | rnn_output, hidden_state = tf.compat.v1.nn.dynamic_rnn(self.final_cell, inputs=rnn_input, 79 | sequence_length=tf.squeeze(sequence_length), 80 | dtype=tf.float32, scope=self.name) 81 | if self.return_sequence: 82 | return rnn_output 83 | else: 84 | return tf.expand_dims(hidden_state, axis=1) 85 | 86 | def compute_output_shape(self, input_shape): 87 | rnn_input_shape = input_shape[0] 88 | if self.return_sequence: 89 | return rnn_input_shape 90 | else: 91 | return (None, 1, rnn_input_shape[2]) 92 | 93 | def get_config(self, ): 94 | config = {'num_units': self.num_units, 'rnn_type': self.rnn_type, 'return_sequence': self.return_sequence, 95 | 'num_layers': self.num_layers, 96 | 'num_residual_layers': self.num_residual_layers, 'dropout_rate': self.dropout, 
'forget_bias':self.forget_bias} 97 | base_config = super(DynamicMultiRNN, self).get_config() 98 | return dict(list(base_config.items()) + list(config.items())) 99 | -------------------------------------------------------------------------------- /deepmatch/models/dssm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: 3 | Zhe Wang, 734914022@qq.com 4 | Weichen Shen, weichenswc@163.com 5 | 6 | Reference: 7 | Huang P S , He X , Gao J , et al. Learning deep structured semantic models for web search using clickthrough data[C]// Acm International Conference on Conference on Information & Knowledge Management. ACM, 2013. 8 | """ 9 | 10 | from deepctr.feature_column import build_input_features, create_embedding_matrix 11 | from deepctr.layers import PredictionLayer, DNN, combined_dnn_input 12 | from tensorflow.python.keras.models import Model 13 | 14 | from ..inputs import input_from_feature_columns 15 | from ..layers.core import InBatchSoftmaxLayer 16 | from ..utils import l2_normalize, inner_product 17 | 18 | 19 | def DSSM(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, 32), 20 | item_dnn_hidden_units=(64, 32), 21 | dnn_activation='relu', dnn_use_bn=False, 22 | l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, loss_type='softmax', temperature=0.05, 23 | sampler_config=None, 24 | seed=1024, ): 25 | """Instantiates the Deep Structured Semantic Model architecture. 26 | 27 | :param user_feature_columns: An iterable containing user's features used by the model. 28 | :param item_feature_columns: An iterable containing item's features used by the model. 29 | :param user_dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of user tower 30 | :param item_dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of item tower 31 | :param dnn_activation: Activation function to use in deep net 32 | :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in deep net 33 | :param l2_reg_dnn: float. L2 regularizer strength applied to DNN 34 | :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector 35 | :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate. 36 | :param loss_type: string. Loss type. 37 | :param temperature: float. Scaling factor. 38 | :param sampler_config: negative sample config. 39 | :param seed: integer ,to use as random seed. 40 | :return: A Keras model instance. 
41 | 42 | """ 43 | 44 | embedding_matrix_dict = create_embedding_matrix(user_feature_columns + item_feature_columns, l2_reg_embedding, 45 | seed=seed, 46 | seq_mask_zero=True) 47 | 48 | user_features = build_input_features(user_feature_columns) 49 | user_inputs_list = list(user_features.values()) 50 | user_sparse_embedding_list, user_dense_value_list = input_from_feature_columns(user_features, 51 | user_feature_columns, 52 | l2_reg_embedding, seed=seed, 53 | embedding_matrix_dict=embedding_matrix_dict) 54 | user_dnn_input = combined_dnn_input(user_sparse_embedding_list, user_dense_value_list) 55 | 56 | item_features = build_input_features(item_feature_columns) 57 | item_inputs_list = list(item_features.values()) 58 | item_sparse_embedding_list, item_dense_value_list = input_from_feature_columns(item_features, 59 | item_feature_columns, 60 | l2_reg_embedding, seed=seed, 61 | embedding_matrix_dict=embedding_matrix_dict) 62 | item_dnn_input = combined_dnn_input(item_sparse_embedding_list, item_dense_value_list) 63 | 64 | user_dnn_out = DNN(user_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, 65 | dnn_use_bn, output_activation='linear', seed=seed)(user_dnn_input) 66 | user_dnn_out = l2_normalize(user_dnn_out) 67 | 68 | if len(item_dnn_hidden_units) > 0: 69 | item_dnn_out = DNN(item_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, 70 | dnn_use_bn, output_activation='linear', seed=seed)(item_dnn_input) 71 | else: 72 | item_dnn_out = item_dnn_input 73 | item_dnn_out = l2_normalize(item_dnn_out) 74 | 75 | if loss_type == "logistic": 76 | score = inner_product(user_dnn_out, item_dnn_out, temperature) 77 | output = PredictionLayer("binary", False)(score) 78 | 79 | elif loss_type == "softmax": 80 | output = InBatchSoftmaxLayer(sampler_config._asdict(), temperature)( 81 | [user_dnn_out, item_dnn_out, item_features[sampler_config.item_name]]) 82 | else: 83 | raise ValueError(' `loss_type` must be `logistic` or `softmax` ') 84 | 85 | model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output) 86 | 87 | model.__setattr__("user_input", user_inputs_list) 88 | model.__setattr__("item_input", item_inputs_list) 89 | model.__setattr__("user_embedding", user_dnn_out) 90 | model.__setattr__("item_embedding", item_dnn_out) 91 | 92 | return model 93 | -------------------------------------------------------------------------------- /examples/run_dssm_negsampling.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from deepctr.feature_column import SparseFeat, VarLenSparseFeat 3 | from deepmatch.models import * 4 | from preprocess import gen_data_set, gen_model_input 5 | from sklearn.preprocessing import LabelEncoder 6 | from tensorflow.python.keras.models import Model 7 | 8 | if __name__ == "__main__": 9 | 10 | data = pd.read_csv("./movielens_sample.txt") 11 | sparse_features = ["movie_id", "user_id", 12 | "gender", "age", "occupation", "zip", "genres"] 13 | SEQ_LEN = 50 14 | negsample = 10 15 | 16 | # 1. Label Encoding for sparse features, and process sequence features with `gen_data_set` and `gen_model_input` 17 | 18 | feature_max_idx = {} 19 | for feature in sparse_features: 20 | lbe = LabelEncoder() 21 | data[feature] = lbe.fit_transform(data[feature]) + 1 22 | feature_max_idx[feature] = data[feature].max() + 1 23 | 24 | user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id') 25 | 26 | item_profile = data[["movie_id", "genres"]].drop_duplicates('movie_id') 27
| 28 | user_profile.set_index("user_id", inplace=True) 29 | 30 | user_item_list = data.groupby("user_id")['movie_id'].apply(list) 31 | 32 | train_set, test_set = gen_data_set(data, SEQ_LEN, negsample) 33 | 34 | train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN) 35 | test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN) 36 | 37 | # 2.count #unique features for each sparse field and generate feature config for sequence feature 38 | 39 | embedding_dim = 32 40 | 41 | user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16), 42 | SparseFeat("gender", feature_max_idx['gender'], 16), 43 | SparseFeat("age", feature_max_idx['age'], 16), 44 | SparseFeat("occupation", feature_max_idx['occupation'], 16), 45 | SparseFeat("zip", feature_max_idx['zip'], 16), 46 | VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim, 47 | embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'), 48 | VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim, 49 | embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'), 50 | ] 51 | 52 | item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim), 53 | SparseFeat('genres', feature_max_idx['genres'], embedding_dim) 54 | ] 55 | 56 | # 3.Define Model and train 57 | 58 | import tensorflow as tf 59 | 60 | if tf.__version__ >= '2.0.0': 61 | tf.compat.v1.disable_eager_execution() 62 | else: 63 | K.set_learning_phase(True) 64 | 65 | model = DSSM(user_feature_columns, item_feature_columns, loss_type="logistic") 66 | # model = FM(user_feature_columns,item_feature_columns) 67 | 68 | model.compile(optimizer='adagrad', loss="binary_crossentropy") 69 | 70 | history = model.fit(train_model_input, train_label, 71 | batch_size=256, epochs=1, verbose=1, validation_split=0.0, ) 72 | 73 | # 4. Generate user features for testing and full item features for retrieval 74 | test_user_model_input = test_model_input 75 | all_item_model_input = {"movie_id": item_profile['movie_id'].values, "genres": item_profile['genres'].values} 76 | 77 | user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding) 78 | item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding) 79 | 80 | user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12) 81 | item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12) 82 | 83 | print(user_embs.shape) 84 | print(item_embs.shape) 85 | 86 | # 5. 
[Optional] ANN search by faiss and evaluate the result 87 | 88 | # test_true_label = {line[0]:[line[1]] for line in test_set} 89 | # 90 | # import numpy as np 91 | # import faiss 92 | # from tqdm import tqdm 93 | # from deepmatch.utils import recall_N 94 | # 95 | # index = faiss.IndexFlatIP(embedding_dim) 96 | # # faiss.normalize_L2(item_embs) 97 | # index.add(item_embs) 98 | # # faiss.normalize_L2(user_embs) 99 | # D, I = index.search(user_embs, 50) 100 | # s = [] 101 | # hit = 0 102 | # for i, uid in tqdm(enumerate(test_user_model_input['user_id'])): 103 | # try: 104 | # pred = [item_profile['movie_id'].values[x] for x in I[i]] 105 | # filter_item = None 106 | # recall_score = recall_N(test_true_label[uid], pred, N=50) 107 | # s.append(recall_score) 108 | # if test_true_label[uid] in pred: 109 | # hit += 1 110 | # except: 111 | # print(i) 112 | # print("recall", np.mean(s)) 113 | # print("hr", hit / len(test_user_model_input['user_id'])) 114 | -------------------------------------------------------------------------------- /examples/run_dssm_inbatchsoftmax.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from deepctr.feature_column import SparseFeat, VarLenSparseFeat 3 | from deepmatch.models import * 4 | from deepmatch.utils import sampledsoftmaxloss, NegativeSampler 5 | from preprocess import gen_data_set, gen_model_input 6 | from sklearn.preprocessing import LabelEncoder 7 | from tensorflow.python.keras.models import Model 8 | 9 | if __name__ == "__main__": 10 | 11 | data = pd.read_csv("./movielens_sample.txt") 12 | sparse_features = ["movie_id", "user_id", 13 | "gender", "age", "occupation", "zip", "genres"] 14 | SEQ_LEN = 50 15 | negsample = 10 16 | 17 | # 1. Label Encoding for sparse features, and process sequence features with `gen_data_set` and `gen_model_input` 18 | 19 | feature_max_idx = {} 20 | for feature in sparse_features: 21 | lbe = LabelEncoder() 22 | data[feature] = lbe.fit_transform(data[feature]) + 1 23 | feature_max_idx[feature] = data[feature].max() + 1 24 | 25 | user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id') 26 | 27 | item_profile = data[["movie_id", "genres"]].drop_duplicates('movie_id') 28 | 29 | user_profile.set_index("user_id", inplace=True) 30 | 31 | user_item_list = data.groupby("user_id")['movie_id'].apply(list) 32 | 33 | train_set, test_set = gen_data_set(data, SEQ_LEN, negsample) 34 | 35 | train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN) 36 | test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN) 37 | 38 | # 2.count #unique features for each sparse field and generate feature config for sequence feature 39 | 40 | embedding_dim = 32 41 | 42 | user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim), 43 | SparseFeat("gender", feature_max_idx['gender'], embedding_dim), 44 | SparseFeat("age", feature_max_idx['age'], embedding_dim), 45 | SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim), 46 | SparseFeat("zip", feature_max_idx['zip'], embedding_dim), 47 | VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim, 48 | embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'), 49 | VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim, 50 | embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'), 51 | ] 52 | 53 | item_feature_columns = [SparseFeat('movie_id',
feature_max_idx['movie_id'], embedding_dim), 54 | SparseFeat('genres', feature_max_idx['genres'], embedding_dim) 55 | ] 56 | 57 | from collections import Counter 58 | 59 | train_counter = Counter(train_model_input['movie_id']) 60 | item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)] 61 | sampler_config = NegativeSampler('inbatch', num_sampled=5, item_name='movie_id', item_count=item_count) 62 | 63 | # 3.Define Model and train 64 | 65 | import tensorflow as tf 66 | 67 | if tf.__version__ >= '2.0.0': 68 | tf.compat.v1.disable_eager_execution() 69 | else: 70 | K.set_learning_phase(True) 71 | 72 | model = DSSM(user_feature_columns, item_feature_columns, loss_type="softmax", sampler_config=sampler_config) 73 | # model = FM(user_feature_columns, item_feature_columns, loss_type="softmax", sampler_config=sampler_config) 74 | 75 | model.compile(optimizer='adagrad', loss=sampledsoftmaxloss) 76 | 77 | history = model.fit(train_model_input, train_label, 78 | batch_size=256, epochs=1, verbose=1, validation_split=0.0, ) 79 | 80 | # 4. Generate user features for testing and full item features for retrieval 81 | test_user_model_input = test_model_input 82 | all_item_model_input = {"movie_id": item_profile['movie_id'].values, "genres": item_profile['genres'].values} 83 | 84 | user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding) 85 | item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding) 86 | 87 | user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12) 88 | item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12) 89 | 90 | print(user_embs.shape) 91 | print(item_embs.shape) 92 | 93 | # 5. [Optional] ANN search by faiss and evaluate the result 94 | 95 | # test_true_label = {line[0]:[line[1]] for line in test_set} 96 | # 97 | # import numpy as np 98 | # import faiss 99 | # from tqdm import tqdm 100 | # from deepmatch.utils import recall_N 101 | # 102 | # index = faiss.IndexFlatIP(embedding_dim) 103 | # # faiss.normalize_L2(item_embs) 104 | # index.add(item_embs) 105 | # # faiss.normalize_L2(user_embs) 106 | # D, I = index.search(user_embs, 50) 107 | # s = [] 108 | # hit = 0 109 | # for i, uid in tqdm(enumerate(test_user_model_input['user_id'])): 110 | # try: 111 | # pred = [item_profile['movie_id'].values[x] for x in I[i]] 112 | # filter_item = None 113 | # recall_score = recall_N(test_true_label[uid], pred, N=50) 114 | # s.append(recall_score) 115 | # if test_true_label[uid] in pred: 116 | # hit += 1 117 | # except: 118 | # print(i) 119 | # print("recall", np.mean(s)) 120 | # print("hr", hit / len(test_user_model_input['user_id'])) 121 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
14 | # 15 | import os 16 | import sys 17 | sys.path.insert(0, os.path.abspath('../../')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'DeepMatch' 23 | copyright = '2020-present, Weichen Shen' 24 | author = 'Weichen Shen' 25 | 26 | # The short X.Y version 27 | version = '' 28 | # The full version, including alpha/beta/rc tags 29 | release = '0.3.1' 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | 'sphinx.ext.autodoc', 43 | 'sphinx.ext.mathjax', 44 | 'sphinx.ext.ifconfig', 45 | 'sphinx.ext.viewcode', 46 | 'sphinx.ext.githubpages', 47 | ] 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ['_templates'] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | # 55 | # source_suffix = ['.rst', '.md'] 56 | source_suffix = '.rst' 57 | 58 | # The master toctree document. 59 | master_doc = 'index' 60 | 61 | # The language for content autogenerated by Sphinx. Refer to documentation 62 | # for a list of supported languages. 63 | # 64 | # This is also used if you do content translation via gettext catalogs. 65 | # Usually you set "language" from the command line for these cases. 66 | language = None 67 | 68 | # List of patterns, relative to source directory, that match files and 69 | # directories to ignore when looking for source files. 70 | # This pattern also affects html_static_path and html_extra_path . 71 | exclude_patterns = [] 72 | 73 | # The name of the Pygments (syntax highlighting) style to use. 74 | pygments_style = 'sphinx' 75 | 76 | 77 | # -- Options for HTML output ------------------------------------------------- 78 | 79 | # The theme to use for HTML and HTML Help pages. See the documentation for 80 | # a list of builtin themes. 81 | # 82 | html_theme = 'alabaster' 83 | 84 | # Theme options are theme-specific and customize the look and feel of a theme 85 | # further. For a list of options available for each theme, see the 86 | # documentation. 87 | # 88 | # html_theme_options = {} 89 | 90 | # Add any paths that contain custom static files (such as style sheets) here, 91 | # relative to this directory. They are copied after the builtin static files, 92 | # so a file named "default.css" will overwrite the builtin "default.css". 93 | html_static_path = ['_static'] 94 | 95 | # Custom sidebar templates, must be a dictionary that maps document names 96 | # to template names. 97 | # 98 | # The default sidebars (for documents that don't match any pattern) are 99 | # defined by theme itself. Builtin themes are using these templates by 100 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 101 | # 'searchbox.html']``. 102 | # 103 | # html_sidebars = {} 104 | 105 | 106 | # -- Options for HTMLHelp output --------------------------------------------- 107 | 108 | # Output file base name for HTML help builder. 109 | htmlhelp_basename = 'DeepMatchdoc' 110 | 111 | 112 | # -- Options for LaTeX output ------------------------------------------------ 113 | 114 | latex_elements = { 115 | # The paper size ('letterpaper' or 'a4paper'). 
116 | # 117 | # 'papersize': 'letterpaper', 118 | 119 | # The font size ('10pt', '11pt' or '12pt'). 120 | # 121 | # 'pointsize': '10pt', 122 | 123 | # Additional stuff for the LaTeX preamble. 124 | # 125 | # 'preamble': '', 126 | 127 | # Latex figure (float) alignment 128 | # 129 | # 'figure_align': 'htbp', 130 | } 131 | 132 | # Grouping the document tree into LaTeX files. List of tuples 133 | # (source start file, target name, title, 134 | # author, documentclass [howto, manual, or own class]). 135 | latex_documents = [ 136 | (master_doc, 'DeepMatch.tex', 'DeepMatch Documentation', 137 | 'Weichen Shen', 'manual'), 138 | ] 139 | 140 | 141 | # -- Options for manual page output ------------------------------------------ 142 | 143 | # One entry per manual page. List of tuples 144 | # (source start file, name, description, authors, manual section). 145 | man_pages = [ 146 | (master_doc, 'deepmatch', 'DeepMatch Documentation', 147 | [author], 1) 148 | ] 149 | 150 | 151 | # -- Options for Texinfo output ---------------------------------------------- 152 | 153 | # Grouping the document tree into Texinfo files. List of tuples 154 | # (source start file, target name, title, author, 155 | # dir menu entry, description, category) 156 | texinfo_documents = [ 157 | (master_doc, 'DeepMatch', 'DeepMatch Documentation', 158 | author, 'DeepMatch', 'One line description of project.', 159 | 'Miscellaneous'), 160 | ] 161 | 162 | 163 | # -- Extension configuration ------------------------------------------------- 164 | todo_include_todos = False 165 | html_theme = 'sphinx_rtd_theme' 166 | 167 | source_parsers = { 168 | '.md': 'recommonmark.parser.CommonMarkParser', 169 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepMatch 2 | 3 | [![Python Versions](https://img.shields.io/pypi/pyversions/deepmatch.svg)](https://pypi.org/project/deepmatch) 4 | [![TensorFlow Versions](https://img.shields.io/badge/TensorFlow-1.9+/2.0+-blue.svg)](https://pypi.org/project/deepmatch) 5 | [![Downloads](https://pepy.tech/badge/deepmatch)](https://pepy.tech/project/deepmatch) 6 | [![PyPI Version](https://img.shields.io/pypi/v/deepmatch.svg)](https://pypi.org/project/deepmatch) 7 | [![GitHub Issues](https://img.shields.io/github/issues/shenweichen/deepmatch.svg 8 | )](https://github.com/shenweichen/deepmatch/issues) 9 | 10 | 11 | 12 | [![Documentation Status](https://readthedocs.org/projects/deepmatch/badge/?version=latest)](https://deepmatch.readthedocs.io/) 13 | ![CI status](https://github.com/shenweichen/deepmatch/workflows/CI/badge.svg) 14 | [![codecov](https://codecov.io/gh/shenweichen/DeepMatch/branch/master/graph/badge.svg)](https://codecov.io/gh/shenweichen/DeepMatch) 15 | [![Codacy Badge](https://app.codacy.com/project/badge/Grade/c5a2769ec35444d8958f6b58ff85029b)](https://www.codacy.com/gh/shenweichen/DeepMatch/dashboard?utm_source=github.com&utm_medium=referral&utm_content=shenweichen/DeepMatch&utm_campaign=Badge_Grade) 16 | [![Disscussion](https://img.shields.io/badge/chat-wechat-brightgreen?style=flat)](https://github.com/shenweichen/DeepMatch#disscussiongroup) 17 | [![License](https://img.shields.io/github/license/shenweichen/deepmatch.svg)](https://github.com/shenweichen/deepmatch/blob/master/LICENSE) 18 | 19 | DeepMatch is a deep matching model library for recommendations & advertising. 
It's easy to **train models** and to **export representation vectors** of users and items, which can be used for **ANN search**. You can use any complex model with `model.fit()` and `model.predict()`. 20 | 21 | Let's [**Get Started!**](https://deepmatch.readthedocs.io/en/latest/Quick-Start.html) or [**Run examples**](./examples/colab_MovieLen1M_YoutubeDNN.ipynb)! 22 | 23 | 24 | 25 | ## Models List 26 | 27 | | Model | Paper | 28 | | :------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------- | 29 | | FM | [ICDM 2010][Factorization Machines](https://www.researchgate.net/publication/220766482_Factorization_Machines) | 30 | | DSSM | [CIKM 2013][Deep Structured Semantic Models for Web Search using Clickthrough Data](https://www.microsoft.com/en-us/research/publication/learning-deep-structured-semantic-models-for-web-search-using-clickthrough-data/) | 31 | | YoutubeDNN | [RecSys 2016][Deep Neural Networks for YouTube Recommendations](https://www.researchgate.net/publication/307573656_Deep_Neural_Networks_for_YouTube_Recommendations) | 32 | | NCF | [WWW 2017][Neural Collaborative Filtering](https://arxiv.org/abs/1708.05031) | 33 | | SDM | [CIKM 2019][SDM: Sequential Deep Matching Model for Online Large-scale Recommender System](https://arxiv.org/abs/1909.00385) | 34 | | MIND | [CIKM 2019][Multi-interest network with dynamic routing for recommendation at Tmall](https://arxiv.org/pdf/1904.08030) | 35 | | COMIREC | [KDD 2020][Controllable Multi-Interest Framework for Recommendation](https://arxiv.org/pdf/2005.09347.pdf) | 36 | 37 | ## Contributors([welcome to join us!](./CONTRIBUTING.md)) 38 | 39 | 40 | 41 | 42 |
43 | ​ pic
44 | ​ Shen Weichen ​ 45 |

46 | Alibaba Group

​ 47 |
49 | pic
50 | Wang Zhe ​ 51 |

Baidu Inc.

​ 52 |
54 | ​ pic
55 | ​ Chen Leihui ​ 56 |

57 | Alibaba Group

​ 58 |
60 | ​ pic
61 | LeoCai 62 |

ByteDance

​ 63 |
65 | ​ pic
66 | ​ Li Yuan 67 |

Tencent

​ 68 |
70 | ​ pic
71 | ​ Yang Jieyu 72 |

Ant Group

​ 73 |
75 | ​ pic
76 | ​ Meng Yifan 77 |

DeepCTR

​ 78 |
82 | 83 | ## DisscussionGroup 84 | 85 | - [Github Discussions](https://github.com/shenweichen/DeepMatch/discussions) 86 | - WeChat Discussions 87 | 88 | |WeChat Official Account: 浅梦学习笔记|WeChat: deepctrbot|Study Group [Join](https://t.zsxq.com/026UJEuzv) [Topics](https://mp.weixin.qq.com/mp/appmsgalbum?__biz=MjM5MzY4NzE3MA==&action=getalbum&album_id=1361647041096843265&scene=126#wechat_redirect)| 89 | |:--:|:--:|:--:| 90 | | [![WeChat Official Account](./docs/pics/code.png)](https://github.com/shenweichen/AlgoNotes)| [![WeChat](./docs/pics/deepctrbot.png)](https://github.com/shenweichen/AlgoNotes)|[![Study Group](./docs/pics/planet_github.png)](https://t.zsxq.com/026UJEuzv)| 91 | 92 | -------------------------------------------------------------------------------- /examples/run_youtubednn.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from deepctr.feature_column import SparseFeat, VarLenSparseFeat 3 | from deepmatch.models import * 4 | from deepmatch.utils import sampledsoftmaxloss, NegativeSampler 5 | from preprocess import gen_data_set, gen_model_input 6 | from sklearn.preprocessing import LabelEncoder 7 | from tensorflow.python.keras import backend as K 8 | from tensorflow.python.keras.models import Model 9 | 10 | if __name__ == "__main__": 11 | 12 | data = pd.read_csv("./movielens_sample.txt") 13 | data['genres'] = list(map(lambda x: x.split('|')[0], data['genres'].values)) 14 | 15 | sparse_features = ["movie_id", "user_id", 16 | "gender", "age", "occupation", "zip", "genres"] 17 | SEQ_LEN = 50 18 | 19 | # 1. Label Encoding for sparse features, and process sequence features with `gen_data_set` and `gen_model_input` 20 | 21 | feature_max_idx = {} 22 | for feature in sparse_features: 23 | lbe = LabelEncoder() 24 | data[feature] = lbe.fit_transform(data[feature]) + 1 25 | feature_max_idx[feature] = data[feature].max() + 1 26 | 27 | user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id') 28 | 29 | item_profile = data[["movie_id"]].drop_duplicates('movie_id') 30 | 31 | user_profile.set_index("user_id", inplace=True) 32 | 33 | user_item_list = data.groupby("user_id")['movie_id'].apply(list) 34 | 35 | train_set, test_set = gen_data_set(data, SEQ_LEN, 0) 36 | 37 | train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN) 38 | test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN) 39 | 40 | # 2.count #unique features for each sparse field and generate feature config for sequence feature 41 | 42 | embedding_dim = 16 43 | 44 | user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim), 45 | SparseFeat("gender", feature_max_idx['gender'], embedding_dim), 46 | SparseFeat("age", feature_max_idx['age'], embedding_dim), 47 | SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim), 48 | SparseFeat("zip", feature_max_idx['zip'], embedding_dim), 49 | VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim, 50 | embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'), 51 | VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim, 52 | embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len') 53 | ] 54 | 55 | item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)] 56 | 57 | from collections import Counter 58 | 59 | train_counter = Counter(train_model_input['movie_id']) 60 | item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)]
61 | sampler_config = NegativeSampler('frequency', num_sampled=5, item_name='movie_id', item_count=item_count) 62 | 63 | # 3.Define Model and train 64 | 65 | import tensorflow as tf 66 | 67 | if tf.__version__ >= '2.0.0': 68 | tf.compat.v1.disable_eager_execution() 69 | else: 70 | K.set_learning_phase(True) 71 | 72 | model = YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, embedding_dim), 73 | sampler_config=sampler_config) 74 | # model = MIND(user_feature_columns, item_feature_columns, dynamic_k=False, k_max=2, 75 | # user_dnn_hidden_units=(64, embedding_dim), sampler_config=sampler_config) 76 | 77 | model.compile(optimizer="adam", loss=sampledsoftmaxloss) 78 | 79 | history = model.fit(train_model_input, train_label, # train_label, 80 | batch_size=256, epochs=1, verbose=1, validation_split=0.0, ) 81 | 82 | # 4. Generate user features for testing and full item features for retrieval 83 | test_user_model_input = test_model_input 84 | all_item_model_input = {"movie_id": item_profile['movie_id'].values} 85 | 86 | user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding) 87 | item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding) 88 | 89 | user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12) 90 | item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12) 91 | 92 | print(user_embs.shape) 93 | print(item_embs.shape) 94 | 95 | # 5. [Optional] ANN search by faiss and evaluate the result 96 | 97 | # import heapq 98 | # from collections import defaultdict 99 | # from tqdm import tqdm 100 | # import numpy as np 101 | # import faiss 102 | # from deepmatch.utils import recall_N 103 | # 104 | # k_max = 2 105 | # topN = 50 106 | # test_true_label = {line[0]: [line[1]] for line in test_set} 107 | # 108 | # index = faiss.IndexFlatIP(embedding_dim) 109 | # # faiss.normalize_L2(item_embs) 110 | # index.add(item_embs) 111 | # # faiss.normalize_L2(user_embs) 112 | # 113 | # if len(user_embs.shape) == 2: # multi interests model's shape = 3 (MIND,ComiRec) 114 | # user_embs = np.expand_dims(user_embs, axis=1) 115 | # 116 | # score_dict = defaultdict(dict) 117 | # for k in range(k_max): 118 | # user_emb = user_embs[:, k, :] 119 | # D, I = index.search(np.ascontiguousarray(user_emb), topN) 120 | # for i, uid in tqdm(enumerate(test_user_model_input['user_id']), total=len(test_user_model_input['user_id'])): 121 | # if np.abs(user_emb[i]).max() < 1e-8: 122 | # continue 123 | # for score, itemid in zip(D[i], I[i]): 124 | # score_dict[uid][itemid] = max(score, score_dict[uid].get(itemid, float("-inf"))) 125 | # 126 | # s = [] 127 | # hit = 0 128 | # for i, uid in enumerate(test_user_model_input['user_id']): 129 | # pred = [item_profile['movie_id'].values[x[0]] for x in 130 | # heapq.nlargest(topN, score_dict[uid].items(), key=lambda x: x[1])] 131 | # filter_item = None 132 | # recall_score = recall_N(test_true_label[uid], pred, N=topN) 133 | # s.append(recall_score) 134 | # if test_true_label[uid] in pred: 135 | # hit += 1 136 | # 137 | # print("recall", np.mean(s)) 138 | # print("hr", hit / len(test_user_model_input['user_id'])) 139 | -------------------------------------------------------------------------------- /deepmatch/models/ncf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: 3 | Jieyu Yang , yangjieyu@zju.edu.cn 4 | 5 | Reference: 6 | He X, Liao L, Zhang H, et al. 
Neural collaborative filtering[C]//Proceedings of the 26th international conference on world wide web. 2017: 173-182. 7 | """ 8 | 9 | import math 10 | 11 | from deepctr.feature_column import input_from_feature_columns, build_input_features, SparseFeat 12 | from deepctr.layers import DNN, combined_dnn_input 13 | from tensorflow.python.keras.layers import Lambda, Concatenate, Multiply 14 | from tensorflow.python.keras.models import Model 15 | 16 | 17 | def NCF(user_feature_columns, item_feature_columns, user_gmf_embedding_dim=20, item_gmf_embedding_dim=20, 18 | user_mlp_embedding_dim=20, item_mlp_embedding_dim=20, dnn_use_bn=False, 19 | dnn_hidden_units=(64, 32), dnn_activation='relu', l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, 20 | seed=1024): 21 | """Instantiates the NCF Model architecture. 22 | 23 | :param user_feature_columns: A dict containing user's features and features'dim. 24 | :param item_feature_columns: A dict containing item's features and features'dim. 25 | :param user_gmf_embedding_dim: int. 26 | :param item_gmf_embedding_dim: int. 27 | :param user_mlp_embedding_dim: int. 28 | :param item_mlp_embedding_dim: int. 29 | :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in deep net 30 | :param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of deep net 31 | :param dnn_activation: Activation function to use in deep net 32 | :param l2_reg_dnn: float. L2 regularizer strength applied to DNN 33 | :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector 34 | :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate. 35 | :param seed: integer ,to use as random seed. 36 | :return: A Keras model instance. 
37 | 38 | """ 39 | 40 | user_dim = len(user_feature_columns) * user_gmf_embedding_dim 41 | item_dim = len(item_feature_columns) * item_gmf_embedding_dim 42 | dim = (user_dim * item_dim) / (math.gcd(user_dim, item_dim)) 43 | user_gmf_embedding_dim = int(dim / len(user_feature_columns)) 44 | item_gmf_embedding_dim = int(dim / len(item_feature_columns)) 45 | 46 | # Generalized Matrix Factorization (GMF) Part 47 | user_gmf_feature_columns = [SparseFeat(feat, vocabulary_size=size, embedding_dim=user_gmf_embedding_dim) 48 | for feat, size in user_feature_columns.items()] 49 | user_features = build_input_features(user_gmf_feature_columns) 50 | user_inputs_list = list(user_features.values()) 51 | user_gmf_sparse_embedding_list, _ = input_from_feature_columns(user_features, 52 | user_gmf_feature_columns, 53 | l2_reg_embedding, seed=seed, 54 | prefix='gmf_') 55 | user_gmf_input = combined_dnn_input(user_gmf_sparse_embedding_list, []) 56 | user_gmf_out = Lambda(lambda x: x, name="user_gmf_embedding")(user_gmf_input) 57 | 58 | item_gmf_feature_columns = [SparseFeat(feat, vocabulary_size=size, embedding_dim=item_gmf_embedding_dim) 59 | for feat, size in item_feature_columns.items()] 60 | item_features = build_input_features(item_gmf_feature_columns) 61 | item_inputs_list = list(item_features.values()) 62 | item_gmf_sparse_embedding_list, _ = input_from_feature_columns(item_features, 63 | item_gmf_feature_columns, 64 | l2_reg_embedding, seed=seed, 65 | prefix='gmf_') 66 | item_gmf_input = combined_dnn_input(item_gmf_sparse_embedding_list, []) 67 | item_gmf_out = Lambda(lambda x: x, name="item_gmf_embedding")(item_gmf_input) 68 | 69 | gmf_out = Multiply()([user_gmf_out, item_gmf_out]) 70 | 71 | # Multi-Layer Perceptron (MLP) Part 72 | user_mlp_feature_columns = [SparseFeat(feat, vocabulary_size=size, embedding_dim=user_mlp_embedding_dim) 73 | for feat, size in user_feature_columns.items()] 74 | user_mlp_sparse_embedding_list, user_mlp_dense_value_list = input_from_feature_columns(user_features, 75 | user_mlp_feature_columns, 76 | l2_reg_embedding, seed=seed, 77 | prefix='mlp_') 78 | user_mlp_input = combined_dnn_input( 79 | user_mlp_sparse_embedding_list, user_mlp_dense_value_list) 80 | user_mlp_out = Lambda(lambda x: x, name="user_mlp_embedding")(user_mlp_input) 81 | 82 | item_mlp_feature_columns = [SparseFeat(feat, vocabulary_size=size, embedding_dim=item_mlp_embedding_dim) 83 | for feat, size in item_feature_columns.items()] 84 | 85 | item_mlp_sparse_embedding_list, item_mlp_dense_value_list = input_from_feature_columns(item_features, 86 | item_mlp_feature_columns, 87 | l2_reg_embedding, seed=seed, 88 | prefix='mlp_') 89 | item_mlp_input = combined_dnn_input( 90 | item_mlp_sparse_embedding_list, item_mlp_dense_value_list) 91 | item_mlp_out = Lambda(lambda x: x, name="item_mlp_embedding")(item_mlp_input) 92 | 93 | mlp_input = Concatenate(axis=1)([user_mlp_out, item_mlp_out]) 94 | mlp_out = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, 95 | dnn_use_bn, seed=seed, name="mlp_embedding")(mlp_input) 96 | 97 | # Fusion of GMF and MLP 98 | neumf_input = Concatenate(axis=1)([gmf_out, mlp_out]) 99 | neumf_out = DNN(hidden_units=[1], activation='sigmoid', seed=seed)(neumf_input) 100 | output = Lambda(lambda x: x, name='neumf_out')(neumf_out) 101 | 102 | # output = PredictionLayer(task, False)(neumf_out) 103 | 104 | model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output) 105 | 106 | return model 107 | 
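# A minimal usage sketch (not part of the library source): unlike the other
# models in this package, NCF takes plain dicts that map each feature name to
# its vocabulary size. The feature names and sizes below are illustrative
# placeholders only.
#
#   user_feature_columns = {"user_id": 1000, "gender": 3}
#   item_feature_columns = {"movie_id": 2000}
#   model = NCF(user_feature_columns, item_feature_columns,
#               user_gmf_embedding_dim=20, item_gmf_embedding_dim=20,
#               user_mlp_embedding_dim=20, item_mlp_embedding_dim=20,
#               dnn_hidden_units=(64, 32))
#   model.compile(optimizer="adam", loss="binary_crossentropy")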
-------------------------------------------------------------------------------- /examples/preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from tensorflow.python.keras.preprocessing.sequence import pad_sequences 4 | from tqdm import tqdm 5 | 6 | 7 | def gen_data_set(data, seq_max_len=50, negsample=0): 8 | data.sort_values("timestamp", inplace=True) 9 | item_ids = data['movie_id'].unique() 10 | item_id_genres_map = dict(zip(data['movie_id'].values, data['genres'].values)) 11 | train_set = [] 12 | test_set = [] 13 | for reviewerID, hist in tqdm(data.groupby('user_id')): 14 | pos_list = hist['movie_id'].tolist() 15 | genres_list = hist['genres'].tolist() 16 | rating_list = hist['rating'].tolist() 17 | 18 | if negsample > 0: 19 | candidate_set = list(set(item_ids) - set(pos_list)) 20 | neg_list = np.random.choice(candidate_set, size=len(pos_list) * negsample, replace=True) 21 | for i in range(1, len(pos_list)): 22 | hist = pos_list[:i] 23 | genres_hist = genres_list[:i] 24 | seq_len = min(i, seq_max_len) 25 | if i != len(pos_list) - 1: 26 | train_set.append(( 27 | reviewerID, pos_list[i], 1, hist[::-1][:seq_len], seq_len, genres_hist[::-1][:seq_len], 28 | genres_list[i], 29 | rating_list[i])) 30 | for negi in range(negsample): 31 | train_set.append((reviewerID, neg_list[i * negsample + negi], 0, hist[::-1][:seq_len], seq_len, 32 | genres_hist[::-1][:seq_len], item_id_genres_map[neg_list[i * negsample + negi]])) 33 | else: 34 | test_set.append((reviewerID, pos_list[i], 1, hist[::-1][:seq_len], seq_len, genres_hist[::-1][:seq_len], 35 | genres_list[i], 36 | rating_list[i])) 37 | 38 | random.shuffle(train_set) 39 | random.shuffle(test_set) 40 | 41 | print(len(train_set[0]), len(test_set[0])) 42 | 43 | return train_set, test_set 44 | 45 | 46 | def gen_data_set_sdm(data, seq_short_max_len=5, seq_prefer_max_len=50): 47 | data.sort_values("timestamp", inplace=True) 48 | train_set = [] 49 | test_set = [] 50 | for reviewerID, hist in tqdm(data.groupby('user_id')): 51 | pos_list = hist['movie_id'].tolist() 52 | genres_list = hist['genres'].tolist() 53 | rating_list = hist['rating'].tolist() 54 | for i in range(1, len(pos_list)): 55 | hist = pos_list[:i] 56 | genres_hist = genres_list[:i] 57 | seq_short_len = min(i, seq_short_max_len) 58 | seq_prefer_len = min(max(i - seq_short_len, 0), seq_prefer_max_len) 59 | if i != len(pos_list) - 1: 60 | train_set.append( 61 | (reviewerID, pos_list[i], 1, hist[::-1][:seq_short_len][::-1], 62 | hist[::-1][seq_short_len:seq_short_len + seq_prefer_len], seq_short_len, 63 | seq_prefer_len, genres_hist[::-1][:seq_short_len][::-1], 64 | genres_hist[::-1][seq_short_len:seq_short_len + seq_prefer_len], rating_list[i])) 65 | else: 66 | test_set.append( 67 | (reviewerID, pos_list[i], 1, hist[::-1][:seq_short_len][::-1], 68 | hist[::-1][seq_short_len:seq_short_len + seq_prefer_len], seq_short_len, 69 | seq_prefer_len, genres_hist[::-1][:seq_short_len][::-1], 70 | genres_hist[::-1][seq_short_len:seq_short_len + seq_prefer_len], rating_list[i])) 71 | 72 | random.shuffle(train_set) 73 | random.shuffle(test_set) 74 | 75 | print(len(train_set[0]), len(test_set[0])) 76 | 77 | return train_set, test_set 78 | 79 | 80 | def gen_model_input(train_set, user_profile, seq_max_len): 81 | train_uid = np.array([line[0] for line in train_set]) 82 | train_iid = np.array([line[1] for line in train_set]) 83 | train_label = np.array([line[2] for line in train_set]) 84 | train_seq = [line[3] for line in 
train_set] 85 | train_hist_len = np.array([line[4] for line in train_set]) 86 | train_seq_genres = np.array([line[5] for line in train_set]) 87 | train_genres = np.array([line[6] for line in train_set]) 88 | train_seq_pad = pad_sequences(train_seq, maxlen=seq_max_len, padding='post', truncating='post', value=0) 89 | train_seq_genres_pad = pad_sequences(train_seq_genres, maxlen=seq_max_len, padding='post', truncating='post', 90 | value=0) 91 | train_model_input = {"user_id": train_uid, "movie_id": train_iid, "hist_movie_id": train_seq_pad, 92 | "hist_genres": train_seq_genres_pad, 93 | "hist_len": train_hist_len, "genres": train_genres} 94 | 95 | for key in ["gender", "age", "occupation", "zip"]: 96 | train_model_input[key] = user_profile.loc[train_model_input['user_id']][key].values 97 | 98 | return train_model_input, train_label 99 | 100 | 101 | def gen_model_input_sdm(train_set, user_profile, seq_short_max_len, seq_prefer_max_len): 102 | train_uid = np.array([line[0] for line in train_set]) 103 | train_iid = np.array([line[1] for line in train_set]) 104 | train_label = np.array([line[2] for line in train_set]) 105 | short_train_seq = [line[3] for line in train_set] 106 | prefer_train_seq = [line[4] for line in train_set] 107 | train_short_len = np.array([line[5] for line in train_set]) 108 | train_prefer_len = np.array([line[6] for line in train_set]) 109 | short_train_seq_genres = np.array([line[7] for line in train_set]) 110 | prefer_train_seq_genres = np.array([line[8] for line in train_set]) 111 | 112 | train_short_item_pad = pad_sequences(short_train_seq, maxlen=seq_short_max_len, padding='post', truncating='post', 113 | value=0) 114 | train_prefer_item_pad = pad_sequences(prefer_train_seq, maxlen=seq_prefer_max_len, padding='post', 115 | truncating='post', 116 | value=0) 117 | train_short_genres_pad = pad_sequences(short_train_seq_genres, maxlen=seq_short_max_len, padding='post', 118 | truncating='post', 119 | value=0) 120 | train_prefer_genres_pad = pad_sequences(prefer_train_seq_genres, maxlen=seq_prefer_max_len, padding='post', 121 | truncating='post', 122 | value=0) 123 | 124 | train_model_input = {"user_id": train_uid, "movie_id": train_iid, "short_movie_id": train_short_item_pad, 125 | "prefer_movie_id": train_prefer_item_pad, 126 | "prefer_sess_length": train_prefer_len, 127 | "short_sess_length": train_short_len, 'short_genres': train_short_genres_pad, 128 | 'prefer_genres': train_prefer_genres_pad} 129 | 130 | for key in ["gender", "age", "occupation", "zip"]: 131 | train_model_input[key] = user_profile.loc[train_model_input['user_id']][key].values 132 | 133 | return train_model_input, train_label 134 | -------------------------------------------------------------------------------- /examples/run_sdm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from deepctr.feature_column import SparseFeat, VarLenSparseFeat 3 | from deepmatch.models import SDM 4 | from deepmatch.utils import sampledsoftmaxloss, NegativeSampler 5 | from preprocess import gen_data_set_sdm, gen_model_input_sdm 6 | from sklearn.preprocessing import LabelEncoder 7 | from tensorflow.python.keras import backend as K 8 | from tensorflow.python.keras.models import Model 9 | 10 | if __name__ == "__main__": 11 | data = pd.read_csvdata = pd.read_csv("./movielens_sample.txt") 12 | data['genres'] = list(map(lambda x: x.split('|')[0], data['genres'].values)) 13 | 14 | sparse_features = ["movie_id", "user_id", 15 | "gender", "age", "occupation", "zip", 
"genres"] 16 | SEQ_LEN_short = 5 17 | SEQ_LEN_prefer = 50 18 | 19 | # 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input` 20 | 21 | feature_max_idx = {} 22 | for feature in sparse_features: 23 | lbe = LabelEncoder() 24 | data[feature] = lbe.fit_transform(data[feature]) + 1 25 | feature_max_idx[feature] = data[feature].max() + 1 26 | 27 | user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id') 28 | 29 | item_profile = data[["movie_id"]].drop_duplicates('movie_id') 30 | 31 | user_profile.set_index("user_id", inplace=True) 32 | # 33 | # user_item_list = data.groupby("user_id")['movie_id'].apply(list) 34 | 35 | train_set, test_set = gen_data_set_sdm(data, seq_short_max_len=SEQ_LEN_short, seq_prefer_max_len=SEQ_LEN_prefer) 36 | 37 | train_model_input, train_label = gen_model_input_sdm(train_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer) 38 | test_model_input, test_label = gen_model_input_sdm(test_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer) 39 | 40 | # 2.count #unique features for each sparse field and generate feature config for sequence feature 41 | 42 | embedding_dim = 32 43 | # for sdm,we must provide `VarLenSparseFeat` with name "prefer_xxx" and "short_xxx" and their length 44 | user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16), 45 | SparseFeat("gender", feature_max_idx['gender'], 16), 46 | SparseFeat("age", feature_max_idx['age'], 16), 47 | SparseFeat("occupation", feature_max_idx['occupation'], 16), 48 | SparseFeat("zip", feature_max_idx['zip'], 16), 49 | VarLenSparseFeat(SparseFeat('short_movie_id', feature_max_idx['movie_id'], embedding_dim, 50 | embedding_name="movie_id"), SEQ_LEN_short, 'mean', 51 | 'short_sess_length'), 52 | VarLenSparseFeat(SparseFeat('prefer_movie_id', feature_max_idx['movie_id'], embedding_dim, 53 | embedding_name="movie_id"), SEQ_LEN_prefer, 'mean', 54 | 'prefer_sess_length'), 55 | VarLenSparseFeat(SparseFeat('short_genres', feature_max_idx['genres'], embedding_dim, 56 | embedding_name="genres"), SEQ_LEN_short, 'mean', 57 | 'short_sess_length'), 58 | VarLenSparseFeat(SparseFeat('prefer_genres', feature_max_idx['genres'], embedding_dim, 59 | embedding_name="genres"), SEQ_LEN_prefer, 'mean', 60 | 'prefer_sess_length'), 61 | ] 62 | 63 | item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)] 64 | 65 | from collections import Counter 66 | 67 | train_counter = Counter(train_model_input['movie_id']) 68 | item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)] 69 | sampler_config = NegativeSampler('frequency', num_sampled=5, item_name='movie_id', item_count=item_count) 70 | 71 | K.set_learning_phase(True) 72 | 73 | import tensorflow as tf 74 | 75 | if tf.__version__ >= '2.0.0': 76 | tf.compat.v1.disable_eager_execution() 77 | else: 78 | K.set_learning_phase(True) 79 | 80 | # units must be equal to item embedding dim! 
81 | model = SDM(user_feature_columns, item_feature_columns, history_feature_list=['movie_id', 'genres'], 82 | units=embedding_dim, sampler_config=sampler_config) 83 | 84 | model.compile(optimizer='adam', loss=sampledsoftmaxloss) 85 | 86 | history = model.fit(train_model_input, train_label, # train_label, 87 | batch_size=512, epochs=1, verbose=1, validation_split=0.0, ) 88 | 89 | K.set_learning_phase(False) 90 | # 3.Define Model,train,predict and evaluate 91 | test_user_model_input = test_model_input 92 | all_item_model_input = {"movie_id": item_profile['movie_id'].values, } 93 | 94 | user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding) 95 | item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding) 96 | 97 | user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12) 98 | # user_embs = user_embs[:, i, :] # i in [0,k_max) if MIND 99 | item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12) 100 | 101 | print(user_embs.shape) 102 | print(item_embs.shape) 103 | 104 | # #5. [Optional] ANN search by faiss and evaluate the result 105 | # 106 | # import heapq 107 | # from collections import defaultdict 108 | # from tqdm import tqdm 109 | # import numpy as np 110 | # import faiss 111 | # from deepmatch.utils import recall_N 112 | # 113 | # k_max = 1 114 | # topN = 50 115 | # test_true_label = {line[0]: [line[1]] for line in test_set} 116 | # 117 | # index = faiss.IndexFlatIP(embedding_dim) 118 | # # faiss.normalize_L2(item_embs) 119 | # index.add(item_embs) 120 | # # faiss.normalize_L2(user_embs) 121 | # 122 | # if len(user_embs.shape) == 2: # multi interests model's shape = 3 (MIND,ComiRec) 123 | # user_embs = np.expand_dims(user_embs, axis=1) 124 | # 125 | # score_dict = defaultdict(dict) 126 | # for k in range(k_max): 127 | # user_emb = user_embs[:, k, :] 128 | # D, I = index.search(np.ascontiguousarray(user_emb), topN) 129 | # for i, uid in tqdm(enumerate(test_user_model_input['user_id']), total=len(test_user_model_input['user_id'])): 130 | # if np.abs(user_emb[i]).max() < 1e-8: 131 | # continue 132 | # for score, itemid in zip(D[i], I[i]): 133 | # score_dict[uid][itemid] = max(score, score_dict[uid].get(itemid, float("-inf"))) 134 | # 135 | # s = [] 136 | # hit = 0 137 | # for i, uid in enumerate(test_user_model_input['user_id']): 138 | # pred = [item_profile['movie_id'].values[x[0]] for x in 139 | # heapq.nlargest(topN, score_dict[uid].items(), key=lambda x: x[1])] 140 | # filter_item = None 141 | # recall_score = recall_N(test_true_label[uid], pred, N=topN) 142 | # s.append(recall_score) 143 | # if test_true_label[uid] in pred: 144 | # hit += 1 145 | # 146 | # print("recall", np.mean(s)) 147 | # print("hr", hit / len(test_user_model_input['user_id'])) 148 | -------------------------------------------------------------------------------- /deepmatch/models/sdm.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | Author: 4 | Zhe Wang, 734914022@qq.com 5 | 6 | Reference: 7 | [1] Lv, Fuyu, Jin, Taiwei, Yu, Changlong etc. SDM: Sequential Deep Matching Model for Online Large-scale Recommender System[J]. 
8 | """ 9 | 10 | import tensorflow as tf 11 | from deepctr.feature_column import build_input_features, SparseFeat, DenseFeat, get_varlen_pooling_list, \ 12 | VarLenSparseFeat, \ 13 | create_embedding_matrix, embedding_lookup, varlen_embedding_lookup, concat_func 14 | from deepctr.layers.utils import NoMask 15 | from tensorflow.python.keras.layers import Dense, Lambda 16 | from tensorflow.python.keras.models import Model 17 | 18 | from ..layers.core import PoolingLayer, SampledSoftmaxLayer, EmbeddingIndex 19 | from ..layers.interaction import UserAttention, SelfMultiHeadAttention, AttentionSequencePoolingLayer 20 | from ..layers.sequence import DynamicMultiRNN 21 | from ..utils import get_item_embedding, l2_normalize 22 | 23 | 24 | def SDM(user_feature_columns, item_feature_columns, history_feature_list, units=64, rnn_layers=2, 25 | dropout_rate=0.2, 26 | rnn_num_res=1, 27 | num_head=4, l2_reg_embedding=1e-6, dnn_activation='tanh', temperature=0.05, sampler_config=None, seed=1024): 28 | """Instantiates the Sequential Deep Matching Model architecture. 29 | 30 | :param user_feature_columns: An iterable containing user's features used by the model. 31 | :param item_feature_columns: An iterable containing item's features used by the model. 32 | :param history_feature_list: list,to indicate short and prefer sequence sparse field 33 | :param units: int, dimension for each output layer 34 | :param rnn_layers: int, layer number of rnn 35 | :param dropout_rate: float in [0,1), the probability we will drop out a given DNN coordinate. 36 | :param rnn_num_res: int. The number of residual layers in rnn layers 37 | :param num_head: int int, the number of attention head 38 | :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector 39 | :param dnn_activation: Activation function to use in deep net 40 | :param temperature: float. Scaling factor. 41 | :param sampler_config: negative sample config. 42 | :param seed: integer ,to use as random seed. 43 | :return: A Keras model instance. 
44 | 45 | """ 46 | 47 | if len(item_feature_columns) > 1: 48 | raise ValueError("Now SDM only support 1 item feature like item_id") 49 | item_feature_column = item_feature_columns[0] 50 | item_feature_name = item_feature_column.name 51 | item_vocabulary_size = item_feature_columns[0].vocabulary_size 52 | 53 | features = build_input_features(user_feature_columns) 54 | 55 | user_inputs_list = list(features.values()) 56 | 57 | sparse_feature_columns = list( 58 | filter(lambda x: isinstance(x, SparseFeat), user_feature_columns)) if user_feature_columns else [] 59 | dense_feature_columns = list( 60 | filter(lambda x: isinstance(x, DenseFeat), user_feature_columns)) if user_feature_columns else [] 61 | if len(dense_feature_columns) != 0: 62 | raise ValueError("Now SDM don't support dense feature") 63 | varlen_sparse_feature_columns = list( 64 | filter(lambda x: isinstance(x, VarLenSparseFeat), user_feature_columns)) if user_feature_columns else [] 65 | 66 | sparse_varlen_feature_columns = [] 67 | prefer_history_columns = [] 68 | short_history_columns = [] 69 | 70 | prefer_fc_names = list(map(lambda x: "prefer_" + x, history_feature_list)) 71 | short_fc_names = list(map(lambda x: "short_" + x, history_feature_list)) 72 | for fc in varlen_sparse_feature_columns: 73 | feature_name = fc.name 74 | if feature_name in prefer_fc_names: 75 | prefer_history_columns.append(fc) 76 | 77 | elif feature_name in short_fc_names: 78 | short_history_columns.append(fc) 79 | else: 80 | sparse_varlen_feature_columns.append(fc) 81 | 82 | embedding_matrix_dict = create_embedding_matrix(user_feature_columns + item_feature_columns, l2_reg_embedding, 83 | seed=seed) 84 | 85 | item_features = build_input_features(item_feature_columns) 86 | item_inputs_list = list(item_features.values()) 87 | 88 | prefer_emb_list = embedding_lookup(embedding_matrix_dict, features, prefer_history_columns, prefer_fc_names, 89 | prefer_fc_names, to_list=True) # L^u 90 | short_emb_list = embedding_lookup(embedding_matrix_dict, features, short_history_columns, short_fc_names, 91 | short_fc_names, to_list=True) # S^u 92 | # dense_value_list = get_dense_input(features, dense_feature_columns) 93 | user_emb_list = embedding_lookup(embedding_matrix_dict, features, sparse_feature_columns, to_list=True) 94 | 95 | sequence_embed_dict = varlen_embedding_lookup(embedding_matrix_dict, features, sparse_varlen_feature_columns) 96 | sequence_embed_list = get_varlen_pooling_list(sequence_embed_dict, features, sparse_varlen_feature_columns, 97 | to_list=True) 98 | user_emb_list += sequence_embed_list # e^u 99 | # if len(user_emb_list) > 0 or len(dense_value_list) > 0: 100 | # user_emb_feature = combined_dnn_input(user_emb_list, dense_value_list) 101 | user_emb = concat_func(user_emb_list) 102 | user_emb_output = Dense(units, activation=dnn_activation, name="user_emb_output")(user_emb) 103 | 104 | prefer_sess_length = features['prefer_sess_length'] 105 | prefer_att_outputs = [] 106 | for prefer_emb in prefer_emb_list: 107 | prefer_attention_output = AttentionSequencePoolingLayer(dropout_rate=0)( 108 | [user_emb_output, prefer_emb, prefer_sess_length]) 109 | prefer_att_outputs.append(prefer_attention_output) 110 | prefer_att_concat = concat_func(prefer_att_outputs) 111 | prefer_output = Dense(units, activation=dnn_activation, name="prefer_output")(prefer_att_concat) 112 | 113 | short_sess_length = features['short_sess_length'] 114 | short_emb_concat = concat_func(short_emb_list) 115 | short_emb_input = Dense(units, activation=dnn_activation, 
name="short_emb_input")(short_emb_concat) 116 | 117 | short_rnn_output = DynamicMultiRNN(num_units=units, return_sequence=True, num_layers=rnn_layers, 118 | num_residual_layers=rnn_num_res, 119 | dropout_rate=dropout_rate)([short_emb_input, short_sess_length]) 120 | 121 | short_att_output = SelfMultiHeadAttention(num_units=units, head_num=num_head, dropout_rate=dropout_rate, 122 | future_binding=True, 123 | use_layer_norm=True)( 124 | [short_rnn_output, short_sess_length]) # [batch_size, time, num_units] 125 | 126 | short_output = UserAttention(num_units=units, activation=dnn_activation, use_res=True, dropout_rate=dropout_rate) \ 127 | ([user_emb_output, short_att_output, short_sess_length]) 128 | 129 | gate_input = concat_func([prefer_output, short_output, user_emb_output]) 130 | gate = Dense(units, activation='sigmoid')(gate_input) 131 | 132 | gate_output = Lambda(lambda x: tf.multiply(x[0], x[1]) + tf.multiply(1 - x[0], x[2]))( 133 | [gate, short_output, prefer_output]) 134 | gate_output_reshape = Lambda(lambda x: tf.squeeze(x, 1))(gate_output) 135 | gate_output_reshape = l2_normalize(gate_output_reshape) 136 | 137 | item_index = EmbeddingIndex(list(range(item_vocabulary_size)))(item_features[item_feature_name]) 138 | item_embedding_matrix = embedding_matrix_dict[item_feature_name] 139 | item_embedding_weight = NoMask()(item_embedding_matrix(item_index)) 140 | 141 | pooling_item_embedding_weight = PoolingLayer()([item_embedding_weight]) 142 | pooling_item_embedding_weight = l2_normalize(pooling_item_embedding_weight) 143 | output = SampledSoftmaxLayer(sampler_config._asdict(), temperature)([ 144 | pooling_item_embedding_weight, gate_output_reshape, item_features[item_feature_name]]) 145 | model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output) 146 | 147 | # model.user_input = user_inputs_list 148 | # model.user_embedding = gate_output_reshape 149 | 150 | model.__setattr__("user_input", user_inputs_list) 151 | model.__setattr__("user_embedding", gate_output_reshape) 152 | 153 | # model.item_input = item_inputs_list 154 | # model.item_embedding = get_item_embedding(pooling_item_embedding_weight, item_features[item_feature_name]) 155 | 156 | model.__setattr__("item_input", item_inputs_list) 157 | model.__setattr__("item_embedding", 158 | get_item_embedding(pooling_item_embedding_weight, item_features[item_feature_name])) 159 | 160 | return model 161 | -------------------------------------------------------------------------------- /deepmatch/models/mind.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: 3 | Qingliang Cai, leocaicoder@163.com 4 | Weichen Shen, weichenswc@163.com 5 | 6 | Reference: 7 | Li C, Liu Z, Wu M, et al. Multi-interest network with dynamic routing for recommendation at Tmall[C]//Proceedings of the 28th ACM International Conference on Information and Knowledge Management. 2019: 2615-2623. 
8 | """ 9 | 10 | import tensorflow as tf 11 | from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, \ 12 | embedding_lookup, varlen_embedding_lookup, get_varlen_pooling_list, get_dense_input, build_input_features 13 | from deepctr.layers import DNN 14 | from deepctr.layers.utils import NoMask, combined_dnn_input 15 | from tensorflow.python.keras.layers import Concatenate, Lambda 16 | from tensorflow.python.keras.models import Model 17 | 18 | from ..inputs import create_embedding_matrix 19 | from ..layers.core import CapsuleLayer, PoolingLayer, MaskUserEmbedding, LabelAwareAttention, SampledSoftmaxLayer, \ 20 | EmbeddingIndex 21 | from ..utils import get_item_embedding 22 | 23 | 24 | def shape_target(target_emb_tmp, target_emb_size): 25 | return tf.expand_dims(tf.reshape(target_emb_tmp, [-1, target_emb_size]), axis=-1) 26 | 27 | 28 | def tile_user_otherfeat(user_other_feature, k_max): 29 | return tf.tile(tf.expand_dims(user_other_feature, -2), [1, k_max, 1]) 30 | 31 | 32 | def adaptive_interest_num(seq_len, k_max): 33 | try: 34 | log_len = tf.log1p(tf.cast(seq_len, dtype="float32")) 35 | log_2 = tf.log(2.) 36 | except AttributeError: 37 | log_len = tf.math.log1p(tf.cast(seq_len, dtype="float32")) 38 | log_2 = tf.math.log(2.) 39 | k_user = tf.cast(tf.maximum( 40 | 1., 41 | tf.minimum( 42 | tf.cast(k_max, dtype="float32"), # k_max 43 | log_len / log_2 # hist_len 44 | ) 45 | ), dtype="int32") 46 | return k_user 47 | 48 | 49 | def MIND(user_feature_columns, item_feature_columns, k_max=2, p=100, dynamic_k=False, 50 | user_dnn_hidden_units=(64, 32), dnn_activation='relu', dnn_use_bn=False, l2_reg_dnn=0, l2_reg_embedding=1e-6, 51 | dnn_dropout=0, output_activation='linear', sampler_config=None, seed=1024): 52 | """Instantiates the MIND Model architecture. 53 | 54 | :param user_feature_columns: An iterable containing user's features used by the model. 55 | :param item_feature_columns: An iterable containing item's features used by the model. 56 | :param k_max: int, the max size of user interest embedding 57 | :param p: float,the parameter for adjusting the attention distribution in LabelAwareAttention. 58 | :param dynamic_k: bool, whether or not use dynamic interest number 59 | :param user_dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of user tower 60 | :param dnn_activation: Activation function to use in deep net 61 | :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in deep net 62 | :param l2_reg_dnn: L2 regularizer strength applied to DNN 63 | :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector 64 | :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate. 65 | :param output_activation: Activation function to use in output layer 66 | :param sampler_config: negative sample config. 67 | :param seed: integer ,to use as random seed. 68 | :return: A Keras model instance. 
69 | 70 | """ 71 | 72 | if len(item_feature_columns) > 1: 73 | raise ValueError("Now MIND only support 1 item feature like item_id") 74 | item_feature_column = item_feature_columns[0] 75 | item_feature_name = item_feature_column.name 76 | item_vocabulary_size = item_feature_columns[0].vocabulary_size 77 | item_embedding_dim = item_feature_columns[0].embedding_dim 78 | # item_index = Input(tensor=tf.constant([list(range(item_vocabulary_size))])) 79 | 80 | history_feature_list = [item_feature_name] 81 | 82 | features = build_input_features(user_feature_columns) 83 | sparse_feature_columns = list( 84 | filter(lambda x: isinstance(x, SparseFeat), user_feature_columns)) if user_feature_columns else [] 85 | dense_feature_columns = list( 86 | filter(lambda x: isinstance(x, DenseFeat), user_feature_columns)) if user_feature_columns else [] 87 | varlen_sparse_feature_columns = list( 88 | filter(lambda x: isinstance(x, VarLenSparseFeat), user_feature_columns)) if user_feature_columns else [] 89 | history_feature_columns = [] 90 | sparse_varlen_feature_columns = [] 91 | history_fc_names = list(map(lambda x: "hist_" + x, history_feature_list)) 92 | for fc in varlen_sparse_feature_columns: 93 | feature_name = fc.name 94 | if feature_name in history_fc_names: 95 | history_feature_columns.append(fc) 96 | else: 97 | sparse_varlen_feature_columns.append(fc) 98 | seq_max_len = history_feature_columns[0].maxlen 99 | inputs_list = list(features.values()) 100 | 101 | embedding_matrix_dict = create_embedding_matrix(user_feature_columns + item_feature_columns, l2_reg_embedding, 102 | seed=seed, prefix="") 103 | 104 | item_features = build_input_features(item_feature_columns) 105 | 106 | query_emb_list = embedding_lookup(embedding_matrix_dict, item_features, item_feature_columns, 107 | history_feature_list, 108 | history_feature_list, to_list=True) 109 | keys_emb_list = embedding_lookup(embedding_matrix_dict, features, history_feature_columns, history_fc_names, 110 | history_fc_names, to_list=True) 111 | dnn_input_emb_list = embedding_lookup(embedding_matrix_dict, features, sparse_feature_columns, 112 | mask_feat_list=history_feature_list, to_list=True) 113 | dense_value_list = get_dense_input(features, dense_feature_columns) 114 | 115 | sequence_embed_dict = varlen_embedding_lookup(embedding_matrix_dict, features, sparse_varlen_feature_columns) 116 | sequence_embed_list = get_varlen_pooling_list(sequence_embed_dict, features, sparse_varlen_feature_columns, 117 | to_list=True) 118 | 119 | dnn_input_emb_list += sequence_embed_list 120 | 121 | # keys_emb = concat_func(keys_emb_list, mask=True) 122 | # query_emb = concat_func(query_emb_list, mask=True) 123 | 124 | history_emb = PoolingLayer()(NoMask()(keys_emb_list)) 125 | target_emb = PoolingLayer()(NoMask()(query_emb_list)) 126 | 127 | # target_emb_size = target_emb.get_shape()[-1].value 128 | # max_len = history_emb.get_shape()[1].value 129 | hist_len = features['hist_len'] 130 | 131 | if dynamic_k: 132 | interest_num = Lambda(adaptive_interest_num, arguments={'k_max': k_max})(hist_len) 133 | high_capsule = CapsuleLayer(input_units=item_embedding_dim, 134 | out_units=item_embedding_dim, max_len=seq_max_len, 135 | k_max=k_max)((history_emb, hist_len, interest_num)) 136 | else: 137 | high_capsule = CapsuleLayer(input_units=item_embedding_dim, 138 | out_units=item_embedding_dim, max_len=seq_max_len, 139 | k_max=k_max)((history_emb, hist_len)) 140 | 141 | if len(dnn_input_emb_list) > 0 or len(dense_value_list) > 0: 142 | user_other_feature = 
combined_dnn_input(dnn_input_emb_list, dense_value_list) 143 | 144 | other_feature_tile = Lambda(tile_user_otherfeat, arguments={'k_max': k_max})(user_other_feature) 145 | 146 | user_deep_input = Concatenate()([NoMask()(other_feature_tile), high_capsule]) 147 | else: 148 | user_deep_input = high_capsule 149 | 150 | user_embeddings = DNN(user_dnn_hidden_units, dnn_activation, l2_reg_dnn, 151 | dnn_dropout, dnn_use_bn, output_activation=output_activation, seed=seed, 152 | name="user_dnn")( 153 | user_deep_input) 154 | 155 | item_inputs_list = list(item_features.values()) 156 | 157 | item_embedding_matrix = embedding_matrix_dict[item_feature_name] 158 | 159 | item_index = EmbeddingIndex(list(range(item_vocabulary_size)))(item_features[item_feature_name]) 160 | 161 | item_embedding_weight = NoMask()(item_embedding_matrix(item_index)) 162 | 163 | pooling_item_embedding_weight = PoolingLayer()([item_embedding_weight]) 164 | 165 | if dynamic_k: 166 | user_embeddings = MaskUserEmbedding(k_max)([user_embeddings, interest_num]) 167 | user_embedding_final = LabelAwareAttention(k_max=k_max, pow_p=p)((user_embeddings, target_emb, interest_num)) 168 | else: 169 | user_embedding_final = LabelAwareAttention(k_max=k_max, pow_p=p)((user_embeddings, target_emb)) 170 | 171 | output = SampledSoftmaxLayer(sampler_config._asdict())( 172 | [pooling_item_embedding_weight, user_embedding_final, item_features[item_feature_name]]) 173 | model = Model(inputs=inputs_list + item_inputs_list, outputs=output) 174 | 175 | model.__setattr__("user_input", inputs_list) 176 | model.__setattr__("user_embedding", user_embeddings) 177 | 178 | model.__setattr__("item_input", item_inputs_list) 179 | model.__setattr__("item_embedding", 180 | get_item_embedding(pooling_item_embedding_weight, item_features[item_feature_name])) 181 | 182 | return model 183 | -------------------------------------------------------------------------------- /deepmatch/models/comirec.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: 3 | Li Yuan, lysysu@qq.com 4 | 5 | Reference: 6 | Yukuo Cen, Jianwei Zhang, Xu Zou, et al. 
Controllable Multi-Interest Framework for Recommendation//Accepted to KDD 2020 7 | """ 8 | 9 | import tensorflow as tf 10 | from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, \ 11 | embedding_lookup, varlen_embedding_lookup, get_varlen_pooling_list, get_dense_input, build_input_features 12 | from deepctr.layers import DNN, PositionEncoding 13 | from deepctr.layers.utils import NoMask, combined_dnn_input, add_func 14 | from tensorflow.python.keras.layers import Concatenate, Lambda 15 | from tensorflow.python.keras.models import Model 16 | 17 | from ..inputs import create_embedding_matrix 18 | from ..layers.core import CapsuleLayer, PoolingLayer, MaskUserEmbedding, LabelAwareAttention, SampledSoftmaxLayer, \ 19 | EmbeddingIndex 20 | from ..layers.interaction import SoftmaxWeightedSum 21 | from ..utils import get_item_embedding 22 | 23 | 24 | def tile_user_otherfeat(user_other_feature, k_max): 25 | return tf.tile(tf.expand_dims(user_other_feature, -2), [1, k_max, 1]) 26 | 27 | 28 | def tile_user_his_mask(hist_len, seq_max_len, k_max): 29 | return tf.tile(tf.sequence_mask(hist_len, seq_max_len), [1, k_max, 1]) 30 | 31 | 32 | def softmax_Weighted_Sum(input): 33 | history_emb_add_pos, mask, attn = input[0], input[1], input[2] 34 | attn = tf.transpose(attn, [0, 2, 1]) 35 | pad = tf.ones_like(mask, dtype=tf.float32) * (-2 ** 32 + 1) 36 | attn = tf.where(mask, attn, pad) # [batch_size, seq_len, num_interests] 37 | attn = tf.nn.softmax(attn) # [batch_size, seq_len, num_interests] 38 | high_capsule = tf.matmul(attn, history_emb_add_pos) 39 | return high_capsule 40 | 41 | 42 | def ComiRec(user_feature_columns, item_feature_columns, k_max=2, p=100, interest_extractor='sa', 43 | add_pos=True, 44 | user_dnn_hidden_units=(64, 32), dnn_activation='relu', dnn_use_bn=False, l2_reg_dnn=0, 45 | l2_reg_embedding=1e-6, 46 | dnn_dropout=0, output_activation='linear', sampler_config=None, seed=1024): 47 | """Instantiates the ComiRec Model architecture. 48 | 49 | :param user_feature_columns: An iterable containing user's features used by the model. 50 | :param item_feature_columns: An iterable containing item's features used by the model. 51 | :param k_max: int, the max size of user interest embedding 52 | :param p: float,the parameter for adjusting the attention distribution in LabelAwareAttention. 53 | :param interest_extractor: string, type of a multi-interest extraction module, 'sa' means self-attentive and 'dr' means dynamic routing 54 | :param add_pos: bool. Whether use positional encoding layer 55 | :param user_dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of user tower 56 | :param dnn_activation: Activation function to use in deep net 57 | :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in deep net 58 | :param l2_reg_dnn: L2 regularizer strength applied to DNN 59 | :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector 60 | :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate. 61 | :param output_activation: Activation function to use in output layer 62 | :param sampler_config: negative sample config. 63 | :param seed: integer ,to use as random seed. 64 | :return: A Keras model instance. 
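    A minimal usage sketch (illustrative values; feature columns, sampler_config and train
    data are assumed to be prepared as in the examples; if the last entry of
    user_dnn_hidden_units differs from the item embedding dimension it is appended
    automatically):

        model = ComiRec(user_feature_columns, item_feature_columns,
                        k_max=2, p=100, interest_extractor='sa', add_pos=True,
                        sampler_config=sampler_config)
        model.compile(optimizer='adam', loss=sampledsoftmaxloss)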
65 | 66 | """ 67 | 68 | if len(item_feature_columns) > 1: 69 | raise ValueError("Now ComiRec only support 1 item feature like item_id") 70 | if interest_extractor.lower() not in ['dr', 'sa']: 71 | raise ValueError("Now ComiRec only support dr and sa two interest_extractor") 72 | item_feature_column = item_feature_columns[0] 73 | item_feature_name = item_feature_column.name 74 | item_vocabulary_size = item_feature_columns[0].vocabulary_size 75 | item_embedding_dim = item_feature_columns[0].embedding_dim 76 | if user_dnn_hidden_units[-1] != item_embedding_dim: 77 | user_dnn_hidden_units = tuple(list(user_dnn_hidden_units) + [item_embedding_dim]) 78 | # item_index = Input(tensor=tf.constant([list(range(item_vocabulary_size))])) 79 | 80 | history_feature_list = [item_feature_name] 81 | 82 | features = build_input_features(user_feature_columns) 83 | sparse_feature_columns = list( 84 | filter(lambda x: isinstance(x, SparseFeat), user_feature_columns)) if user_feature_columns else [] 85 | dense_feature_columns = list( 86 | filter(lambda x: isinstance(x, DenseFeat), user_feature_columns)) if user_feature_columns else [] 87 | varlen_sparse_feature_columns = list( 88 | filter(lambda x: isinstance(x, VarLenSparseFeat), user_feature_columns)) if user_feature_columns else [] 89 | history_feature_columns = [] 90 | sparse_varlen_feature_columns = [] 91 | history_fc_names = list(map(lambda x: "hist_" + x, history_feature_list)) 92 | for fc in varlen_sparse_feature_columns: 93 | feature_name = fc.name 94 | if feature_name in history_fc_names: 95 | history_feature_columns.append(fc) 96 | else: 97 | sparse_varlen_feature_columns.append(fc) 98 | seq_max_len = history_feature_columns[0].maxlen 99 | inputs_list = list(features.values()) 100 | 101 | embedding_matrix_dict = create_embedding_matrix(user_feature_columns + item_feature_columns, l2_reg_embedding, 102 | seed=seed, prefix="") 103 | 104 | item_features = build_input_features(item_feature_columns) 105 | 106 | query_emb_list = embedding_lookup(embedding_matrix_dict, item_features, item_feature_columns, 107 | history_feature_list, 108 | history_feature_list, to_list=True) 109 | keys_emb_list = embedding_lookup(embedding_matrix_dict, features, history_feature_columns, history_fc_names, 110 | history_fc_names, to_list=True) 111 | dnn_input_emb_list = embedding_lookup(embedding_matrix_dict, features, sparse_feature_columns, 112 | mask_feat_list=history_feature_list, to_list=True) 113 | dense_value_list = get_dense_input(features, dense_feature_columns) 114 | 115 | sequence_embed_dict = varlen_embedding_lookup(embedding_matrix_dict, features, sparse_varlen_feature_columns) 116 | sequence_embed_list = get_varlen_pooling_list(sequence_embed_dict, features, sparse_varlen_feature_columns, 117 | to_list=True) 118 | 119 | dnn_input_emb_list += sequence_embed_list 120 | 121 | # keys_emb = concat_func(keys_emb_list, mask=True) 122 | # query_emb = concat_func(query_emb_list, mask=True) 123 | 124 | history_emb = PoolingLayer()(NoMask()(keys_emb_list)) # [None, max_len, emb_dim] 125 | target_emb = PoolingLayer()(NoMask()(query_emb_list)) 126 | 127 | # target_emb_size = target_emb.get_shape()[-1].value 128 | # max_len = history_emb.get_shape()[1].value 129 | hist_len = features['hist_len'] 130 | 131 | high_capsule = None 132 | if interest_extractor.lower() == 'dr': 133 | high_capsule = CapsuleLayer(input_units=item_embedding_dim, 134 | out_units=item_embedding_dim, max_len=seq_max_len, 135 | k_max=k_max)((history_emb, hist_len)) 136 | elif interest_extractor.lower() == 
'sa': 137 | history_emb_add_pos = history_emb 138 | if add_pos: 139 | position_embedding = PositionEncoding()(history_emb) 140 | history_emb_add_pos = add_func([history_emb_add_pos, position_embedding]) # [None, max_len, emb_dim] 141 | 142 | attn = DNN((item_embedding_dim * 4, k_max), activation='tanh', l2_reg=l2_reg_dnn, 143 | dropout_rate=dnn_dropout, use_bn=dnn_use_bn, output_activation=None, seed=seed, 144 | name="user_dnn_attn")(history_emb_add_pos) 145 | mask = Lambda(tile_user_his_mask, arguments={'k_max': k_max, 146 | 'seq_max_len': seq_max_len})( 147 | hist_len) # [None, k_max, max_len] 148 | 149 | high_capsule = Lambda(softmax_Weighted_Sum)((history_emb_add_pos, mask, attn)) 150 | 151 | if len(dnn_input_emb_list) > 0 or len(dense_value_list) > 0: 152 | user_other_feature = combined_dnn_input(dnn_input_emb_list, dense_value_list) 153 | other_feature_tile = Lambda(tile_user_otherfeat, arguments={'k_max': k_max})(user_other_feature) 154 | user_deep_input = Concatenate()([NoMask()(other_feature_tile), high_capsule]) 155 | else: 156 | user_deep_input = high_capsule 157 | 158 | user_embeddings = DNN(user_dnn_hidden_units, dnn_activation, l2_reg_dnn, 159 | dnn_dropout, dnn_use_bn, output_activation=output_activation, seed=seed, 160 | name="user_dnn")( 161 | user_deep_input) 162 | 163 | item_inputs_list = list(item_features.values()) 164 | 165 | item_embedding_matrix = embedding_matrix_dict[item_feature_name] 166 | 167 | item_index = EmbeddingIndex(list(range(item_vocabulary_size)))(item_features[item_feature_name]) 168 | 169 | item_embedding_weight = NoMask()(item_embedding_matrix(item_index)) 170 | 171 | pooling_item_embedding_weight = PoolingLayer()([item_embedding_weight]) 172 | 173 | user_embedding_final = LabelAwareAttention(k_max=k_max, pow_p=p)((user_embeddings, target_emb)) 174 | 175 | output = SampledSoftmaxLayer(sampler_config._asdict())( 176 | [pooling_item_embedding_weight, user_embedding_final, item_features[item_feature_name]]) 177 | model = Model(inputs=inputs_list + item_inputs_list, outputs=output) 178 | 179 | model.__setattr__("user_input", inputs_list) 180 | model.__setattr__("user_embedding", user_embeddings) 181 | 182 | model.__setattr__("item_input", item_inputs_list) 183 | model.__setattr__("item_embedding", 184 | get_item_embedding(pooling_item_embedding_weight, item_features[item_feature_name])) 185 | 186 | return model 187 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /deepmatch/layers/core.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Author: 4 | Weichen Shen,weichenswc@163.com 5 | 6 | """ 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | from deepctr.layers.utils import reduce_max, reduce_mean, reduce_sum, concat_func, div, softmax 11 | from tensorflow.python.keras.initializers import Zeros 12 | from tensorflow.python.keras.layers import Layer 13 | 14 | 15 | class PoolingLayer(Layer): 16 | 17 | def __init__(self, mode='mean', supports_masking=False, **kwargs): 18 | 19 | if mode not in ['sum', 'mean', 'max']: 20 | raise ValueError("mode must be sum or mean") 21 | self.mode = mode 22 | self.eps = tf.constant(1e-8, tf.float32) 23 | super(PoolingLayer, self).__init__(**kwargs) 24 | 25 | self.supports_masking = supports_masking 26 | 27 | def build(self, input_shape): 28 | 29 | super(PoolingLayer, self).build( 30 | input_shape) # Be sure to call this somewhere! 31 | 32 | def call(self, seq_value_len_list, mask=None, **kwargs): 33 | if not isinstance(seq_value_len_list, list): 34 | seq_value_len_list = [seq_value_len_list] 35 | if len(seq_value_len_list) == 1: 36 | return seq_value_len_list[0] 37 | expand_seq_value_len_list = list(map(lambda x: tf.expand_dims(x, axis=-1), seq_value_len_list)) 38 | a = concat_func(expand_seq_value_len_list) 39 | if self.mode == "mean": 40 | hist = reduce_mean(a, axis=-1, ) 41 | if self.mode == "sum": 42 | hist = reduce_sum(a, axis=-1, ) 43 | if self.mode == "max": 44 | hist = reduce_max(a, axis=-1, ) 45 | return hist 46 | 47 | def get_config(self, ): 48 | config = {'mode': self.mode, 'supports_masking': self.supports_masking} 49 | base_config = super(PoolingLayer, self).get_config() 50 | return dict(list(base_config.items()) + list(config.items())) 51 | 52 | 53 | class SampledSoftmaxLayer(Layer): 54 | def __init__(self, sampler_config, temperature=1.0, **kwargs): 55 | self.sampler_config = sampler_config 56 | self.temperature = temperature 57 | self.sampler = self.sampler_config['sampler'] 58 | self.item_count = self.sampler_config['item_count'] 59 | 60 | super(SampledSoftmaxLayer, self).__init__(**kwargs) 61 | 62 | def build(self, input_shape): 63 | self.vocabulary_size = input_shape[0][0] 64 | self.zero_bias = self.add_weight(shape=[self.vocabulary_size], 65 | initializer=Zeros, 66 | dtype=tf.float32, 67 | trainable=False, 68 | name="bias") 69 | super(SampledSoftmaxLayer, self).build(input_shape) 70 | 71 | def call(self, inputs_with_item_idx, training=None, **kwargs): 72 | item_embeddings, user_vec, item_idx = inputs_with_item_idx 73 | if item_idx.dtype != tf.int64: 74 | item_idx = tf.cast(item_idx, tf.int64) 75 | user_vec /= self.temperature 76 | if self.sampler == "inbatch": 77 | item_vec = tf.gather(item_embeddings, tf.squeeze(item_idx, axis=1)) 78 | logits = tf.matmul(user_vec, item_vec, transpose_b=True) 79 | loss = inbatch_softmax_cross_entropy_with_logits(logits, self.item_count, item_idx) 80 | 81 | else: 82 | num_sampled = self.sampler_config['num_sampled'] 83 | if self.sampler == "frequency": 84 | sampled_values = tf.nn.fixed_unigram_candidate_sampler(item_idx, 1, num_sampled, True, 85 | self.vocabulary_size, 86 | distortion=self.sampler_config['distortion'], 87 | unigrams=np.maximum(self.item_count, 1).tolist(), 88 | seed=None, 89 | name=None) 90 | elif self.sampler == "adaptive": 91 | sampled_values = tf.nn.learned_unigram_candidate_sampler(item_idx, 1, 
num_sampled, True, 92 | self.vocabulary_size, seed=None, name=None) 93 | elif self.sampler == "uniform": 94 | try: 95 | sampled_values = tf.nn.uniform_candidate_sampler(item_idx, 1, num_sampled, True, 96 | self.vocabulary_size, seed=None, name=None) 97 | except AttributeError: 98 | sampled_values = tf.random.uniform_candidate_sampler(item_idx, 1, num_sampled, True, 99 | self.vocabulary_size, seed=None, name=None) 100 | else: 101 | raise ValueError(' `%s` sampler is not supported ' % self.sampler) 102 | 103 | loss = tf.nn.sampled_softmax_loss(weights=item_embeddings, 104 | biases=self.zero_bias, 105 | labels=item_idx, 106 | inputs=user_vec, 107 | num_sampled=num_sampled, 108 | num_classes=self.vocabulary_size, 109 | sampled_values=sampled_values 110 | ) 111 | return tf.expand_dims(loss, axis=1) 112 | 113 | def compute_output_shape(self, input_shape): 114 | return (None, 1) 115 | 116 | def get_config(self, ): 117 | config = {'sampler_config': self.sampler_config, 'temperature': self.temperature} 118 | base_config = super(SampledSoftmaxLayer, self).get_config() 119 | return dict(list(base_config.items()) + list(config.items())) 120 | 121 | 122 | class InBatchSoftmaxLayer(Layer): 123 | def __init__(self, sampler_config, temperature=1.0, **kwargs): 124 | self.sampler_config = sampler_config 125 | self.temperature = temperature 126 | self.item_count = self.sampler_config['item_count'] 127 | 128 | super(InBatchSoftmaxLayer, self).__init__(**kwargs) 129 | 130 | def build(self, input_shape): 131 | super(InBatchSoftmaxLayer, self).build(input_shape) 132 | 133 | def call(self, inputs_with_item_idx, training=None, **kwargs): 134 | user_vec, item_vec, item_idx = inputs_with_item_idx 135 | if item_idx.dtype != tf.int64: 136 | item_idx = tf.cast(item_idx, tf.int64) 137 | user_vec /= self.temperature 138 | logits = tf.matmul(user_vec, item_vec, transpose_b=True) 139 | loss = inbatch_softmax_cross_entropy_with_logits(logits, self.item_count, item_idx) 140 | return tf.expand_dims(loss, axis=1) 141 | 142 | def compute_output_shape(self, input_shape): 143 | return (None, 1) 144 | 145 | def get_config(self, ): 146 | config = {'sampler_config': self.sampler_config, 'temperature': self.temperature} 147 | base_config = super(InBatchSoftmaxLayer, self).get_config() 148 | return dict(list(base_config.items()) + list(config.items())) 149 | 150 | 151 | class LabelAwareAttention(Layer): 152 | def __init__(self, k_max, pow_p=1, **kwargs): 153 | self.k_max = k_max 154 | self.pow_p = pow_p 155 | super(LabelAwareAttention, self).__init__(**kwargs) 156 | 157 | def build(self, input_shape): 158 | # Be sure to call this somewhere! 
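        # Label-aware attention scores each of the k_max interest capsules (keys)
        # against the target item embedding (query); pow_p sharpens the softmax
        # over interests, and pow_p >= 100 degenerates to picking the single
        # best-matching interest (see call() below).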
159 | 160 | self.embedding_size = input_shape[0][-1] 161 | super(LabelAwareAttention, self).build(input_shape) 162 | 163 | def call(self, inputs, training=None, **kwargs): 164 | keys = inputs[0] 165 | query = inputs[1] 166 | weight = reduce_sum(keys * query, axis=-1, keep_dims=True) 167 | weight = tf.pow(weight, self.pow_p) # [x,k_max,1] 168 | 169 | if len(inputs) == 3: 170 | k_user = inputs[2] 171 | seq_mask = tf.transpose(tf.sequence_mask(k_user, self.k_max), [0, 2, 1]) 172 | padding = tf.ones_like(seq_mask, dtype=tf.float32) * (-2 ** 32 + 1) # [x,k_max,1] 173 | weight = tf.where(seq_mask, weight, padding) 174 | 175 | if self.pow_p >= 100: 176 | idx = tf.stack( 177 | [tf.range(tf.shape(keys)[0]), tf.squeeze(tf.argmax(weight, axis=1, output_type=tf.int32), axis=1)], 178 | axis=1) 179 | output = tf.gather_nd(keys, idx) 180 | else: 181 | weight = softmax(weight, dim=1, name="weight") 182 | output = tf.reduce_sum(keys * weight, axis=1) 183 | 184 | return output 185 | 186 | def compute_output_shape(self, input_shape): 187 | return (None, self.embedding_size) 188 | 189 | def get_config(self, ): 190 | config = {'k_max': self.k_max, 'pow_p': self.pow_p} 191 | base_config = super(LabelAwareAttention, self).get_config() 192 | return dict(list(base_config.items()) + list(config.items())) 193 | 194 | 195 | class CapsuleLayer(Layer): 196 | def __init__(self, input_units, out_units, max_len, k_max, iteration_times=3, 197 | init_std=1.0, **kwargs): 198 | self.input_units = input_units 199 | self.out_units = out_units 200 | self.max_len = max_len 201 | self.k_max = k_max 202 | self.iteration_times = iteration_times 203 | self.init_std = init_std 204 | super(CapsuleLayer, self).__init__(**kwargs) 205 | 206 | def build(self, input_shape): 207 | self.bilinear_mapping_matrix = self.add_weight(shape=[self.input_units, self.out_units], 208 | name="S", dtype=tf.float32) 209 | super(CapsuleLayer, self).build(input_shape) 210 | 211 | def call(self, inputs, **kwargs): 212 | 213 | behavior_embedding = inputs[0] 214 | seq_len = inputs[1] 215 | batch_size = tf.shape(behavior_embedding)[0] 216 | 217 | mask = tf.reshape(tf.sequence_mask(seq_len, self.max_len, tf.float32), [-1, self.max_len, 1, 1]) 218 | 219 | behavior_embedding_mapping = tf.tensordot(behavior_embedding, self.bilinear_mapping_matrix, axes=1) 220 | behavior_embedding_mapping = tf.expand_dims(behavior_embedding_mapping, axis=2) 221 | 222 | behavior_embdding_mapping_ = tf.stop_gradient(behavior_embedding_mapping) # N,max_len,1,E 223 | try: 224 | routing_logits = tf.truncated_normal([batch_size, self.max_len, self.k_max, 1], stddev=self.init_std) 225 | except AttributeError: 226 | routing_logits = tf.compat.v1.truncated_normal([batch_size, self.max_len, self.k_max, 1], 227 | stddev=self.init_std) 228 | routing_logits = tf.stop_gradient(routing_logits) 229 | 230 | k_user = None 231 | if len(inputs) == 3: 232 | k_user = inputs[2] 233 | interest_mask = tf.sequence_mask(k_user, self.k_max, tf.float32) 234 | interest_mask = tf.reshape(interest_mask, [batch_size, 1, self.k_max, 1]) 235 | interest_mask = tf.tile(interest_mask, [1, self.max_len, 1, 1]) 236 | 237 | interest_padding = tf.ones_like(interest_mask) * -2 ** 31 238 | interest_mask = tf.cast(interest_mask, tf.bool) 239 | 240 | for i in range(self.iteration_times): 241 | if k_user is not None: 242 | routing_logits = tf.where(interest_mask, routing_logits, interest_padding) 243 | try: 244 | weight = softmax(routing_logits, 2) * mask 245 | except TypeError: 246 | weight = 
tf.transpose(softmax(tf.transpose(routing_logits, [0, 1, 3, 2])), 247 | [0, 1, 3, 2]) * mask # N,max_len,k_max,1 248 | if i < self.iteration_times - 1: 249 | Z = reduce_sum(tf.matmul(weight, behavior_embdding_mapping_), axis=1, keep_dims=True) # N,1,k_max,E 250 | interest_capsules = squash(Z) 251 | delta_routing_logits = reduce_sum( 252 | interest_capsules * behavior_embdding_mapping_, 253 | axis=-1, keep_dims=True 254 | ) 255 | routing_logits += delta_routing_logits 256 | else: 257 | Z = reduce_sum(tf.matmul(weight, behavior_embedding_mapping), axis=1, keep_dims=True) 258 | interest_capsules = squash(Z) 259 | 260 | interest_capsules = tf.reshape(interest_capsules, [-1, self.k_max, self.out_units]) 261 | return interest_capsules 262 | 263 | def compute_output_shape(self, input_shape): 264 | return (None, self.k_max, self.out_units) 265 | 266 | def get_config(self, ): 267 | config = {'input_units': self.input_units, 'out_units': self.out_units, 'max_len': self.max_len, 268 | 'k_max': self.k_max, 'iteration_times': self.iteration_times, "init_std": self.init_std} 269 | base_config = super(CapsuleLayer, self).get_config() 270 | return dict(list(base_config.items()) + list(config.items())) 271 | 272 | 273 | def squash(inputs): 274 | vec_squared_norm = reduce_sum(tf.square(inputs), axis=-1, keep_dims=True) 275 | scalar_factor = vec_squared_norm / (1 + vec_squared_norm) / tf.sqrt(vec_squared_norm + 1e-9) 276 | vec_squashed = scalar_factor * inputs 277 | return vec_squashed 278 | 279 | 280 | def inbatch_softmax_cross_entropy_with_logits(logits, item_count, item_idx): 281 | Q = tf.gather(tf.constant(item_count / np.sum(item_count), 'float32'), 282 | tf.squeeze(item_idx, axis=1)) 283 | try: 284 | logQ = tf.reshape(tf.math.log(Q), (1, -1)) 285 | logits -= logQ # subtract_log_q 286 | labels = tf.linalg.diag(tf.ones_like(logits[0])) 287 | except AttributeError: 288 | logQ = tf.reshape(tf.log(Q), (1, -1)) 289 | logits -= logQ # subtract_log_q 290 | labels = tf.diag(tf.ones_like(logits[0])) 291 | 292 | loss = tf.nn.softmax_cross_entropy_with_logits( 293 | labels=labels, logits=logits) 294 | return loss 295 | 296 | 297 | class EmbeddingIndex(Layer): 298 | 299 | def __init__(self, index, **kwargs): 300 | self.index = index 301 | super(EmbeddingIndex, self).__init__(**kwargs) 302 | 303 | def build(self, input_shape): 304 | super(EmbeddingIndex, self).build( 305 | input_shape) # Be sure to call this somewhere! 306 | 307 | def call(self, x, **kwargs): 308 | return tf.constant(self.index) 309 | 310 | def get_config(self, ): 311 | config = {'index': self.index, } 312 | base_config = super(EmbeddingIndex, self).get_config() 313 | return dict(list(base_config.items()) + list(config.items())) 314 | 315 | 316 | class MaskUserEmbedding(Layer): 317 | 318 | def __init__(self, k_max, **kwargs): 319 | self.k_max = k_max 320 | super(MaskUserEmbedding, self).__init__(**kwargs) 321 | 322 | def build(self, input_shape): 323 | super(MaskUserEmbedding, self).build( 324 | input_shape) # Be sure to call this somewhere! 
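    # At inference time (training is falsy), interest slots beyond each user's
    # dynamic interest number are zeroed out, so only the first `interest_num`
    # of the k_max user-interest embeddings contribute downstream.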
325 | 326 | def call(self, x, training=None, **kwargs): 327 | user_embedding, interest_num = x 328 | if not training: 329 | interest_mask = tf.sequence_mask(interest_num, self.k_max, tf.float32) 330 | interest_mask = tf.reshape(interest_mask, [-1, self.k_max, 1]) 331 | user_embedding *= interest_mask 332 | return user_embedding 333 | 334 | def get_config(self, ): 335 | config = {'k_max': self.k_max, } 336 | base_config = super(MaskUserEmbedding, self).get_config() 337 | return dict(list(base_config.items()) + list(config.items())) 338 | -------------------------------------------------------------------------------- /examples/colab_MovieLen1M_YoutubeDNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "rtox72csOQUN" 7 | }, 8 | "source": [ 9 | "# DeepMatch 样例代码\n", 10 | "- https://github.com/shenweichen/DeepMatch\n", 11 | "- https://deepmatch.readthedocs.io/en/latest/" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "id": "bTWHz-heMkyw" 18 | }, 19 | "source": [ 20 | "# 下载movielens-1M数据 安装依赖包" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "colab": { 28 | "base_uri": "https://localhost:8080/" 29 | }, 30 | "id": "yTl6d6jO1oqf", 31 | "outputId": "ee7303f1-8970-4726-a9f1-368798077228" 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "! wget http://files.grouplens.org/datasets/movielens/ml-1m.zip -O ./ml-1m.zip \n", 36 | "! wget https://raw.githubusercontent.com/shenweichen/DeepMatch/master/examples/preprocess.py -O preprocess.py\n", 37 | "! unzip -o ml-1m.zip \n", 38 | "! pip uninstall -y -q tensorflow\n", 39 | "! pip install -q tensorflow-gpu==2.5.0\n", 40 | "! 
pip install -q deepmatch" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "id": "p9UxNHuPMuW2" 47 | }, 48 | "source": [ 49 | "# 导入需要的库" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 1, 55 | "metadata": { 56 | "id": "C_ZR6gzp1E2N" 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "import pandas as pd\n", 61 | "from deepctr.feature_column import SparseFeat, VarLenSparseFeat\n", 62 | "from preprocess import gen_data_set, gen_model_input\n", 63 | "from sklearn.preprocessing import LabelEncoder\n", 64 | "from tensorflow.python.keras import backend as K\n", 65 | "from tensorflow.python.keras.models import Model\n", 66 | "\n", 67 | "from deepmatch.models import *\n", 68 | "from deepmatch.utils import sampledsoftmaxloss, NegativeSampler" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "fQq6O9XAMzPF" 75 | }, 76 | "source": [ 77 | "# 读取数据" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 2, 83 | "metadata": { 84 | "colab": { 85 | "base_uri": "https://localhost:8080/" 86 | }, 87 | "id": "lcO29zFb21Od", 88 | "outputId": "bfeed1ac-99f2-425f-dda6-10b83be721fe" 89 | }, 90 | "outputs": [ 91 | { 92 | "name": "stderr", 93 | "output_type": "stream", 94 | "text": [ 95 | "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", 96 | " after removing the cwd from sys.path.\n", 97 | "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", 98 | " \n", 99 | "/Users/swc/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:8: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", 100 | " \n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "data_path = \"./\"\n", 106 | "\n", 107 | "unames = ['user_id','gender','age','occupation','zip']\n", 108 | "user = pd.read_csv(data_path+'ml-1m/users.dat',sep='::',header=None,names=unames)\n", 109 | "rnames = ['user_id','movie_id','rating','timestamp']\n", 110 | "ratings = pd.read_csv(data_path+'ml-1m/ratings.dat',sep='::',header=None,names=rnames)\n", 111 | "mnames = ['movie_id','title','genres']\n", 112 | "movies = pd.read_csv(data_path+'ml-1m/movies.dat',sep='::',header=None,names=mnames,encoding=\"unicode_escape\")\n", 113 | "movies['genres'] = list(map(lambda x: x.split('|')[0], movies['genres'].values))\n", 114 | "\n", 115 | "data = pd.merge(pd.merge(ratings,movies),user)#.iloc[:10000]" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "id": "L0yCWxQxM3se" 122 | }, 123 | "source": [ 124 | "# 构建特征列,训练模型,导出embedding" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 3, 130 | "metadata": { 131 | "colab": { 132 | "base_uri": "https://localhost:8080/" 133 | }, 134 | "id": "BMOvk_de2ML3", 135 | "outputId": "962afe1c-d387-4345-861f-e9b974a0b495" 136 | }, 137 | "outputs": [ 138 | { 139 | "name": "stderr", 140 
| "output_type": "stream", 141 | "text": [ 142 | "100%|██████████| 6040/6040 [00:12<00:00, 488.35it/s]\n" 143 | ] 144 | }, 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "8 8\n", 150 | "Train on 988129 samples\n", 151 | "Epoch 1/20\n", 152 | "988129/988129 [==============================] - 38s 39us/sample - loss: 5.6344\n", 153 | "Epoch 2/20\n", 154 | "988129/988129 [==============================] - 41s 41us/sample - loss: 4.6947\n", 155 | "Epoch 3/20\n", 156 | "988129/988129 [==============================] - 39s 39us/sample - loss: 4.4681\n", 157 | "Epoch 4/20\n", 158 | "988129/988129 [==============================] - 38s 38us/sample - loss: 4.3227\n", 159 | "Epoch 5/20\n", 160 | "988129/988129 [==============================] - 38s 38us/sample - loss: 4.2224\n", 161 | "Epoch 6/20\n", 162 | "988129/988129 [==============================] - 37s 37us/sample - loss: 4.1463\n", 163 | "Epoch 7/20\n", 164 | "988129/988129 [==============================] - 37s 37us/sample - loss: 4.0843\n", 165 | "Epoch 8/20\n", 166 | "988129/988129 [==============================] - 37s 38us/sample - loss: 4.0339\n", 167 | "Epoch 9/20\n", 168 | "988129/988129 [==============================] - 44s 44us/sample - loss: 3.9941\n", 169 | "Epoch 10/20\n", 170 | "988129/988129 [==============================] - 38s 38us/sample - loss: 3.9619\n", 171 | "Epoch 11/20\n", 172 | "988129/988129 [==============================] - 43s 43us/sample - loss: 3.9349\n", 173 | "Epoch 12/20\n", 174 | "988129/988129 [==============================] - 39s 39us/sample - loss: 3.9112\n", 175 | "Epoch 13/20\n", 176 | "988129/988129 [==============================] - 39s 39us/sample - loss: 3.8902\n", 177 | "Epoch 14/20\n", 178 | "988129/988129 [==============================] - 39s 39us/sample - loss: 3.8712\n", 179 | "Epoch 15/20\n", 180 | "988129/988129 [==============================] - 38s 38us/sample - loss: 3.8560\n", 181 | "Epoch 16/20\n", 182 | "988129/988129 [==============================] - 39s 40us/sample - loss: 3.8413\n", 183 | "Epoch 17/20\n", 184 | "988129/988129 [==============================] - 39s 39us/sample - loss: 3.8285\n", 185 | "Epoch 18/20\n", 186 | "988129/988129 [==============================] - 38s 38us/sample - loss: 3.8185\n", 187 | "Epoch 19/20\n", 188 | "988129/988129 [==============================] - 40s 40us/sample - loss: 3.8069\n", 189 | "Epoch 20/20\n", 190 | "988129/988129 [==============================] - 40s 41us/sample - loss: 3.7964\n", 191 | "WARNING:tensorflow:From /Users/swc/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_v1.py:2070: Model.state_updates (from tensorflow.python.keras.engine.training) is deprecated and will be removed in a future version.\n", 192 | "Instructions for updating:\n", 193 | "This property should not be used in TensorFlow 2.0, as updates are applied automatically.\n", 194 | "(6040, 32)\n", 195 | "(3706, 32)\n" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "#data = pd.read_csv(\"./movielens_sample.txt\")\n", 201 | "sparse_features = [\"movie_id\", \"user_id\",\n", 202 | " \"gender\", \"age\", \"occupation\", \"zip\", \"genres\"]\n", 203 | "SEQ_LEN = 50\n", 204 | "negsample = 0\n", 205 | "\n", 206 | "# 1.Label Encoding for sparse features, and process sequence features with `gen_data_set` and `gen_model_input`\n", 207 | "\n", 208 | "feature_max_idx = {}\n", 209 | "for feature in sparse_features:\n", 210 | " lbe = LabelEncoder()\n", 211 | " data[feature] =
lbe.fit_transform(data[feature]) + 1\n", 212 | " feature_max_idx[feature] = data[feature].max() + 1\n", 213 | "\n", 214 | "user_profile = data[[\"user_id\", \"gender\", \"age\", \"occupation\", \"zip\"]].drop_duplicates('user_id')\n", 215 | "\n", 216 | "item_profile = data[[\"movie_id\"]].drop_duplicates('movie_id')\n", 217 | "\n", 218 | "user_profile.set_index(\"user_id\", inplace=True)\n", 219 | "\n", 220 | "user_item_list = data.groupby(\"user_id\")['movie_id'].apply(list)\n", 221 | "\n", 222 | "train_set, test_set = gen_data_set(data, SEQ_LEN, negsample)\n", 223 | "\n", 224 | "train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)\n", 225 | "test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)\n", 226 | "\n", 227 | "# 2.count #unique features for each sparse field and generate feature config for sequence feature\n", 228 | "\n", 229 | "embedding_dim = 32\n", 230 | "\n", 231 | "user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16),\n", 232 | " SparseFeat(\"gender\", feature_max_idx['gender'], 16),\n", 233 | " SparseFeat(\"age\", feature_max_idx['age'], 16),\n", 234 | " SparseFeat(\"occupation\", feature_max_idx['occupation'], 16),\n", 235 | " SparseFeat(\"zip\", feature_max_idx['zip'], 16),\n", 236 | " VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,\n", 237 | " embedding_name=\"movie_id\"), SEQ_LEN, 'mean', 'hist_len'),\n", 238 | " VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim,\n", 239 | " embedding_name=\"genres\"), SEQ_LEN, 'mean', 'hist_len'),\n", 240 | " ]\n", 241 | "\n", 242 | "item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]\n", 243 | "\n", 244 | "from collections import Counter\n", 245 | "train_counter = Counter(train_model_input['movie_id'])\n", 246 | "item_count = [train_counter.get(i,0) for i in range(item_feature_columns[0].vocabulary_size)]\n", 247 | "sampler_config = NegativeSampler('frequency',num_sampled=255,item_name=\"movie_id\",item_count=item_count)\n", 248 | "\n", 249 | "# 3.Define Model and train\n", 250 | "\n", 251 | "import tensorflow as tf\n", 252 | "if tf.__version__ >= '2.0.0':\n", 253 | " tf.compat.v1.disable_eager_execution()\n", 254 | "else:\n", 255 | " K.set_learning_phase(True)\n", 256 | " \n", 257 | "model = YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(128,64, embedding_dim), sampler_config=sampler_config)\n", 258 | "#model = MIND(user_feature_columns,item_feature_columns,dynamic_k=False,k_max=2, user_dnn_hidden_units=(128,64, embedding_dim), sampler_config=sampler_config)\n", 259 | "\n", 260 | "model.compile(optimizer=\"adam\", loss=sampledsoftmaxloss)\n", 261 | "\n", 262 | "history = model.fit(train_model_input, train_label, # train_label,\n", 263 | " batch_size=512, epochs=20, verbose=1, validation_split=0.0, )\n", 264 | "\n", 265 | "# 4. 
Generate user features for testing and full item features for retrieval\n", 266 | "test_user_model_input = test_model_input\n", 267 | "all_item_model_input = {\"movie_id\": item_profile['movie_id'].values,}\n", 268 | "\n", 269 | "user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)\n", 270 | "item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)\n", 271 | "\n", 272 | "user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)\n", 273 | "# user_embs = user_embs[:, i, :] # i in [0,k_max) if MIND\n", 274 | "item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)\n", 275 | "\n", 276 | "print(user_embs.shape)\n", 277 | "print(item_embs.shape)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": { 283 | "id": "w_G3KWslKmJo" 284 | }, 285 | "source": [ 286 | "# Use faiss for ANN search and evaluate the results" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": { 292 | "id": "5SvyQLNVKkcs" 293 | }, 294 | "source": [] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 4, 299 | "metadata": { 300 | "colab": { 301 | "base_uri": "https://localhost:8080/" 302 | }, 303 | "id": "j2ZNYNBOOqrN", 304 | "outputId": "2eec5e82-2d2b-4fe0-9b83-2a74a4dc52ba" 305 | }, 306 | "outputs": [ 307 | { 308 | "name": "stdout", 309 | "output_type": "stream", 310 | "text": [ 311 | "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", 312 | "Requirement already satisfied: faiss-cpu in /usr/local/lib/python3.7/dist-packages (1.7.2)\n" 313 | ] 314 | } 315 | ], 316 | "source": [ 317 | "! pip install faiss-cpu" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 5, 323 | "metadata": { 324 | "colab": { 325 | "base_uri": "https://localhost:8080/" 326 | }, 327 | "id": "6TY1l27iJU8U", 328 | "outputId": "5a8ccdd3-af70-4c48-b859-84c4befddfdd" 329 | }, 330 | "outputs": [ 331 | { 332 | "name": "stderr", 333 | "output_type": "stream", 334 | "text": [ 335 | "6040it [00:02, 2769.01it/s]" 336 | ] 337 | }, 338 | { 339 | "name": "stdout", 340 | "output_type": "stream", 341 | "text": [ 342 | "\n", 343 | "recall 0.33708609271523177\n", 344 | "hit rate 0.33708609271523177\n" 345 | ] 346 | }, 347 | { 348 | "name": "stderr", 349 | "output_type": "stream", 350 | "text": [ 351 | "\n" 352 | ] 353 | } 354 | ], 355 | "source": [ 356 | "test_true_label = {line[0]:[line[1]] for line in test_set}\n", 357 | "\n", 358 | "import numpy as np\n", 359 | "import faiss\n", 360 | "from tqdm import tqdm\n", 361 | "from deepmatch.utils import recall_N\n", 362 | "\n", 363 | "index = faiss.IndexFlatIP(embedding_dim)\n", 364 | "# faiss.normalize_L2(item_embs)\n", 365 | "index.add(item_embs)\n", 366 | "# faiss.normalize_L2(user_embs)\n", 367 | "D, I = index.search(np.ascontiguousarray(user_embs), 50)\n", 368 | "s = []\n", 369 | "hit = 0\n", 370 | "for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):\n", 371 | " try:\n", 372 | " pred = [item_profile['movie_id'].values[x] for x in I[i]]\n", 373 | " filter_item = None\n", 374 | " recall_score = recall_N(test_true_label[uid], pred, N=50)\n", 375 | " s.append(recall_score)\n", 376 | " if test_true_label[uid] in pred:\n", 377 | " hit += 1\n", 378 | " except:\n", 379 | " print(i)\n", 380 | "print(\"\")\n", 381 | "print(\"recall\", np.mean(s))\n", 382 | "print(\"hit rate\", hit / len(test_user_model_input['user_id']))" 383 | ] 384 | } 385 | ], 386 | "metadata": { 387 | "accelerator": "GPU", 388 |
"colab": { 389 | "collapsed_sections": [], 390 | "name": "colab_MovieLen1M_YoutubeDNN.ipynb", 391 | "provenance": [] 392 | }, 393 | "gpuClass": "standard", 394 | "kernelspec": { 395 | "display_name": "Python 3", 396 | "language": "python", 397 | "name": "python3" 398 | }, 399 | "language_info": { 400 | "codemirror_mode": { 401 | "name": "ipython", 402 | "version": 3 403 | }, 404 | "file_extension": ".py", 405 | "mimetype": "text/x-python", 406 | "name": "python", 407 | "nbconvert_exporter": "python", 408 | "pygments_lexer": "ipython3", 409 | "version": "3.6.5" 410 | } 411 | }, 412 | "nbformat": 4, 413 | "nbformat_minor": 1 414 | } 415 | -------------------------------------------------------------------------------- /deepmatch/layers/interaction.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Author: 4 | Weichen Shen,weichenswc@163.com 5 | 6 | """ 7 | 8 | import tensorflow as tf 9 | from deepctr.layers.normalization import LayerNormalization 10 | from deepctr.layers.utils import softmax, reduce_mean 11 | from tensorflow.python.keras.initializers import TruncatedNormal 12 | from tensorflow.python.keras.layers import Layer, Dense, Dropout 13 | 14 | 15 | class DotAttention(Layer): 16 | """ 17 | :param query: [batch_size, 1, C] 18 | :param key: [batch_size, T, C] 19 | :return: [batch_size, 1, T] 20 | """ 21 | 22 | def __init__(self, scale=True, **kwargs): 23 | self.scale = scale 24 | super(DotAttention, self).__init__(**kwargs) 25 | 26 | def build(self, input_shape): 27 | if not isinstance(input_shape, list) or len(input_shape) != 2: 28 | raise ValueError('A `DotAttention` layer should be called ' 29 | 'on a list of 2 tensors') 30 | if input_shape[0][-1] != input_shape[1][-1]: 31 | raise ValueError('query_size should keep the same dim with key_size') 32 | super(DotAttention, self).build(input_shape) 33 | 34 | def call(self, inputs, mask=None, **kwargs): 35 | query, key = inputs 36 | output = tf.matmul(query, tf.transpose(key, [0, 2, 1])) 37 | if self.scale == True: 38 | output = output / (key.get_shape().as_list()[-1] ** 0.5) 39 | return output 40 | 41 | def compute_output_shape(self, input_shape): 42 | return (None, 1, input_shape[1][1]) 43 | 44 | def compute_mask(self, inputs, mask): 45 | return mask 46 | 47 | 48 | class ConcatAttention(Layer): 49 | """ 50 | :param query: [batch_size, T, C_q] 51 | :param key: [batch_size, T, C_k] 52 | :return: [batch_size, 1, T] 53 | query_size should keep the same dim with key_size 54 | """ 55 | 56 | def __init__(self, scale=True, **kwargs): 57 | self.scale = scale 58 | super(ConcatAttention, self).__init__(**kwargs) 59 | 60 | def build(self, input_shape): 61 | if not isinstance(input_shape, list) or len(input_shape) != 2: 62 | raise ValueError('A `ConcatAttention` layer should be called ' 63 | 'on a list of 2 tensors') 64 | self.projection_layer = Dense(units=1, activation='tanh') 65 | super(ConcatAttention, self).build(input_shape) 66 | 67 | def call(self, inputs, mask=None, **kwargs): 68 | query, key = inputs 69 | q_k = tf.concat([query, key], axis=-1) 70 | output = self.projection_layer(q_k) 71 | if self.scale == True: 72 | output = output / (key.get_shape().as_list()[-1] ** 0.5) 73 | output = tf.transpose(output, [0, 2, 1]) 74 | return output 75 | 76 | def compute_output_shape(self, input_shape): 77 | return (None, 1, input_shape[1][1]) 78 | 79 | def compute_mask(self, inputs, mask): 80 | return mask 81 | 82 | 83 | class SoftmaxWeightedSum(Layer): 84 | """ 85 | :param align: [batch_size, 1, T] 86 | 
:param value: [batch_size, T, units] 87 | :param key_masks: [batch_size, 1, T] 88 | 2nd dim size with align 89 | :param drop_out: 90 | :param future_binding: 91 | :return: weighted sum vector 92 | [batch_size, 1, units] 93 | """ 94 | 95 | def __init__(self, dropout_rate=0.2, future_binding=False, seed=2020, **kwargs): 96 | self.dropout_rate = dropout_rate 97 | self.future_binding = future_binding 98 | self.seed = seed 99 | super(SoftmaxWeightedSum, self).__init__(**kwargs) 100 | 101 | def build(self, input_shape): 102 | if not isinstance(input_shape, list) or len(input_shape) != 3: 103 | raise ValueError('A `SoftmaxWeightedSum` layer should be called ' 104 | 'on a list of 3 tensors') 105 | if input_shape[0][-1] != input_shape[2][-1]: 106 | raise ValueError('query_size should keep the same dim with key_mask_size') 107 | self.dropout = Dropout(self.dropout_rate, seed=self.seed) 108 | super(SoftmaxWeightedSum, self).build(input_shape) 109 | 110 | def call(self, inputs, mask=None, training=None, **kwargs): 111 | align, value, key_masks = inputs 112 | paddings = tf.ones_like(align) * (-2 ** 32 + 1) 113 | align = tf.where(key_masks, align, paddings) 114 | if self.future_binding: 115 | length = value.get_shape().as_list()[1] 116 | lower_tri = tf.ones([length, length]) 117 | try: 118 | lower_tri = tf.contrib.linalg.LinearOperatorTriL(lower_tri).to_dense() 119 | except AttributeError: 120 | lower_tri = tf.linalg.LinearOperatorLowerTriangular(lower_tri).to_dense() 121 | masks = tf.tile(tf.expand_dims(lower_tri, 0), [tf.shape(align)[0], 1, 1]) 122 | align = tf.where(tf.equal(masks, 0), paddings, align) 123 | align = softmax(align) 124 | align = self.dropout(align, training=training) 125 | output = tf.matmul(align, value) 126 | return output 127 | 128 | def compute_output_shape(self, input_shape): 129 | return (None, 1, input_shape[1][1]) 130 | 131 | def get_config(self, ): 132 | config = {'dropout_rate': self.dropout_rate, 'future_binding': self.future_binding} 133 | base_config = super(SoftmaxWeightedSum, self).get_config() 134 | return dict(list(base_config.items()) + list(config.items())) 135 | 136 | def compute_mask(self, inputs, mask): 137 | return mask 138 | 139 | 140 | class AttentionSequencePoolingLayer(Layer): 141 | """ 142 | :param query: [batch_size, 1, C_q] 143 | :param keys: [batch_size, T, C_k] 144 | :param keys_length: [batch_size, 1] 145 | :return: [batch_size, 1, C_k] 146 | """ 147 | 148 | def __init__(self, dropout_rate=0, **kwargs): 149 | self.dropout_rate = dropout_rate 150 | super(AttentionSequencePoolingLayer, self).__init__(**kwargs) 151 | 152 | def build(self, input_shape): 153 | if not isinstance(input_shape, list) or len(input_shape) != 3: 154 | raise ValueError('A `SequenceFeatureMask` layer should be called ' 155 | 'on a list of 3 inputs') 156 | self.concat_att = ConcatAttention() 157 | self.softmax_weight_sum = SoftmaxWeightedSum(dropout_rate=self.dropout_rate, future_binding=False) 158 | super(AttentionSequencePoolingLayer, self).build(input_shape) 159 | 160 | def call(self, inputs, mask=None, **kwargs): 161 | queries, keys, keys_length = inputs 162 | hist_len = keys.get_shape()[1] 163 | key_masks = tf.sequence_mask(keys_length, hist_len) 164 | queries = tf.tile(queries, [1, hist_len, 1]) # [batch_size, T, units] 165 | attention_score = self.concat_att([queries, keys]) # [batch_size, 1, units] 166 | 167 | outputs = self.softmax_weight_sum([attention_score, keys, key_masks]) 168 | # [batch_size, units] 169 | return outputs 170 | 171 | def compute_output_shape(self, 
input_shape): 172 | return (None, 1, input_shape[1][1]) 173 | 174 | def get_config(self, ): 175 | config = {'dropout_rate': self.dropout_rate} 176 | base_config = super(AttentionSequencePoolingLayer, self).get_config() 177 | return dict(list(base_config.items()) + list(config.items())) 178 | 179 | def compute_mask(self, inputs, mask): 180 | return mask 181 | 182 | 183 | class SelfAttention(Layer): 184 | """ 185 | :param input: A 3d tensor with shape of [batch_size, 1, C] 186 | :param key_masks: A 3d tensor with shape of [batch_size, 1] 187 | :return: A 3d tensor with shape of [batch_size, 1] 188 | """ 189 | 190 | def __init__(self, scale=True, dropout_rate=0.2, future_binding=True, use_layer_norm=True, seed=2020, **kwargs): 191 | self.scale = scale 192 | self.dropout_rate = dropout_rate 193 | self.future_binding = future_binding 194 | self.use_layer_norm = use_layer_norm 195 | self.seed = seed 196 | super(SelfAttention, self).__init__(**kwargs) 197 | 198 | def build(self, input_shape): 199 | if not isinstance(input_shape, list) or len(input_shape) != 2: 200 | raise ValueError('A `SelfAttention` layer should be called ' 201 | 'on a list of 2 tensors') 202 | self.layer_norm = LayerNormalization() 203 | self.attention = DotAttention(scale=self.scale) 204 | self.softmax_weight_sum = SoftmaxWeightedSum(dropout_rate=self.dropout_rate, future_binding=self.future_binding, 205 | seed=self.seed) 206 | super(SelfAttention, self).build(input_shape) 207 | 208 | def call(self, inputs, mask=None, **kwargs): 209 | _input, key_masks = inputs 210 | querys, keys, values = _input, _input, _input 211 | align = self.attention([querys, keys]) 212 | output = self.softmax_weight_sum([align, values, key_masks]) 213 | if self.use_layer_norm: 214 | output = self.layer_norm(output) 215 | return reduce_mean(output, 1, keep_dims=True) 216 | 217 | def compute_output_shape(self, input_shape): 218 | return (None, 1, input_shape[0][-1]) 219 | 220 | def compute_mask(self, inputs, mask): 221 | return mask 222 | 223 | 224 | class SelfMultiHeadAttention(Layer): 225 | """ 226 | :param query: A 3d tensor with shape of [batch_size, T, C] 227 | :param key_masks: A 3d tensor with shape of [batch_size, 1] 228 | :return: A 3d tensor with shape of [batch_size, T, C] 229 | """ 230 | 231 | def __init__(self, num_units=8, head_num=4, scale=True, dropout_rate=0.2, future_binding=True, use_layer_norm=True, 232 | use_res=True, 233 | seed=2020, **kwargs): 234 | if head_num <= 0: 235 | raise ValueError('head_num must be a int > 0') 236 | self.num_units = num_units 237 | self.head_num = head_num 238 | self.scale = scale 239 | self.dropout_rate = dropout_rate 240 | self.future_binding = future_binding 241 | self.use_layer_norm = use_layer_norm 242 | self.use_res = use_res 243 | self.seed = seed 244 | super(SelfMultiHeadAttention, self).__init__(**kwargs) 245 | 246 | def build(self, input_shape): 247 | if not isinstance(input_shape, list) or len(input_shape) != 2: 248 | raise ValueError('A `SelfMultiHeadAttention` layer should be called ' 249 | 'on a list of 2 tensors') 250 | if len(input_shape[0]) != 3 or len(input_shape[1]) != 2: 251 | raise ValueError('input: [N, T_k, d_model], key masks: [N, key_seqlen]') 252 | embedding_size = int(input_shape[0][-1]) 253 | if self.num_units == None: 254 | self.num_units = embedding_size 255 | self.W = self.add_weight(name='Q_K_V', shape=[embedding_size, self.num_units * 3], 256 | dtype=tf.float32, 257 | initializer=TruncatedNormal(seed=self.seed)) 258 | self.W_output = self.add_weight(name='output_W', 
shape=[self.num_units, self.num_units], 259 | dtype=tf.float32, 260 | initializer=TruncatedNormal(seed=self.seed)) 261 | 262 | self.layer_norm = LayerNormalization() 263 | self.attention = DotAttention(scale=self.scale) 264 | self.softmax_weight_sum = SoftmaxWeightedSum(dropout_rate=self.dropout_rate, future_binding=self.future_binding, 265 | seed=self.seed) 266 | self.dropout = Dropout(self.dropout_rate, seed=self.seed) 267 | self.seq_len_max = int(input_shape[0][1]) 268 | # Be sure to call this somewhere! 269 | super(SelfMultiHeadAttention, self).build(input_shape) 270 | 271 | def call(self, inputs, mask=None, training=None, **kwargs): 272 | input_info, keys_length = inputs 273 | 274 | hist_len = input_info.get_shape()[1] 275 | key_masks = tf.sequence_mask(keys_length, hist_len) 276 | key_masks = tf.squeeze(key_masks, axis=1) 277 | 278 | Q_K_V = tf.tensordot(input_info, self.W, axes=(-1, 0)) # [N T_q D*3] 279 | querys, keys, values = tf.split(Q_K_V, 3, -1) 280 | 281 | # head_num None F D 282 | querys = tf.concat(tf.split(querys, self.head_num, axis=2), axis=0) # (h*N, T_q, C/h) 283 | keys = tf.concat(tf.split(keys, self.head_num, axis=2), axis=0) # (h*N, T_k, C/h) 284 | values = tf.concat(tf.split(values, self.head_num, axis=2), axis=0) # (h*N, T_k, C/h) 285 | 286 | # (h*N, T_q, T_k) 287 | align = self.attention([querys, keys]) 288 | 289 | key_masks = tf.tile(key_masks, [self.head_num, 1]) # (h*N, T_k) 290 | key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(input_info)[1], 1]) # (h*N, T_q, T_k) 291 | 292 | outputs = self.softmax_weight_sum([align, values, key_masks]) # (h*N, T_q, C/h) 293 | outputs = tf.concat(tf.split(outputs, self.head_num, axis=0), axis=2) # (N, T_q, C) 294 | 295 | outputs = tf.tensordot(outputs, self.W_output, axes=(-1, 0)) # (N, T_q, C) 296 | outputs = self.dropout(outputs, training=training) 297 | if self.use_res: 298 | outputs += input_info 299 | if self.use_layer_norm: 300 | outputs = self.layer_norm(outputs) 301 | 302 | return outputs 303 | 304 | def compute_output_shape(self, input_shape): 305 | return (None, input_shape[0][1], self.num_units) 306 | 307 | def get_config(self, ): 308 | config = {'num_units': self.num_units, 'head_num': self.head_num, 'scale': self.scale, 309 | 'dropout_rate': self.dropout_rate, 310 | 'future_binding': self.future_binding, 'use_layer_norm': self.use_layer_norm, 'use_res': self.use_res, 311 | 'seed': self.seed} 312 | base_config = super(SelfMultiHeadAttention, self).get_config() 313 | return dict(list(base_config.items()) + list(config.items())) 314 | 315 | def compute_mask(self, inputs, mask): 316 | return mask 317 | 318 | 319 | class UserAttention(Layer): 320 | """ 321 | :param query: A 3d tensor with shape of [batch_size, T, C] 322 | :param keys: A 3d tensor with shape of [batch_size, T, C] 323 | :param key_masks: A 3d tensor with shape of [batch_size, 1] 324 | :return: A 3d tensor with shape of [batch_size, 1, C] 325 | """ 326 | 327 | def __init__(self, num_units=None, activation='tanh', use_res=True, dropout_rate=0, scale=True, seed=2020, 328 | **kwargs): 329 | self.scale = scale 330 | self.num_units = num_units 331 | self.activation = activation 332 | self.dropout_rate = dropout_rate 333 | self.seed = seed 334 | self.use_res = use_res 335 | super(UserAttention, self).__init__(**kwargs) 336 | 337 | def build(self, input_shape): 338 | if not isinstance(input_shape, list) or len(input_shape) != 3: 339 | raise ValueError('A `UserAttention` layer should be called ' 340 | 'on a list of 3 tensors') 341 | if 
self.num_units == None: 342 | self.num_units = input_shape[0][-1] 343 | self.dense = Dense(self.num_units, activation=self.activation) 344 | self.attention = DotAttention(scale=self.scale) 345 | self.softmax_weight_sum = SoftmaxWeightedSum(dropout_rate=self.dropout_rate, seed=self.seed) 346 | super(UserAttention, self).build(input_shape) 347 | 348 | def call(self, inputs, mask=None, **kwargs): 349 | user_query, keys, keys_length = inputs 350 | hist_len = keys.get_shape()[1] 351 | key_masks = tf.sequence_mask(keys_length, hist_len) 352 | query = self.dense(user_query) 353 | 354 | align = self.attention([query, keys]) 355 | 356 | output = self.softmax_weight_sum([align, keys, key_masks]) 357 | 358 | if self.use_res: 359 | output += keys 360 | return reduce_mean(output, 1, keep_dims=True) 361 | 362 | def compute_output_shape(self, input_shape): 363 | return (None, 1, input_shape[1][2]) 364 | 365 | def compute_mask(self, inputs, mask): 366 | return mask 367 | 368 | def get_config(self, ): 369 | config = {'num_units': self.num_units, 'activation': self.activation, 'use_res': self.use_res, 370 | 'dropout_rate': self.dropout_rate, 371 | 'scale': self.scale, 'seed': self.seed, } 372 | base_config = super(UserAttention, self).get_config() 373 | return dict(list(base_config.items()) + list(config.items())) 374 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import inspect 4 | import numpy as np 5 | import os 6 | import sys 7 | import tensorflow as tf 8 | from deepctr.feature_column import SparseFeat, DenseFeat, VarLenSparseFeat, DEFAULT_GROUP_NAME 9 | from deepmatch.layers import custom_objects 10 | from numpy.testing import assert_allclose 11 | from tensorflow.python.keras import backend as K 12 | from tensorflow.python.keras.layers import Input, Masking 13 | from tensorflow.python.keras.models import Model, load_model, save_model 14 | 15 | SAMPLE_SIZE = 8 16 | VOCABULARY_SIZE = 4 17 | 18 | 19 | def gen_sequence(dim, max_len, sample_size): 20 | return np.array([np.random.randint(0, dim, max_len) for _ in range(sample_size)]), np.random.randint(1, max_len + 1, 21 | sample_size) 22 | 23 | 24 | def get_test_data(sample_size=1000, embedding_size=4, sparse_feature_num=1, dense_feature_num=1, 25 | sequence_feature=['sum', 'mean', 'max', 'weight'], classification=True, include_length=False, 26 | hash_flag=False, prefix='', use_group=False): 27 | feature_columns = [] 28 | model_input = {} 29 | 30 | if 'weight' in sequence_feature: 31 | feature_columns.append( 32 | VarLenSparseFeat(SparseFeat(prefix + "weighted_seq", vocabulary_size=2, embedding_dim=embedding_size), 33 | maxlen=3, length_name=prefix + "weighted_seq" + "_seq_length", 34 | weight_name=prefix + "weight")) 35 | s_input, s_len_input = gen_sequence( 36 | 2, 3, sample_size) 37 | 38 | model_input[prefix + "weighted_seq"] = s_input 39 | model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1) 40 | model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input 41 | sequence_feature.pop(sequence_feature.index('weight')) 42 | 43 | for i in range(sparse_feature_num): 44 | if use_group: 45 | group_name = str(i % 3) 46 | else: 47 | group_name = DEFAULT_GROUP_NAME 48 | dim = np.random.randint(1, 10) 49 | feature_columns.append( 50 | SparseFeat(prefix + 'sparse_feature_' + str(i), dim, embedding_size, use_hash=hash_flag, dtype=tf.int32, 
51 | group_name=group_name)) 52 | 53 | for i in range(dense_feature_num): 54 | feature_columns.append(DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=tf.float32)) 55 | for i, mode in enumerate(sequence_feature): 56 | dim = np.random.randint(1, 10) 57 | maxlen = np.random.randint(1, 10) 58 | feature_columns.append( 59 | VarLenSparseFeat(SparseFeat(prefix + 'sequence_' + mode, vocabulary_size=dim, embedding_dim=embedding_size), 60 | maxlen=maxlen, combiner=mode)) 61 | 62 | for fc in feature_columns: 63 | if isinstance(fc, SparseFeat): 64 | model_input[fc.name] = np.random.randint(0, fc.vocabulary_size, sample_size) 65 | elif isinstance(fc, DenseFeat): 66 | model_input[fc.name] = np.random.random(sample_size) 67 | else: 68 | s_input, s_len_input = gen_sequence( 69 | fc.vocabulary_size, fc.maxlen, sample_size) 70 | model_input[fc.name] = s_input 71 | if include_length: 72 | fc.length_name = prefix + "sequence_" + str(i) + '_seq_length' 73 | model_input[prefix + "sequence_" + str(i) + '_seq_length'] = s_len_input 74 | 75 | if classification: 76 | y = np.random.randint(0, 2, sample_size) 77 | else: 78 | y = np.random.random(sample_size) 79 | 80 | return model_input, y, feature_columns 81 | 82 | 83 | def layer_test(layer_cls, kwargs={}, input_shape=None, input_dtype=None, 84 | 85 | input_data=None, expected_output=None, 86 | 87 | expected_output_dtype=None, fixed_batch_size=False, supports_masking=False): 88 | # generate input data 89 | 90 | if input_data is None: 91 | 92 | if not input_shape: 93 | raise AssertionError() 94 | 95 | if not input_dtype: 96 | input_dtype = K.floatx() 97 | 98 | input_data_shape = list(input_shape) 99 | 100 | for i, e in enumerate(input_data_shape): 101 | 102 | if e is None: 103 | input_data_shape[i] = np.random.randint(1, 4) 104 | input_mask = [] 105 | if all(isinstance(e, tuple) for e in input_data_shape): 106 | input_data = [] 107 | 108 | for e in input_data_shape: 109 | input_data.append( 110 | (10 * np.random.random(e)).astype(input_dtype)) 111 | if supports_masking: 112 | a = np.full(e[:2], False) 113 | a[:, :e[1] // 2] = True 114 | input_mask.append(a) 115 | 116 | else: 117 | 118 | input_data = (10 * np.random.random(input_data_shape)) 119 | 120 | input_data = input_data.astype(input_dtype) 121 | if supports_masking: 122 | a = np.full(input_data_shape[:2], False) 123 | a[:, :input_data_shape[1] // 2] = True 124 | 125 | print(a) 126 | print(a.shape) 127 | input_mask.append(a) 128 | 129 | else: 130 | 131 | if input_shape is None: 132 | input_shape = input_data.shape 133 | 134 | if input_dtype is None: 135 | input_dtype = input_data.dtype 136 | 137 | if expected_output_dtype is None: 138 | expected_output_dtype = input_dtype 139 | 140 | # instantiation 141 | 142 | layer = layer_cls(**kwargs) 143 | 144 | # test get_weights , set_weights at layer level 145 | 146 | weights = layer.get_weights() 147 | 148 | layer.set_weights(weights) 149 | 150 | try: 151 | expected_output_shape = layer.compute_output_shape(input_shape) 152 | except Exception: 153 | expected_output_shape = layer._compute_output_shape(input_shape) 154 | 155 | # test in functional API 156 | if isinstance(input_shape, list): 157 | if fixed_batch_size: 158 | 159 | x = [Input(batch_shape=e, dtype=input_dtype) for e in input_shape] 160 | if supports_masking: 161 | mask = [Input(batch_shape=e[0:2], dtype=bool) 162 | for e in input_shape] 163 | 164 | else: 165 | 166 | x = [Input(shape=e[1:], dtype=input_dtype) for e in input_shape] 167 | if supports_masking: 168 | mask = [Input(shape=(e[1],), dtype=bool) 
for e in input_shape] 169 | 170 | else: 171 | if fixed_batch_size: 172 | 173 | x = Input(batch_shape=input_shape, dtype=input_dtype) 174 | if supports_masking: 175 | mask = Input(batch_shape=input_shape[0:2], dtype=bool) 176 | 177 | else: 178 | 179 | x = Input(shape=input_shape[1:], dtype=input_dtype) 180 | if supports_masking: 181 | mask = Input(shape=(input_shape[1],), dtype=bool) 182 | 183 | if supports_masking: 184 | 185 | y = layer(Masking()(x), mask=mask) 186 | else: 187 | y = layer(x) 188 | 189 | if not (K.dtype(y) == expected_output_dtype): 190 | raise AssertionError() 191 | 192 | # check with the functional API 193 | if supports_masking: 194 | model = Model([x, mask], y) 195 | 196 | actual_output = model.predict([input_data, input_mask[0]]) 197 | else: 198 | model = Model(x, y) 199 | 200 | actual_output = model.predict(input_data) 201 | 202 | actual_output_shape = actual_output.shape 203 | for expected_dim, actual_dim in zip(expected_output_shape, 204 | 205 | actual_output_shape): 206 | 207 | if expected_dim is not None: 208 | 209 | if not (expected_dim == actual_dim): 210 | raise AssertionError("expected_shape", expected_output_shape, "actual_shape", actual_output_shape) 211 | 212 | if expected_output is not None: 213 | assert_allclose(actual_output, expected_output, rtol=1e-3) 214 | 215 | # test serialization, weight setting at model level 216 | 217 | model_config = model.get_config() 218 | 219 | recovered_model = model.__class__.from_config(model_config) 220 | 221 | if model.weights: 222 | weights = model.get_weights() 223 | 224 | recovered_model.set_weights(weights) 225 | 226 | _output = recovered_model.predict(input_data) 227 | 228 | assert_allclose(_output, actual_output, rtol=1e-3) 229 | 230 | # test training mode (e.g. useful when the layer has a 231 | 232 | # different behavior at training and testing time). 233 | 234 | if has_arg(layer.call, 'training'): 235 | model.compile('rmsprop', 'mse') 236 | 237 | model.train_on_batch(input_data, actual_output) 238 | 239 | # test instantiation from layer config 240 | 241 | layer_config = layer.get_config() 242 | 243 | layer_config['batch_input_shape'] = input_shape 244 | 245 | layer = layer.__class__.from_config(layer_config) 246 | 247 | # for further checks in the caller function 248 | 249 | return actual_output 250 | 251 | 252 | def has_arg(fn, name, accept_all=False): 253 | """Checks if a callable accepts a given keyword argument. 254 | 255 | 256 | 257 | For Python 2, checks if there is an argument with the given name. 258 | 259 | 260 | 261 | For Python 3, checks if there is an argument with the given name, and 262 | 263 | also whether this argument can be called with a keyword (i.e. if it is 264 | 265 | not a positional-only argument). 266 | 267 | 268 | 269 | # Arguments 270 | 271 | fn: Callable to inspect. 272 | 273 | name: Check if `fn` can be called with `name` as a keyword argument. 274 | 275 | accept_all: What to return if there is no parameter called `name` 276 | 277 | but the function accepts a `**kwargs` argument. 278 | 279 | 280 | 281 | # Returns 282 | 283 | bool, whether `fn` accepts a `name` keyword argument. 
284 | 285 | """ 286 | 287 | if sys.version_info < (3,): 288 | 289 | arg_spec = inspect.getargspec(fn) 290 | 291 | if accept_all and arg_spec.keywords is not None: 292 | return True 293 | 294 | return (name in arg_spec.args) 295 | 296 | elif sys.version_info < (3, 3): 297 | 298 | arg_spec = inspect.getfullargspec(fn) 299 | 300 | if accept_all and arg_spec.varkw is not None: 301 | return True 302 | 303 | return (name in arg_spec.args or 304 | 305 | name in arg_spec.kwonlyargs) 306 | 307 | else: 308 | 309 | signature = inspect.signature(fn) 310 | 311 | parameter = signature.parameters.get(name) 312 | 313 | if parameter is None: 314 | 315 | if accept_all: 316 | 317 | for param in signature.parameters.values(): 318 | 319 | if param.kind == inspect.Parameter.VAR_KEYWORD: 320 | return True 321 | 322 | return False 323 | 324 | return (parameter.kind in (inspect.Parameter.POSITIONAL_OR_KEYWORD, 325 | 326 | inspect.Parameter.KEYWORD_ONLY)) 327 | 328 | 329 | def check_model(model, model_name, x, y, check_model_io=True): 330 | """ 331 | compile model,train and evaluate it,then save/load weight and model file. 332 | :param model: 333 | :param model_name: 334 | :param x: 335 | :param y: 336 | :param check_model_io: test save/load model file or not 337 | :return: 338 | """ 339 | 340 | model.fit(x, y, batch_size=10, epochs=2, validation_split=0.5) 341 | 342 | print(model_name + " test train valid pass!") 343 | 344 | user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding) 345 | item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding) 346 | 347 | _ = user_embedding_model.predict(x, batch_size=2 ** 12) 348 | # user_embs = user_embs[:, i, :] i in [0,k_max) if MIND 349 | print(model_name + " user_emb pass!") 350 | _ = item_embedding_model.predict(x, batch_size=2 ** 12) 351 | 352 | print(model_name + " item_emb pass!") 353 | 354 | model.save_weights(model_name + '_weights.h5') 355 | model.load_weights(model_name + '_weights.h5') 356 | os.remove(model_name + '_weights.h5') 357 | print(model_name + " test save load weight pass!") 358 | if check_model_io: 359 | save_model(model, model_name + '.h5') 360 | model = load_model(model_name + '.h5', custom_objects) 361 | os.remove(model_name + '.h5') 362 | print(model_name + " test save load model pass!") 363 | 364 | print(model_name + " test pass!") 365 | # print(1) 366 | # 367 | # save_model(item_embedding_model, model_name + '.user.h5') 368 | # print(2) 369 | # 370 | # item_embedding_model = load_model(model_name + '.user.h5', custom_objects) 371 | # print(3) 372 | # 373 | # item_embs = item_embedding_model.predict(x, batch_size=2 ** 12) 374 | # print(item_embs) 375 | # print("go") 376 | 377 | 378 | def get_xy_fd(hash_flag=False): 379 | user_feature_columns = [SparseFeat('user', 3), SparseFeat( 380 | 'gender', 2), VarLenSparseFeat( 381 | SparseFeat('hist_item', vocabulary_size=3 + 1, embedding_dim=4, embedding_name='item'), maxlen=4, 382 | length_name="hist_len")] 383 | item_feature_columns = [SparseFeat('item', 3 + 1, embedding_dim=4, )] 384 | 385 | uid = np.array([0, 1, 2, 1]) 386 | ugender = np.array([0, 1, 0, 1]) 387 | iid = np.array([1, 2, 3, 1]) # 0 is mask value 388 | 389 | hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0], [3, 0, 0, 0]]) 390 | hist_len = np.array([3, 3, 2, 1]) 391 | 392 | feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 393 | 'hist_item': hist_iid, "hist_len": hist_len} 394 | 395 | # feature_names = get_feature_names(feature_columns) 396 | x = feature_dict 397 | y = 
np.array([1, 1, 1, 1]) 398 | return x, y, user_feature_columns, item_feature_columns 399 | 400 | 401 | def get_xy_fd_ncf(hash_flag=False): 402 | user_feature_columns = {"user": 3, "gender": 2, } 403 | item_feature_columns = {"item": 4} 404 | 405 | uid = np.array([0, 1, 2, 1]) 406 | ugender = np.array([0, 1, 0, 1]) 407 | iid = np.array([1, 2, 3, 1]) # 0 is mask value 408 | 409 | hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0], [3, 0, 0, 0]]) 410 | hist_len = np.array([3, 3, 2, 1]) 411 | 412 | feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 413 | 'hist_item': hist_iid, "hist_len": hist_len} 414 | 415 | # feature_names = get_feature_names(feature_columns) 416 | x = feature_dict 417 | y = np.array([1, 1, 1, 1]) 418 | return x, y, user_feature_columns, item_feature_columns 419 | 420 | 421 | def get_xy_fd_sdm(hash_flag=False): 422 | user_feature_columns = [SparseFeat('user', 3), 423 | SparseFeat('gender', 2), 424 | VarLenSparseFeat(SparseFeat('prefer_item', vocabulary_size=100, embedding_dim=8, 425 | embedding_name='item'), maxlen=6, 426 | length_name="prefer_sess_length"), 427 | VarLenSparseFeat(SparseFeat('prefer_cate', vocabulary_size=100, embedding_dim=8, 428 | embedding_name='cate'), maxlen=6, 429 | length_name="prefer_sess_length"), 430 | VarLenSparseFeat(SparseFeat('short_item', vocabulary_size=100, embedding_dim=8, 431 | embedding_name='item'), maxlen=4, 432 | length_name="short_sess_length"), 433 | VarLenSparseFeat(SparseFeat('short_cate', vocabulary_size=100, embedding_dim=8, 434 | embedding_name='cate'), maxlen=4, 435 | length_name="short_sess_length"), 436 | ] 437 | item_feature_columns = [SparseFeat('item', 100, embedding_dim=8, )] 438 | 439 | uid = np.array([0, 1, 2, 1]) 440 | ugender = np.array([0, 1, 0, 1]) 441 | iid = np.array([1, 2, 3, 1]) # 0 is mask value 442 | 443 | prefer_iid = np.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 0], [1, 2, 3, 3, 0, 0], [1, 2, 4, 0, 0, 0]]) 444 | prefer_cate = np.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 0], [1, 2, 3, 3, 0, 0], [1, 2, 4, 0, 0, 0]]) 445 | short_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0], [3, 0, 0, 0]]) 446 | short_cate = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0], [3, 0, 0, 0]]) 447 | prefer_len = np.array([6, 5, 4, 3]) 448 | short_len = np.array([3, 3, 2, 1]) 449 | 450 | feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'prefer_item': prefer_iid, "prefer_cate": prefer_cate, 451 | 'short_item': short_iid, 'short_cate': short_cate, 'prefer_sess_length': prefer_len, 452 | 'short_sess_length': short_len} 453 | 454 | # feature_names = get_feature_names(feature_columns) 455 | x = feature_dict 456 | y = np.array([1, 1, 1, 0]) 457 | history_feature_list = ['item', 'cate'] 458 | 459 | return x, y, user_feature_columns, item_feature_columns, history_feature_list 460 | --------------------------------------------------------------------------------