├── models ├── __init__.py ├── taster_exceptions.py └── data_models.py ├── utils ├── __init__.py ├── config_utils.py ├── db_utils.py ├── cloudmusic_dao.py ├── encrypt_utils.py ├── logger_utils.py └── cloudmusic_api.py ├── api_server ├── __init__.py ├── flask_app.py ├── static │ └── js │ │ └── music_taster.js └── templates │ └── demo.html ├── pipelines ├── __init__.py ├── fetch_data.py ├── update_infos.py └── fetch_user_data.py ├── .gitignore ├── requirements.txt ├── song2vec ├── __init__.py ├── song2vec_operator.py └── rock_gensim.py ├── LICENSE └── readme.md /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /api_server/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.json 3 | log/ 4 | *.log 5 | .idea/ 6 | datas/ -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | pyopenssl 3 | pycrypto 4 | gensim==0.12.4 5 | sklearn -------------------------------------------------------------------------------- /song2vec/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | """ 4 | Created by jayvee on 17/2/16. 
5 | """ -------------------------------------------------------------------------------- /pipelines/fetch_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import sys 4 | 5 | abs_path = os.path.dirname(os.path.abspath(__file__)) 6 | abs_father_path = os.path.dirname(abs_path) 7 | PROJECT_PATH = abs_father_path 8 | print 'Used file: %s\n project path=%s' % (__file__, PROJECT_PATH) 9 | sys.path.append(PROJECT_PATH) 10 | -------------------------------------------------------------------------------- /models/taster_exceptions.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | """ 4 | Created by jayvee on 16/12/22. 5 | """ 6 | 7 | 8 | class NonDataException(IOError): 9 | """ 10 | 无法获取到数据时的异常 11 | """ 12 | 13 | def __init__(self, msg): 14 | self.message = msg 15 | 16 | def __str__(self): 17 | return self.message 18 | -------------------------------------------------------------------------------- /utils/config_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import sys 5 | 6 | abs_path = os.path.dirname(os.path.abspath(__file__)) 7 | abs_father_path = os.path.dirname(abs_path) 8 | PROJECT_PATH = abs_father_path 9 | print 'Used file: %s\n project path=%s' % (__file__, PROJECT_PATH) 10 | sys.path.append(PROJECT_PATH) 11 | 12 | 13 | # config_info = {} 14 | 15 | 16 | def get_config(): 17 | with open('%s/config.json' % PROJECT_PATH, 'r') as fin: 18 | config_info = json.loads(fin.read()) 19 | return config_info 20 | 21 | 22 | def get_db_config(): 23 | with open('%s/db_config.json' % PROJECT_PATH, 'r') as fin: 24 | db_config_info = json.loads(fin.read()) 25 | return db_config_info 26 | -------------------------------------------------------------------------------- /utils/db_utils.py: -------------------------------------------------------------------------------- 1 | import 
json 2 | import pymongo 3 | 4 | from utils.config_utils import get_db_config 5 | 6 | __author__ = 'jayvee' 7 | 8 | db_config = get_db_config() 9 | DB_IP = db_config['db_ip'] 10 | DB_PORT = db_config['db_port'] 11 | 12 | 13 | def get_db_inst(db_name, collection_name): 14 | """ 15 | get mongoDB instance by db name and collection name 16 | Args: 17 | db_name: 18 | collection_name: 19 | 20 | Returns: 21 | db instance 22 | 23 | """ 24 | client = pymongo.MongoClient(DB_IP, DB_PORT) 25 | try: 26 | db_inst = client.get_database(db_name).get_collection(collection_name) 27 | return db_inst 28 | except Exception, e: 29 | print 'error, details=%s' % (e) 30 | 31 | 32 | def create_index(db_name, collection_name, index_conf): 33 | db_inst = get_db_inst(db_name, collection_name) 34 | print db_inst.create_indexes(index_conf) 35 | 36 | 37 | def find_all(find_filter, db_inst, sort_filter=None): 38 | MAX_COUNT = db_inst.find(find_filter).count() 39 | 40 | if not sort_filter: 41 | db_inst.find(find_filter) 42 | 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Jayvee He 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Music Taster 2 | 顾名思义,这个项目用来挖掘音乐风格 3 | 大部分是基于歌单进行的关系挖掘,暂不涉及音频分析 4 | 5 | ### 动机 6 | 网易云音乐的红心歌单曲目太多了,想做点归类。 7 | 8 | ### 包含功能 9 | 1. 实现了歌单、歌曲详情的爬取与存储 10 | 2. 实现了Song2Vec、Artist2Vec 11 | 3. 实现歌曲、歌手的风格近似计算 12 | 4. 实现歌单下歌曲、歌手的聚类 13 | 5. 附带一个基于flask的API webserver 14 | 15 | ## 目录结构 16 | 1. models——用于存储数据类型对象的相关类文件 17 | 2. utils——基本的工具类 18 | 3. pipelines——存放各工作流程的脚本 19 | 4. datas——存放训练后的模型数据文件 20 | 5. api_server——基于flask的api server 21 | 22 | ## 环境需求 23 | 1. 如果进行数据爬取,则需要一个MongoDB实例进行数据管理 24 | 2. 安装`requirements.txt`下的依赖包 25 | 26 | ## Demo 27 | 28 | [http://api.jayveehe.com/musictaster](http://api.jayveehe.com/musictaster) 29 | 30 | ## Data 31 | Google Drive 32 | - [Artists Seq Data](https://drive.google.com/file/d/1fO4BkXBB9Rf5DsF7kggr6lROA4gmAB3Z/view?usp=sharing) 33 | - [Songs Seq Data](https://drive.google.com/file/d/1_kwmQ87kz3kHIRcAUdFaXY0_x2KCMyBw/view?usp=sharing) 34 | - [Artists x Songs Seq Data](https://drive.google.com/file/d/1IHetYu7Lrd_6jVurmq3_0oZ-OalEk5w2/view?usp=sharing) 35 | 36 | 使用方法: 37 | 1. 下载对应的dat文件 38 | 2. 
cPickle.load() 39 | 40 | ## Demo API Doc 41 | [https://github.com/JayveeHe/MusicTaster/wiki/Music-Taster-Demo-API-Doc](https://github.com/JayveeHe/MusicTaster/wiki/Music-Taster-Demo-API-Doc) 42 | 43 | ## 使用MIT License 44 | [MIT License](https://github.com/JayveeHe/MusicTaster/blob/master/LICENSE) 45 | 46 | -------------------------------------------------------------------------------- /utils/cloudmusic_dao.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | """ 4 | 负责进行云音乐的数据库相关操作 5 | 6 | Created by jayvee on 16/12/24. 7 | """ 8 | from utils.db_utils import get_db_inst 9 | from utils.logger_utils import data_process_logger 10 | 11 | 12 | class CloudMusicDAO: 13 | def __init__(self, db_name, collection_name): 14 | self.db_name = db_name 15 | self.collection_name = collection_name 16 | self.db_inst = get_db_inst(self.db_name, self.collection_name) 17 | 18 | def save_unique_item(self, data_obj, primary_key='userId', is_overwrite=False, is_inform=False): 19 | """ 20 | 存储数据对象,并避免重复存储 21 | Args: 22 | data_obj: 23 | primary_key: 24 | is_overwrite: 25 | 26 | Returns: 27 | 28 | """ 29 | find_result = self.db_inst.find_one({primary_key: data_obj[primary_key]}, {primary_key: 1}) 30 | # is_exist = user_dbinst.find({'userId': userinfo['userId']}).count() != 0 31 | # print find_result.count() 32 | 33 | if not find_result: 34 | self.db_inst.insert(data_obj) 35 | elif is_overwrite: 36 | self.db_inst.update({primary_key: data_obj[primary_key]}, data_obj) 37 | if is_inform: 38 | data_process_logger.warn( 39 | 'overwrite item %s in %s' % (data_obj[primary_key], self.collection_name)) 40 | else: 41 | if is_inform: 42 | data_process_logger.warn( 43 | 'Item %s exist! 
in %s' % (data_obj[primary_key], self.collection_name)) 44 | -------------------------------------------------------------------------------- /utils/encrypt_utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | import os 4 | 5 | from Crypto.Cipher import AES 6 | 7 | modulus = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7' 8 | nonce = '0CoJUm6Qyw8W8jud' 9 | pubKey = '010001' 10 | 11 | 12 | def createSecretKey(size): 13 | return (''.join(map(lambda xx: (hex(ord(xx))[2:]), os.urandom(size))))[0:16] 14 | 15 | 16 | def aesEncrypt(text, secKey): 17 | pad = 16 - len(text) % 16 18 | text = text + pad * chr(pad) 19 | encryptor = AES.new(secKey, 2, '0102030405060708') 20 | ciphertext = encryptor.encrypt(text) 21 | ciphertext = base64.b64encode(ciphertext) 22 | return ciphertext 23 | 24 | 25 | def rsaEncrypt(text, pubKey, modulus): 26 | text = text[::-1] 27 | rs = int(text.encode('hex'), 16) ** int(pubKey, 16) % int(modulus, 16) 28 | return format(rs, 'x').zfill(256) 29 | 30 | 31 | def encrypted_request(text): 32 | text = json.dumps(text) 33 | secKey = createSecretKey(16) 34 | encText = aesEncrypt(aesEncrypt(text, nonce), secKey) 35 | encSecKey = rsaEncrypt(secKey, pubKey, modulus) 36 | data = { 37 | 'params': encText, 38 | 'encSecKey': encSecKey 39 | } 40 | return data 41 | 42 | 43 | def test(): 44 | print encrypted_request( 45 | 'htPNp6MUYqXd/c2YsIovhhmQnn/5Y62aIPK6CTyRDLAA8okWvdwz6UC58AC2pe+tk6A9B9DgEG9H6m9Yt7mzRQyB3nWjvdvrXSiUxxY0BzMzilEP+2RO7LToRLfPpLRY9Y7+/YuO/9iIoPFzArhn8pnAS+r5UbQ7wZTWQ6iUd1cfu+A557dS5w2GmybQuXOYq8BFva9j3vj/4Cy4k3s4fmnj4z2XrZmFZn9Ngdy2ppY=') 46 | 47 | 48 | if __name__ == '__main__': 49 | test() 50 | 
-------------------------------------------------------------------------------- /pipelines/update_infos.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | """ 4 | Created by jayvee on 17/2/14. 5 | """ 6 | import random 7 | 8 | import time 9 | 10 | from utils.cloudmusic_api import song_comments 11 | from utils.cloudmusic_dao import CloudMusicDAO 12 | from utils.logger_utils import data_process_logger 13 | 14 | 15 | def update_userinfo(): 16 | """ 17 | 临时更新数据库的脚本 18 | Returns: 19 | 20 | """ 21 | DAO_inst = CloudMusicDAO('MusicTaster', 'UserInfos') 22 | uids = DAO_inst.db_inst.distinct('userId') 23 | count = 0 24 | for uid in uids: 25 | userinfo = DAO_inst.db_inst.find_one({'userId': uid}) 26 | userinfo['follow_count'] = len(userinfo['follow_ids']) 27 | userinfo['fan_count'] = len(userinfo['fan_ids']) 28 | DAO_inst.save_unique_item(userinfo, primary_key='userId', is_overwrite=True) 29 | data_process_logger.info('No.%s %s-%s' % (count, userinfo['userId'], userinfo['nickname'])) 30 | count += 1 31 | print 'done' 32 | 33 | 34 | def fill_song_comments(): 35 | """ 36 | 填充歌曲的评论详情 37 | Returns: 38 | 39 | """ 40 | dao_inst = CloudMusicDAO('MusicTaster', 'SongInfos') 41 | find_result = dao_inst.db_inst.find({'commentInfo': {'$exists': False}}) 42 | count = 0 43 | for song_item in find_result: 44 | comm_data = song_comments(song_item['commentThreadId'], limit=10) 45 | if comm_data: # 确保评论详情读取正确 46 | del comm_data['code'] 47 | # del comm_data['userId'] 48 | song_item['commentInfo'] = comm_data 49 | song_item['commentCount'] = comm_data['total'] 50 | dao_inst.db_inst.save(song_item) 51 | data_process_logger.info( 52 | 'No.%s %s, comments: %s done' % (count, song_item['name'], song_item['commentCount'])) 53 | count += 1 54 | slp = random.random() * 2 + 1 55 | data_process_logger.info('sleep %s sec' % slp) 56 | time.sleep(slp) 57 | 58 | 59 | if __name__ == '__main__': 60 | while 1: 61 | try: 62 | fill_song_comments() 63 
| except Exception, e: 64 | print 'error %s' % e 65 | continue 66 | -------------------------------------------------------------------------------- /models/data_models.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from utils.cloudmusic_api import user_playlist, user_profile, song_detail 3 | from utils.logger_utils import data_process_logger 4 | 5 | """ 6 | Created by jayvee on 16/12/22. 7 | """ 8 | 9 | 10 | class InfoObj: 11 | """ 12 | 基础的信息类 13 | """ 14 | 15 | def __init__(self): 16 | pass 17 | 18 | def fill_details(self): 19 | """ 20 | 填充信息类的信息 21 | :return: 22 | """ 23 | pass 24 | 25 | 26 | class User(InfoObj): 27 | """ 28 | 用户类 29 | 30 | """ 31 | 32 | def __init__(self, uid): 33 | InfoObj.__init__(self) 34 | self.uid = uid 35 | self.playlist = [] 36 | self.details = {} 37 | self.__has_details = False 38 | # get user info 39 | 40 | def __fill_user_playlist(self): 41 | """ 42 | 填充用户歌单信息 43 | :return: None 44 | """ 45 | # get user playlist 46 | pl = user_playlist(self.uid) 47 | if pl != -1: 48 | self.playlist = pl 49 | else: 50 | data_process_logger.error('cannot get the playlist of user %s' % self.uid) 51 | 52 | def __fill_user_details(self): 53 | """ 54 | 填充用户信息 55 | :return: None 56 | """ 57 | u_details = user_profile(self.uid) 58 | if u_details != -1: 59 | self.details = u_details 60 | self.__has_details = True 61 | else: 62 | data_process_logger.error('cannot get the details of user %s' % self.uid) 63 | 64 | def fill_details(self): 65 | self.__fill_user_details() 66 | self.__fill_user_playlist() 67 | 68 | def __str__(self): 69 | return str(self.details) 70 | 71 | 72 | 73 | class Song(InfoObj): 74 | """ 75 | 歌曲类 76 | """ 77 | 78 | def __init__(self, sid): 79 | InfoObj.__init__(self) 80 | self.sid = sid 81 | self.details = {} 82 | self.__has_details = False 83 | 84 | def __fill_song_details(self): 85 | sd = song_detail(self.sid) 86 | if sd != -1: 87 | self.details = sd 88 | self.__has_details = True 
89 | else: 90 | data_process_logger.error('cannot get the details of song %s' % self.sid) 91 | 92 | def fill_details(self): 93 | self.__fill_song_details() 94 | 95 | def __str__(self): 96 | return str(self.details) 97 | -------------------------------------------------------------------------------- /utils/logger_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import logging 3 | from logging.handlers import RotatingFileHandler 4 | import os 5 | 6 | # 用以控制是否输出到屏幕,线上环境不输出到屏幕 7 | DebugConf = True 8 | # DebugConf = False 9 | 10 | abs_path = os.path.dirname(os.path.abspath(__file__)) 11 | abs_father_path = os.path.dirname(abs_path) 12 | log_dir_path = abs_father_path + '/log' 13 | if not os.path.exists(log_dir_path): 14 | os.makedirs(log_dir_path) 15 | 16 | data_analysis_logger = logging.getLogger('data_analysis') 17 | data_process_logger = logging.getLogger('data_process') 18 | model_logger = logging.getLogger('model') 19 | 20 | formatter = logging.Formatter( 21 | '[%(asctime)s][pid:%(process)s-tid:%(thread)s] %(module)s.%(funcName)s: %(levelname)s: %(message)s') 22 | 23 | # StreamHandler for print log to console 24 | hdr = logging.StreamHandler() 25 | hdr.setFormatter(formatter) 26 | hdr.setLevel(logging.DEBUG) 27 | 28 | # RotatingFileHandler 29 | fhr_ana = RotatingFileHandler('%s/analysis.log' % (log_dir_path), maxBytes=10 * 1024 * 1024, backupCount=3) 30 | fhr_ana.setFormatter(formatter) 31 | fhr_ana.setLevel(logging.DEBUG) 32 | 33 | # RotatingFileHandler 34 | fhr_pro = RotatingFileHandler('%s/process.log' % (log_dir_path), maxBytes=10 * 1024 * 1024, backupCount=3) 35 | fhr_pro.setFormatter(formatter) 36 | fhr_pro.setLevel(logging.DEBUG) 37 | 38 | # RotatingFileHandler 39 | fhr_model = RotatingFileHandler('%s/model.log' % (log_dir_path), maxBytes=10 * 1024 * 1024, backupCount=3) 40 | fhr_model.setFormatter(formatter) 41 | fhr_model.setLevel(logging.DEBUG) 42 | 43 | 
data_analysis_logger.addHandler(fhr_ana) 44 | if DebugConf: 45 | data_analysis_logger.addHandler(hdr) 46 | data_analysis_logger.setLevel(logging.DEBUG) # lowest debug level for logger 47 | else: 48 | data_analysis_logger.setLevel(logging.ERROR) # lowest debug level for logger 49 | 50 | data_process_logger.addHandler(fhr_pro) 51 | if DebugConf: 52 | data_process_logger.addHandler(hdr) 53 | data_process_logger.setLevel(logging.DEBUG) 54 | else: 55 | data_process_logger.setLevel(logging.ERROR) 56 | 57 | model_logger.addHandler(fhr_model) 58 | if DebugConf: 59 | model_logger.addHandler(hdr) 60 | model_logger.setLevel(logging.DEBUG) 61 | else: 62 | model_logger.setLevel(logging.ERROR) 63 | 64 | if __name__ == '__main__': 65 | ''' 66 | Usage: 67 | from tools.log_tools import data_process_logger as logger 68 | logger.debug('debug debug') 69 | ''' 70 | data_analysis_logger.debug('My logger configure success') 71 | data_analysis_logger.info('My logger configure success') 72 | data_analysis_logger.error('analysis error test') 73 | 74 | data_process_logger.info('My logger configure success~~') 75 | data_process_logger.error('process error test test') 76 | 77 | model_logger.info('Ohhh model') 78 | model_logger.error('error model') 79 | -------------------------------------------------------------------------------- /api_server/flask_app.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | """ 4 | Created by jayvee on 17/2/19. 
5 | https://github.com/JayveeHe 6 | """ 7 | import json 8 | 9 | import re 10 | from flask import Flask, render_template, request, make_response 11 | 12 | import os 13 | import sys 14 | 15 | abs_path = os.path.dirname(os.path.abspath(__file__)) 16 | abs_father_path = os.path.dirname(abs_path) 17 | PROJECT_PATH = abs_father_path 18 | print 'Used file: %s\nProject path=%s' % (__file__, PROJECT_PATH) 19 | sys.path.append(PROJECT_PATH) 20 | # add flask path 21 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 22 | 23 | from song2vec.song2vec_operator import Song2VecOperator 24 | from utils.logger_utils import data_process_logger 25 | 26 | app = Flask(__name__) 27 | 28 | data_process_logger.info('initing song2vec operator') 29 | s2v_operator = Song2VecOperator( 30 | song2vec_model_path='%s/datas/[full]50d_20iter_10win_5min_song2vec.model' % PROJECT_PATH, 31 | artist2vec_model_path='%s/datas/[full]50d_20iter_10win_5min_artist2vec.model' % PROJECT_PATH) 32 | data_process_logger.info('complete init song2vec') 33 | 34 | 35 | @app.route('/musictaster') 36 | def hello_world(): 37 | return render_template("demo.html") 38 | 39 | 40 | @app.route('/musictaster/similar/song', methods=['POST']) 41 | @app.route('/musictaster/similar/song/', methods=['GET']) 42 | def query_similar_songs(song_name=None): 43 | """ 44 | 查询最近似的歌曲,方法可以为GET或POST 45 | Args: 46 | song_name: 47 | 48 | Returns: 49 | 50 | """ 51 | try: 52 | if request.method == 'GET': 53 | top_n = int(request.args.get('top_n')) if request.args.get('top_n') else 10 54 | sim_res = s2v_operator.song2vec_model.most_similar(song_name.lower(), topn=top_n) 55 | elif request.method == 'POST': 56 | req_data_obj = json.loads(request.data) 57 | # 获取各组加减信息,并取小写字母(英文) 58 | positive_songs = lower_array(req_data_obj.get('positive_songs')) if req_data_obj.get( 59 | 'positive_songs') else [] 60 | negative_songs = lower_array(req_data_obj.get('negative_songs')) if req_data_obj.get( 61 | 'negative_songs') else [] 62 | positive_artists 
= lower_array(req_data_obj.get('positive_artists')) if req_data_obj.get( 63 | 'positive_artists') else [] 64 | negative_artists = lower_array(req_data_obj.get('negative_artists')) if req_data_obj.get( 65 | 'negative_artists') else [] 66 | top_n = int(req_data_obj.get('top_n')) if req_data_obj.get('top_n') else 10 67 | sim_res = s2v_operator.calc_song_similar(positive_songs=positive_songs, 68 | negative_songs=negative_songs, 69 | positive_artists=positive_artists, 70 | negative_artists=negative_artists, 71 | topn=top_n) 72 | else: 73 | sim_res = [] 74 | # parse similar result 75 | parsed_sim_res = [{'name': a[0], 'similarity': a[1]} for a in sim_res] 76 | result = {'code': 200, 'result': parsed_sim_res} 77 | resp = make_response(json.dumps(result, ensure_ascii=False), 200) 78 | except Exception, e: 79 | res = {'code': 400, 'error_msg': e.message} 80 | resp = make_response(json.dumps(res, ensure_ascii=False), 200) 81 | resp.mimetype = 'application/json' 82 | return resp 83 | 84 | 85 | @app.route('/musictaster/similar/artist', methods=['POST']) 86 | @app.route('/musictaster/similar/artist/', methods=['GET']) 87 | def query_similar_artist(artist_name=None): 88 | try: 89 | if request.method == 'GET': 90 | top_n = int(request.args.get('top_n')) if request.args.get('top_n') else 10 91 | sim_res = s2v_operator.artist2vec_model.most_similar(artist_name.lower(), topn=top_n) 92 | elif request.method == 'POST': 93 | req_data_obj = json.loads(request.data) 94 | # 获取各组加减信息,并取小写字母(英文) 95 | positive_songs = lower_array(req_data_obj.get('positive_songs')) if req_data_obj.get( 96 | 'positive_songs') else [] 97 | negative_songs = lower_array(req_data_obj.get('negative_songs')) if req_data_obj.get( 98 | 'negative_songs') else [] 99 | positive_artists = lower_array(req_data_obj.get('positive_artists')) if req_data_obj.get( 100 | 'positive_artists') else [] 101 | negative_artists = lower_array(req_data_obj.get('negative_artists')) if req_data_obj.get( 102 | 'negative_artists') else [] 
103 | top_n = req_data_obj.get('top_n') if req_data_obj.get('top_n') else 10 104 | sim_res = s2v_operator.calc_artist_similar(positive_songs=positive_songs, 105 | negative_songs=negative_songs, 106 | positive_artists=positive_artists, 107 | negative_artists=negative_artists, 108 | topn=top_n) 109 | else: 110 | sim_res = [] 111 | # parse similar result 112 | parsed_sim_res = [{'name': a[0], 'similarity': a[1]} for a in sim_res] 113 | result = {'code': 200, 'result': parsed_sim_res} 114 | resp = make_response(json.dumps(result, ensure_ascii=False), 200) 115 | except Exception, e: 116 | res = {'code': 400, 'error_msg': e.message} 117 | resp = make_response(json.dumps(res, ensure_ascii=False), 200) 118 | resp.mimetype = 'application/json' 119 | return resp 120 | 121 | 122 | @app.route('/musictaster/cluster/playlist/id/', methods=['GET']) 123 | def cluster_playlist_by_plid(plid=None): 124 | try: 125 | if request.args.get('cluster_n'): 126 | cluster_n = eval(request.args.get('cluster_n')) 127 | else: 128 | cluster_n = 5 129 | if request.args.get('type'): 130 | cluster_type = request.args.get('type') 131 | else: 132 | cluster_type = 'song' 133 | if cluster_type == 'artist': 134 | cluster_res, playlist_name = s2v_operator.cluster_artist_in_playlist(plid, cluster_n=cluster_n) 135 | else: 136 | cluster_res, playlist_name = s2v_operator.cluster_song_in_playlist(plid, cluster_n=cluster_n) 137 | result = {'code': 200, 'result': cluster_res, 'playlist_name': playlist_name, 'type': cluster_type} 138 | resp = make_response(json.dumps(result, ensure_ascii=False), 200) 139 | resp.mimetype = 'application/json' 140 | except Exception, e: 141 | res = {'code': 400, 'error_msg': e.message} 142 | resp = make_response(json.dumps(res, ensure_ascii=False), 200) 143 | resp.mimetype = 'application/json' 144 | return resp 145 | 146 | 147 | @app.route('/musictaster/cluster/playlist/url', methods=['POST']) 148 | def cluster_playlist_by_url(): 149 | try: 150 | if len(request.data): 151 | req_obj = 
json.loads(request.data) 152 | else: 153 | req_obj = request.form 154 | url = req_obj['url'] 155 | cluster_type = req_obj['type'] 156 | is_detailed = req_obj.get('is_detailed') if req_obj.get('is_detailed') else False 157 | plid = re.findall('\d{4,}', url)[0] 158 | if request.args.get('cluster_n'): 159 | cluster_n = eval(request.args.get('cluster_n')) 160 | else: 161 | cluster_n = 5 162 | 163 | if cluster_type == 'artist': 164 | cluster_res, playlist_name, detail_infos = s2v_operator.cluster_artist_in_playlist(plid, 165 | cluster_n=cluster_n, 166 | is_detailed=is_detailed) 167 | else: 168 | cluster_res, playlist_name, detail_infos = s2v_operator.cluster_song_in_playlist(plid, cluster_n=cluster_n, 169 | is_detailed=is_detailed) 170 | if is_detailed: 171 | result = {'code': 200, 'result': cluster_res, 'playlist_name': playlist_name, 'type': cluster_type, 172 | 'detail_infos': detail_infos} 173 | else: 174 | result = {'code': 200, 'result': cluster_res, 'playlist_name': playlist_name, 'type': cluster_type} 175 | resp = make_response(json.dumps(result, ensure_ascii=False), 200) 176 | resp.mimetype = 'application/json' 177 | except Exception, e: 178 | res = {'code': 400, 'error_msg': e.message} 179 | resp = make_response(json.dumps(res, ensure_ascii=False), 200) 180 | resp.mimetype = 'application/json' 181 | return resp 182 | 183 | 184 | def lower_array(arr): 185 | return [a.lower() for a in arr] 186 | 187 | 188 | if __name__ == '__main__': 189 | app.run(host='0.0.0.0', port=2335, debug=False) 190 | -------------------------------------------------------------------------------- /api_server/static/js/music_taster.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by jayvee on 17/3/15. 
3 | * https://github.com/JayveeHe 4 | */ 5 | 6 | 7 | // 歌单聚类 8 | $("#btn_send").click(function () { 9 | var pl_url = $("#input_url").val(); 10 | var send_datas = { 11 | url: pl_url, 12 | type: "song" 13 | }; 14 | $.ajax('/cluster/playlist/url', { 15 | 'data': JSON.stringify(send_datas), //{action:'x',params:['a','b','c']} 16 | 'type': 'POST', 17 | 'processData': false, 18 | 'contentType': 'application/json' //typically 'application/x-www-form-urlencoded', but the service you are calling may expect 'text/json'... check with the service to see what they expect as content-type in the HTTP header. 19 | }).done(function (data, status) { 20 | $("#show_cluster").modal('toggle'); 21 | if (status == "success") { 22 | // parse cluster result 23 | //解析聚类结果 24 | var resp_obj = eval(data); 25 | if (resp_obj['code'] == 200) { 26 | var title = $("#playlist_name_title"); 27 | console.log(resp_obj['playlist_name']); 28 | title.val(resp_obj['playlist_name'] + "\t" + title.val()); 29 | /// -------- d3.js ------- 30 | //准备复杂网络数据 31 | var g = {"nodes": [], "links": []}; 32 | var cluster_result = resp_obj["result"]; 33 | g.nodes.push({'id': 'root', 'group': -1, 'label': 'root'}); 34 | for (var i = 0; i < cluster_result.length; i++) { 35 | console.log(cluster_result[i]); 36 | var item = cluster_result[i][0]; 37 | /**var c_color = '#' + (Math.floor(Math.random() * 16777215).toString(16) + '000000').substr(0, 6); 38 | var c_x = (Math.random() - 0.5) * 50; 39 | var c_y = (Math.random() - 0.5) * 50;**/ 40 | g.nodes.push({ 41 | "id": item, 42 | "group": i, 43 | 'label': item 44 | }); 45 | g.links.push({ 46 | 'source': 'root', 47 | 'target': item, 48 | 'value': 1 49 | }); 50 | var c_root_id = item; 51 | var last_item = c_root_id; 52 | for (var j = 1; j < cluster_result[i].length; j++) { 53 | //console.log(i + "-" + j); 54 | var n_item = cluster_result[i][j]; 55 | g.nodes.push({ 56 | "id": n_item, 57 | "group": i, 58 | 'label': n_item 59 | }); 60 | g.links.push({ 61 | 'value': 1, 62 | 'source': 
c_root_id, 63 | 'target': n_item 64 | }); 65 | } 66 | } 67 | 68 | // d3 初始化 69 | var svg = d3.select("svg"); 70 | 71 | svg.selectAll('*').remove(); 72 | var c_canvas = $('#cluster_canvas'); 73 | var svg_width = 1000 * 0.9; 74 | var svg_height = 500 * 0.9; 75 | //console.log(c_canvas.parentElement().width + '-' + c_canvas.parentElement().height); 76 | console.log('svg_width:' + svg_width + '\theight:' + svg_height); 77 | 78 | var color = d3.scaleOrdinal(d3.schemeCategory20); 79 | 80 | var simulation = d3.forceSimulation() 81 | .force("link", d3.forceLink().id(function (d) { 82 | return d.id; 83 | })) 84 | .force("charge", d3.forceManyBody()) 85 | .force("center", d3.forceCenter(svg_width / 2, svg_height / 2)); 86 | 87 | 88 | var link = svg.append("g") 89 | .attr("class", "links") 90 | .selectAll("line") 91 | .data(g.links) 92 | .enter().append("line") 93 | .attr("stroke-width", function (d) { 94 | return Math.sqrt(d.value); 95 | }); 96 | 97 | var node = svg.append("g") 98 | .attr("class", "nodes") 99 | .selectAll("circle") 100 | .data(g.nodes) 101 | .enter() 102 | .append("circle") 103 | .attr("r", 5) 104 | .attr("fill", function (d) { 105 | return color(d.group); 106 | }) 107 | .call(d3.drag() 108 | .on("start", dragstarted) 109 | .on("drag", dragged) 110 | .on("end", dragended)); 111 | 112 | var anchorNode = svg.append('g').attr('class', 'labels').selectAll("g.labels").data(g.nodes) 113 | .enter().append("svg:text").text(function (d) { 114 | return d.label; 115 | }).style("fill", "#555").style("font-family", "Arial").style("font-size", 6) 116 | .call(d3.drag() 117 | .on("start", dragstarted) 118 | .on("drag", dragged) 119 | .on("end", dragended)); 120 | //anchorNode.append("svg:circle").attr("r", 0).style("fill", "#FFF"); 121 | 122 | 123 | simulation 124 | .nodes(g.nodes) 125 | .on("tick", ticked); 126 | 127 | simulation.force("link") 128 | .links(g.links); 129 | var zoom = d3.zoom() 130 | .on("zoom", zoomed); 131 | 132 | svg 133 | .on("wheel", wheeled) 134 | 
.call(zoom) 135 | .call(zoom.transform, d3.zoomIdentity 136 | .translate(svg_width / 2, svg_height / 2) 137 | .scale(0.5) 138 | .translate(-svg_width / 2, -svg_height / 2)); 139 | svg.call(zoom); 140 | 141 | function wheeled() { 142 | console.log(d3.event); 143 | } 144 | 145 | function zoomed() { 146 | node.attr("transform", d3.event.transform); 147 | link.attr("transform", d3.event.transform); 148 | anchorNode.attr("transform", d3.event.transform); 149 | } 150 | 151 | function ticked() { 152 | link 153 | .attr("x1", function (d) { 154 | return d.source.x; 155 | }) 156 | .attr("y1", function (d) { 157 | return d.source.y; 158 | }) 159 | .attr("x2", function (d) { 160 | return d.target.x; 161 | }) 162 | .attr("y2", function (d) { 163 | return d.target.y; 164 | }); 165 | 166 | node 167 | .attr("cx", function (d) { 168 | return d.x; 169 | }) 170 | .attr("cy", function (d) { 171 | return d.y; 172 | }); 173 | anchorNode 174 | .attr("x", function (d) { 175 | return d.x; 176 | }) 177 | .attr("y", function (d) { 178 | return d.y; 179 | }); 180 | } 181 | 182 | function dragstarted(d) { 183 | if (!d3.event.active) simulation.alphaTarget(0.3).restart(); 184 | d.fx = d.x; 185 | d.fy = d.y; 186 | } 187 | 188 | function dragged(d) { 189 | d.fx = d3.event.x; 190 | d.fy = d3.event.y; 191 | } 192 | 193 | function dragended(d) { 194 | if (!d3.event.active) simulation.alphaTarget(0); 195 | d.fx = null; 196 | d.fy = null; 197 | } 198 | 199 | 200 | } 201 | else { 202 | alert("请求错误,详情=" + resp_obj.toString()); 203 | } 204 | } 205 | else { 206 | alert("请求失败"); 207 | } 208 | } 209 | ).fail(function () { 210 | alert("请求失败"); 211 | }); 212 | }); 213 | 214 | 215 | 216 | 217 | -------------------------------------------------------------------------------- /pipelines/fetch_user_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | Created by jayvee on 16/12/22. 
4 | """ 5 | import time 6 | 7 | from utils.cloudmusic_api import * 8 | from utils.cloudmusic_dao import CloudMusicDAO 9 | from utils.db_utils import get_db_inst 10 | 11 | 12 | def test(): 13 | # u = User('2886507') 14 | # u.fill_details() 15 | # print u 16 | uname = '' # 填入用户名,手机登录 17 | pwd = '' 18 | fetch_login_userdata(username=uname, password=pwd) 19 | 20 | 21 | def fetch_login_userdata(username, password): 22 | """ 23 | 以当前登录用户为起点,获取各类信息 24 | :return: 25 | """ 26 | user_info = user_login(username=username, password=password) 27 | if user_info != -1 and user_info['code'] == 200: 28 | # user_profile = user_info['profile'] 29 | # uid = user_profile['userId'] 30 | # upl = user_playlist(uid) 31 | # print len(upl) 32 | return user_info 33 | else: 34 | data_process_logger.warn('fetch login userdata failed') 35 | 36 | 37 | def fetch_user_networks(start_id=None, max_user_count=5000): 38 | """ 39 | 启动用户信息爬取的函数 40 | Args: 41 | max_user_count: 本次最大爬取的用户数 42 | start_id: 入口id,如果没有则在数据库中任取一个 43 | 44 | Returns: 45 | 46 | """ 47 | db_userinfo = get_db_inst('MusicTaster', 'UserInfos') 48 | DAO_inst = CloudMusicDAO('MusicTaster', 'UserInfos') 49 | # start_info = user_login('13717951224', 'hejiawei') 50 | # u_profile = user_profile(start_id) 51 | if not start_id: 52 | start_id = db_userinfo.find_one()['userId'] 53 | idlist = set() 54 | idlist.add(start_id) 55 | # save start user info 56 | cur_id = start_id 57 | followlist = user_follows(cur_id) 58 | for i in followlist: 59 | idlist.add(i['userId']) 60 | # result_count = find_result.count() 61 | user_count = 0 62 | while len(idlist) > 0 and user_count < max_user_count and cur_id: 63 | if db_userinfo.find({'userId': cur_id}).count() != 0: 64 | # slp = random.random() * 1 + 0.5 65 | data_process_logger.info('[SKIP] No.%s User %s skip!' 
% (user_count, cur_id)) 66 | # data_process_logger.info('sleep %s sec' % slp) 67 | user_count += 1 68 | cur_id = idlist.pop() 69 | continue 70 | u_profile = user_profile(cur_id) 71 | # db_userinfo.insert(u_profile) 72 | followlist = user_follows(cur_id) 73 | fanlist = user_fans(cur_id) 74 | u_profile['follows'] = followlist 75 | u_profile['fans'] = fanlist 76 | followids = [] 77 | fanids = [] 78 | for userinfo in followlist: 79 | int_id = userinfo['userId'] 80 | followids.append(int_id) 81 | idlist.add(int_id) 82 | for userinfo in fanlist: 83 | int_id = userinfo['userId'] 84 | fanids.append(int_id) 85 | idlist.add(int_id) 86 | u_profile['follow_ids'] = followids 87 | u_profile['follow_count'] = len(followids) 88 | u_profile['fan_ids'] = fanids 89 | u_profile['fan_count'] = len(fanids) 90 | DAO_inst.save_unique_item(u_profile) 91 | data_process_logger.info('[OK] No.%s User %s, nickname = %s ok! %s users left' % ( 92 | user_count, cur_id, u_profile['nickname'], len(idlist))) 93 | slp = random.random() * 2 + 1 94 | data_process_logger.info('sleep %s sec' % slp) 95 | time.sleep(slp) 96 | cur_id = idlist.pop() 97 | user_count += 1 98 | # result_count = db_userinfo.find({'userId': cur_id}).count() 99 | print 'done' 100 | 101 | 102 | def fetch_playlist(max_user_count=100): 103 | """ 104 | 进行用户歌单的抓取,同时更新UserInfos、SongInfos和Plyalists三个数据库的信息 105 | Args: 106 | max_user_count: 最大抓取的用户数 107 | 108 | Returns: 109 | 无 110 | """ 111 | user_dao_inst = CloudMusicDAO('MusicTaster', 'UserInfos') 112 | playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists') 113 | song_dao_inst = CloudMusicDAO('MusicTaster', 'SongInfos') 114 | userid_list = user_dao_inst.db_inst.find({"playlists": {'$exists': False}}).distinct('userId') 115 | # random.shuffle(userid_list) 116 | count = 0 117 | for uid in userid_list[:max_user_count]: 118 | # count = 0 119 | userinfo = user_dao_inst.db_inst.find_one({"userId": uid}) 120 | # fetch playlist ids 121 | user_playlists = user_playlist(uid, limit=2000) 
122 | data_process_logger.info( 123 | 'processing the playlist of %s\nTotal playlist = %s' % (userinfo['nickname'], len(user_playlists))) 124 | if len(user_playlists): 125 | for i in range(len(user_playlists)): 126 | pl_info = user_playlists[i] 127 | data_process_logger.info( 128 | 'processing %s No.%s playlist: %s, total song: %s' % ( 129 | userinfo['nickname'], i, pl_info['name'], pl_info['trackCount'])) 130 | # fetch playlist details 131 | # 首先查看是否在数据库中有 132 | pl_obj = playlist_dao_inst.db_inst.find_one({'id': pl_info['id']}) 133 | if not pl_obj: 134 | try: 135 | pl_obj = playlist_detail(pl_info['id']) 136 | pl_song_ids = [] 137 | if pl_obj != -1: 138 | for song in pl_obj['tracks']: 139 | song_dao_inst.save_unique_item(song, primary_key='id') 140 | pl_song_ids.append(song['id']) 141 | # 在playlist中保存track信息,只保存编号 142 | user_playlists[i]['tracks_ids'] = pl_song_ids 143 | pl_obj['tracks_ids'] = pl_song_ids 144 | playlist_dao_inst.save_unique_item(pl_obj, primary_key='id', is_inform=True) 145 | slp = random.random() * 2 + 1 146 | # data_process_logger.info('sleep %s sec' % slp) 147 | time.sleep(slp) 148 | else: 149 | data_process_logger.error('cannot fetch %s %s' % (pl_info['id'], pl_info['name'])) 150 | except Exception, e: 151 | print e 152 | else: 153 | user_playlists[i]['tracks_ids'] = pl_obj['tracks_ids'] 154 | 155 | # 在userinfo中保存playlist信息 156 | userinfo['playlists'] = user_playlists 157 | user_dao_inst.save_unique_item(userinfo, primary_key='userId', is_overwrite=True, is_inform=True) 158 | data_process_logger.info('No.%s %s playlist handled!' 
% (count, userinfo['nickname'])) 159 | slp = random.random() * 2 + 1 160 | data_process_logger.info('sleep %s sec' % slp) 161 | time.sleep(slp) 162 | count += 1 163 | print 'done' 164 | 165 | 166 | def get_user_playlist(uid): 167 | user_dao_inst = CloudMusicDAO('MusicTaster', 'UserInfos') 168 | playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists') 169 | song_dao_inst = CloudMusicDAO('MusicTaster', 'SongInfos') 170 | # count = 0 171 | userinfo = user_dao_inst.db_inst.find_one({"userId": uid}) 172 | # fetch playlist ids 173 | user_playlists = user_playlist(uid, limit=2000) 174 | data_process_logger.info( 175 | 'processing the playlist of %s\nTotal playlist = %s' % (userinfo['nickname'], len(user_playlists))) 176 | if len(user_playlists): 177 | for i in range(len(user_playlists)): 178 | pl_info = user_playlists[i] 179 | data_process_logger.info( 180 | 'processing %s No.%s playlist: %s, total song: %s' % ( 181 | userinfo['nickname'], i, pl_info['name'], pl_info['trackCount'])) 182 | # fetch playlist details 183 | # 首先查看是否在数据库中有 184 | pl_obj = playlist_dao_inst.db_inst.find_one({'id': pl_info['id']}) 185 | if not pl_obj: 186 | try: 187 | pl_obj = playlist_detail(pl_info['id']) 188 | pl_song_ids = [] 189 | if pl_obj != -1: 190 | for song in pl_obj['tracks']: 191 | song_dao_inst.save_unique_item(song, primary_key='id') 192 | pl_song_ids.append(song['id']) 193 | # 在playlist中保存track信息,只保存编号 194 | user_playlists[i]['tracks_ids'] = pl_song_ids 195 | pl_obj['tracks_ids'] = pl_song_ids 196 | playlist_dao_inst.save_unique_item(pl_obj, primary_key='id', is_inform=True) 197 | slp = random.random() * 2 + 1 198 | # data_process_logger.info('sleep %s sec' % slp) 199 | time.sleep(slp) 200 | else: 201 | data_process_logger.error('cannot fetch %s %s' % (pl_info['id'], pl_info['name'])) 202 | except Exception, e: 203 | print e 204 | else: 205 | user_playlists[i]['tracks_ids'] = pl_obj['tracks_ids'] 206 | 207 | # 在userinfo中保存playlist信息 208 | userinfo['playlists'] = user_playlists 
209 | user_dao_inst.save_unique_item(userinfo, primary_key='userId', is_overwrite=True, is_inform=True) 210 | data_process_logger.info('%s playlist handled!' % (userinfo['nickname'])) 211 | 212 | 213 | if __name__ == '__main__': 214 | # login_user_info = fetch_login_userdata('', '') 215 | # start_id = login_user_info['profile']['userId'] 216 | tmp_id = 2886507 217 | # get_user_playlist(tmp_id) 218 | # fill_song_comments() 219 | # fetch_user_networks() 220 | fetch_playlist(max_user_count=1000) 221 | # update_userinfo() 222 | # test() 223 | -------------------------------------------------------------------------------- /utils/cloudmusic_api.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import hashlib 3 | import json 4 | import random 5 | from urllib import urlencode 6 | 7 | import requests 8 | 9 | from utils.encrypt_utils import encrypted_request 10 | from utils.logger_utils import data_process_logger 11 | 12 | """ 13 | Created by jayvee on 16/12/14. 
Mainly used to call the NetEase CloudMusic API without any post-processing;
all returned data is the raw payload from the remote end.
"""

# config_infos = get_config()
# csrf_token = config_infos['csrf_token']

# HTTP headers mimicking a desktop Chrome browser; the Referer/Host pair is
# required by music.163.com.
header = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,gl;q=0.6,zh-TW;q=0.4',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Host': 'music.163.com',
    'Referer': 'http://music.163.com/',
    'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36'
}
cookies = {'appver': '1.5.2'}

# proxylist = [{}, {'http': '138.197.118.48:80'}, {'http': '104.196.224.28:80'}]
proxylist = [None]  # None == direct connection; random.choice picks one per request

retry_times = 3  # max attempts for each network call below


def user_login(username, password):
    """
    User login API (cellphone login).

    Args:
        username: account name (cellphone number).
        password: plain-text password; md5-hashed here before encryption.
    Returns:
        result: a json obj of user data, or -1 on any request/parse failure.
    """
    base_url = 'https://music.163.com/weapi/login/cellphone'
    # NOTE(review): login_url is assigned but never used.
    login_url = 'https://music.163.com/weapi/login/'
    password = hashlib.md5(password).hexdigest()
    text = {
        'phone': username,
        'password': password,
        'rememberLogin': 'true'
    }
    data = encrypted_request(text)
    # s = requests.session()
    # s.headers = header
    try:
        res = requests.post(base_url, data=data, headers=header, proxies=random.choice(proxylist)).content
        result = json.loads(res)
        return result
    except Exception, e:
        data_process_logger.error('%s login failed, reason = %s' % (username, e))
        return -1


def playlist_detail(playlist_id, limit=1000):
    """
    Get playlist details by playlist id.

    Args:
        playlist_id: id of the playlist.
        limit: max number of songs (API cap is 1000).
    Returns:
        The 'result' sub-object of the response on success, -1 after
        retry_times failed attempts.
    """
    for i in range(retry_times):
        try:
            base_url = 'http://music.163.com/api/playlist/detail?id=%s&limit=%s' % (playlist_id, limit)
            res = requests.get(base_url, headers=header).content
            # print res
            jsonobj = json.loads(res)
            if jsonobj['code'] == 200:
                return json.loads(res)['result']
            else:
                # Non-200 code: log and fall through to the next retry.
                data_process_logger.error('error! result = %s' % res)
        except Exception, e:
            data_process_logger.error('%s playlist failed, reason = %s' % (playlist_id, e))
            data_process_logger.warn('%s playlist retrying...' % (playlist_id))
            continue
    return -1


def user_playlist(uid, offset=0, limit=1000):
    """
    Get the playlist ids of a user by uid.

    :param uid: user id
    :param offset: paging offset
    :param limit: max playlists returned
    :return: list of playlists on success, -1 after retry_times failures
    """
    base_url = 'http://music.163.com/api/user/playlist/?offset=%s&limit=%s&uid=%s' % (offset, limit, uid)
    # data = {'offset': offset, 'limit': limit, 'uid': uid}
    for i in range(retry_times):
        try:
            # data = urlencode(data)
            res = requests.get(base_url, headers=header).content
            data = json.loads(res)
            return data['playlist']
        except (requests.exceptions.RequestException, KeyError) as e:
            data_process_logger.error(e)
            data_process_logger.warn('retrying...')
    return -1


def song_detail(song_ids, offset=0):
    """
    Get song details in bulk from a list of song ids (at most 100 per call).

    :param song_ids: list of song ids
    :param offset: start index into song_ids
    :return: list of song dicts (ordered like the input ids), [] on failure
    """
    tmpids = song_ids[offset:]
    tmpids = tmpids[0:100]  # API accepts at most 100 ids per request
    tmpids = list(map(str, tmpids))
    base_url = 'http://music.163.com/api/song/detail?ids=[%s]' % (  # NOQA
        ','.join(tmpids))
    for i in range(retry_times):
        try:
            data = json.loads(requests.get(base_url).content)
            # the order of data['songs'] is no longer the same as tmpids,
            # so just make the order back
            data['songs'].sort(key=lambda song: tmpids.index(str(song['id'])))
            return data['songs']
        except requests.exceptions.RequestException as e:
            data_process_logger.error(e)
            data_process_logger.warn('retrying...')
    return []


def user_profile(uid):
    """
    Get user details by uid (indirectly, via the 'creator' field of the
    user's playlist listing).

    :param uid: user id
    :return: creator profile dict on success, -1 after retry_times failures
    """
    # upl = user_playlist(uid, limit=1)
    base_url = 'http://music.163.com/api/user/playlist/?offset=%s&limit=%s&uid=%s' % (0, 0, uid)
    # data = {'offset': offset, 'limit': limit, 'uid': uid}
    for i in range(retry_times):
        try:
            # data = urlencode(data)
            res = requests.get(base_url, headers=header, proxies=random.choice(proxylist)).content
            data = json.loads(res)
            return data['playlist'][0]['creator']
        except (requests.exceptions.RequestException, KeyError) as e:
            data_process_logger.error(e)
            data_process_logger.warn('retrying...')
            continue
    return -1


def user_follows(uid):
    """
    Get the follow list of a user by uid.

    :param uid: user id
    :return: list of followed users on success, -1 after retry_times failures
    """
    # upl = user_playlist(uid, limit=1)
    base_url = 'http://music.163.com/weapi/user/getfollows/%s' % (uid)
    # data = {'offset': offset, 'limit': limit, 'uid': uid}
    for i in range(retry_times):
        try:
            # data = urlencode(data)
            text = {
                'limit': 1000, 'offset': 0
            }
            data = encrypted_request(text)
            res = requests.post(base_url, data=data, headers=header, proxies=random.choice(proxylist)).content
            data = json.loads(res)
            return data['follow']
        except (requests.exceptions.RequestException, KeyError) as e:
            data_process_logger.error(e)
            data_process_logger.warn('retrying...')
            continue
    return -1


def user_fans(uid):
    """
    Get the fan (follower) list of a user by uid.

    :param uid: user id
    :return: list of followers on success, -1 after retry_times failures
    """
    # upl = user_playlist(uid, limit=1)
    base_url = 'http://music.163.com/weapi/user/getfolloweds/'
    # data = {'offset': offset, 'limit': limit, 'uid': uid}
    for i in range(retry_times):
        try:
            # data = urlencode(data)
            text = {
                'userId': uid,
                'limit': 1000, 'offset': 0
            }
            data = encrypted_request(text)
            res = requests.post(base_url, data=data, headers=header, proxies=random.choice(proxylist)).content
            data = json.loads(res)
            return data['followeds']
        except (requests.exceptions.RequestException, KeyError) as e:
            data_process_logger.error(e)
            data_process_logger.warn('retrying...')
            continue
    return -1


def song_comments(commentThreadId, limit=10, offset=0):
    """
    Get song comments by commentThreadId (e.g. 'R_SO_4_<song_id>').

    :return: full response dict when code == 200, -1 after retry_times failures
    """
    # upl = user_playlist(uid, limit=1)
    base_url = 'http://music.163.com/weapi/v1/resource/comments/%s/' % commentThreadId
    # data = {'offset': offset, 'limit': limit, 'uid': uid}
    for i in range(retry_times):
        try:
            # data = urlencode(data)
            text = {
                'limit': limit, 'offset': offset
            }
            data = encrypted_request(text)
            res = requests.post(base_url, data=data, headers=header, proxies=random.choice(proxylist)).content
            data = json.loads(res)
            if data['code'] == 200:
                return data
            else:
                print 'error, details = %s' % data
        except (requests.exceptions.RequestException, KeyError) as e:
            data_process_logger.error(e)
            data_process_logger.warn('retrying...')
            continue
    return -1


def search_web(s_name, type, limit=10):
    """
    Web search API.

    :param s_name: search keyword
    :param type: search type: 1 song; 10 album; 100 artist; 1000 playlist; 1002 user
    :param limit: max results
    :return: response dict on success, -1 after retry_times failures
    """
    data = {"s": s_name, "type": type, "limit": limit}
    search_url = 'http://music.163.com/api/search/get/web'
    for i in range(retry_times):
        try:
            d = urlencode(data)
            res = requests.post(search_url, d, headers=header, proxies=random.choice(proxylist)).content
            data = json.loads(res)
            return data
        except (requests.exceptions.RequestException, KeyError) as e:
            data_process_logger.error(e)
            data_process_logger.warn('retrying...')
    return -1


if __name__ == '__main__':
    pass
song_comments('R_SO_4_26612932') 268 | print 'd' 269 | # a = playlist_detail(326069502, limit=500) 270 | # print a 271 | # b = user_playlist('2886507', limit=1000) 272 | # print b 273 | # c = song_detail(['37239018', '23']) 274 | # d = user_profile('2886507') 275 | # f = user_follows('2886507') 276 | # fans = user_fans('2886507') 277 | # i = user_infos('2886507') 278 | # print search_web('jayvee he', '1002', 10) 279 | -------------------------------------------------------------------------------- /song2vec/song2vec_operator.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | """ 4 | Created by jayvee on 17/2/22. 5 | https://github.com/JayveeHe 6 | """ 7 | import pickle 8 | 9 | import cPickle 10 | from gensim import matutils 11 | from gensim.models.word2vec_inner import REAL 12 | from numpy.core.multiarray import ndarray, array, dot 13 | from sklearn.cluster import AffinityPropagation 14 | 15 | from utils.cloudmusic_api import playlist_detail 16 | from utils.logger_utils import data_process_logger 17 | 18 | 19 | class Song2VecOperator: 20 | def __init__(self, song2vec_model_path=None, artist2vec_model_path=None): 21 | """ 22 | 初始化,需要填入两种模型的地址 23 | Args: 24 | song2vec_model_path: 25 | artist2vec_model_path: 26 | """ 27 | try: 28 | if song2vec_model_path: 29 | with open(song2vec_model_path, 'rb') as s2v_file: 30 | self.song2vec_model = cPickle.load(s2v_file) 31 | print self.song2vec_model.estimate_memory() 32 | if artist2vec_model_path: 33 | with open(artist2vec_model_path, 'rb') as a2v_file: 34 | self.artist2vec_model = cPickle.load(a2v_file) 35 | print self.artist2vec_model.estimate_memory() 36 | self.song2vec_model.init_sims() 37 | self.artist2vec_model.init_sims() 38 | except IOError, ioe: 39 | print '%s' % ioe 40 | 41 | def calc_song_similar(self, positive_songs=[], negative_songs=[], 42 | positive_artists=[], negative_artists=[], 43 | song_weight=1.0, artist_weight=1.5, 44 | topn=10, 
restrict_vocab=None): 45 | """ 46 | 计算歌曲和歌手的加减相似度,求出最近似的歌曲top n 47 | Args: 48 | topn: 49 | restrict_vocab: 50 | artist_weight: 51 | song_weight: 52 | positive_songs: 53 | negative_songs: 54 | positive_artists: 55 | negative_artists: 56 | 57 | Returns: 58 | 59 | """ 60 | try: 61 | positive_songs = [(word, song_weight) for word in positive_songs] 62 | negative_songs = [(word, -song_weight) for word in negative_songs] 63 | positive_artists = [(word, artist_weight) for word in positive_artists] 64 | negative_artists = [(word, -artist_weight) for word in negative_artists] 65 | all_words, mean = set(), [] 66 | if positive_songs + negative_songs: 67 | for song, weight in positive_songs + negative_songs: 68 | song = song.strip() 69 | if isinstance(song, ndarray): 70 | mean.append(weight * song) 71 | elif song in self.song2vec_model.vocab: 72 | mean.append(weight * self.song2vec_model.syn0norm[self.song2vec_model.vocab[song].index]) 73 | all_words.add(self.song2vec_model.vocab[song].index) 74 | else: 75 | raise KeyError("song '%s' not in vocabulary" % song) 76 | # limited = self.song2vec_model.syn0norm if restrict_vocab is None \ 77 | # else self.song2vec_model.syn0norm[:restrict_vocab] 78 | if positive_artists + negative_artists: 79 | for artist, weight in positive_artists + negative_artists: 80 | if isinstance(word, ndarray): 81 | mean.append(weight * artist) 82 | elif word in self.artist2vec_model.vocab: 83 | mean.append(weight * self.artist2vec_model.syn0norm[self.artist2vec_model.vocab[artist].index]) 84 | all_words.add(self.artist2vec_model.vocab[artist].index) 85 | else: 86 | raise KeyError("artist '%s' not in vocabulary" % artist) 87 | if not mean: 88 | raise ValueError("cannot compute similarity with no input") 89 | mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) 90 | limited = self.song2vec_model.syn0norm if restrict_vocab is None \ 91 | else self.song2vec_model.syn0norm[:restrict_vocab] 92 | # limited += self.artist2vec_model.syn0norm if 
restrict_vocab is None \ 93 | # else self.artist2vec_model.syn0norm[:restrict_vocab] 94 | dists = dot(limited, mean) 95 | if not topn: 96 | return dists 97 | best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) 98 | # ignore (don't return) words from the input 99 | result = [(self.song2vec_model.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] 100 | return result[:topn] 101 | except Exception, e: 102 | print 'error = %s' % e 103 | raise e 104 | 105 | def calc_artist_similar(self, positive_songs=[], negative_songs=[], 106 | positive_artists=[], negative_artists=[], 107 | song_weight=1.0, artist_weight=1.5, 108 | topn=10, restrict_vocab=None): 109 | """ 110 | 计算歌曲和歌手的加减相似度,求出最近似的歌手top n 111 | Args: 112 | topn: 113 | restrict_vocab: 114 | artist_weight: 115 | song_weight: 116 | positive_songs: 117 | negative_songs: 118 | positive_artists: 119 | negative_artists: 120 | 121 | Returns: 122 | 123 | """ 124 | try: 125 | positive_songs = [(word, song_weight) for word in positive_songs] 126 | negative_songs = [(word, -song_weight) for word in negative_songs] 127 | positive_artists = [(word, artist_weight) for word in positive_artists] 128 | negative_artists = [(word, -artist_weight) for word in negative_artists] 129 | all_words, mean = set(), [] 130 | if positive_songs + negative_songs: 131 | for song, weight in positive_songs + negative_songs: 132 | if isinstance(song, ndarray): 133 | mean.append(weight * song) 134 | elif song in self.song2vec_model.vocab: 135 | mean.append(weight * self.song2vec_model.syn0norm[self.song2vec_model.vocab[song].index]) 136 | all_words.add(self.song2vec_model.vocab[song].index) 137 | else: 138 | raise KeyError("song '%s' not in vocabulary" % song) 139 | # limited = self.song2vec_model.syn0norm if restrict_vocab is None \ 140 | # else self.song2vec_model.syn0norm[:restrict_vocab] 141 | if positive_artists + negative_artists: 142 | for artist, weight in positive_artists + negative_artists: 143 | if 
isinstance(word, ndarray): 144 | mean.append(weight * artist) 145 | elif word in self.artist2vec_model.vocab: 146 | mean.append(weight * self.artist2vec_model.syn0norm[self.artist2vec_model.vocab[artist].index]) 147 | all_words.add(self.artist2vec_model.vocab[artist].index) 148 | else: 149 | raise KeyError("artist '%s' not in vocabulary" % artist) 150 | if not mean: 151 | raise ValueError("cannot compute similarity with no input") 152 | mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) 153 | limited = self.artist2vec_model.syn0norm if restrict_vocab is None \ 154 | else self.artist2vec_model.syn0norm[:restrict_vocab] 155 | # limited += self.artist2vec_model.syn0norm if restrict_vocab is None \ 156 | # else self.artist2vec_model.syn0norm[:restrict_vocab] 157 | dists = dot(limited, mean) 158 | if not topn: 159 | return dists 160 | best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) 161 | # ignore (don't return) words from the input 162 | result = [(self.artist2vec_model.index2word[sim], float(dists[sim])) for sim in best if 163 | sim not in all_words] 164 | return result[:topn] 165 | except Exception, e: 166 | print 'error = %s' % e 167 | raise e 168 | 169 | def cluster_song_in_playlist(self, playlist_id, cluster_n=5, is_detailed=False): 170 | """ 171 | 获取单个歌单内的歌曲聚类信息 172 | Args: 173 | playlist_id: 歌单id 174 | cluster_n:聚类数 175 | is_detailed: 返回的结果是否包含详情 176 | 177 | Returns: 178 | 聚类后的列表 179 | """ 180 | playlist_obj = playlist_detail(playlist_id) 181 | song_list = [] 182 | vec_list = [] 183 | song_info_dict = {} 184 | ap_cluster = AffinityPropagation() 185 | data_process_logger.info('clustering playlist: %s' % playlist_obj['name']) 186 | for item in playlist_obj['tracks']: 187 | song = item['name'].lower() 188 | song_info_dict[song] = { 189 | 'name': song, 190 | 'artist': item['artists'][0]['name'], 191 | 'id': item['id'], 192 | 'album_img_url': item['album']['picUrl'], 193 | 'site_url': 'http://music.163.com/#/song?id=%s' % 
item['id'] 194 | } 195 | # print song 196 | if song not in song_list: 197 | song_list.append(song) 198 | # print self.song2vec_model.vocab.get(song) 199 | # print self.song2vec_model.syn0norm == None 200 | if self.song2vec_model.vocab.get(song) and len(self.song2vec_model.syn0norm): 201 | song_vec = self.song2vec_model.syn0norm[self.song2vec_model.vocab[song].index] 202 | else: 203 | data_process_logger.warn( 204 | 'The song %s of playlist-%s is not in dataset' % (song, playlist_obj['name'])) 205 | song_vec = [0 for i in range(self.song2vec_model.vector_size)] 206 | vec_list.append(song_vec) 207 | # song_list = list(song_list) 208 | if len(vec_list) > 1: 209 | cluster_result = ap_cluster.fit(vec_list, song_list) 210 | cluster_array = [[] for i in range(len(cluster_result.cluster_centers_indices_))] 211 | for i in range(len(cluster_result.labels_)): 212 | label = cluster_result.labels_[i] 213 | index = i 214 | cluster_array[label].append(song_list[i]) 215 | return cluster_array, playlist_obj['name'], song_info_dict 216 | else: 217 | return [song_list], playlist_obj['name'], song_info_dict 218 | 219 | def cluster_artist_in_playlist(self, playlist_id, cluster_n=5, is_detailed=False): 220 | """ 221 | 获取单个歌单内的歌手聚类信息 222 | Args: 223 | playlist_id: 歌单id 224 | cluster_n:聚类数 225 | is_detailed: 是否包含详情信息 226 | 227 | Returns: 228 | 聚类后的列表 229 | """ 230 | playlist_obj = playlist_detail(playlist_id) 231 | artist_list = [] 232 | vec_list = [] 233 | ap_cluster = AffinityPropagation() 234 | data_process_logger.info('clustering playlist: %s' % playlist_obj['name']) 235 | for item in playlist_obj['tracks']: 236 | artist = item['artists'][0]['name'].lower() 237 | # print artist 238 | if artist not in artist_list: 239 | artist_list.append(artist) 240 | # print self.song2vec_model.vocab.get(artist) 241 | # print self.song2vec_model.syn0norm == None 242 | if self.artist2vec_model.vocab.get(artist) and len(self.artist2vec_model.syn0norm): 243 | artist_vec = 
self.artist2vec_model.syn0norm[self.artist2vec_model.vocab[artist].index] 244 | else: 245 | data_process_logger.warn( 246 | 'The artist %s of playlist-%s is not in dataset' % (artist, playlist_obj['name'])) 247 | artist_vec = [0 for i in range(self.artist2vec_model.vector_size)] 248 | vec_list.append(artist_vec) 249 | # artist_list = list(artist_list) 250 | # vec_list = list(vec_list) 251 | if len(vec_list) > 1: 252 | cluster_result = ap_cluster.fit(vec_list, artist_list) 253 | cluster_array = [[] for i in range(len(cluster_result.cluster_centers_indices_))] 254 | for i in range(len(cluster_result.labels_)): 255 | label = cluster_result.labels_[i] 256 | index = i 257 | cluster_array[label].append(artist_list[i]) 258 | return cluster_array, playlist_obj['name'], {} 259 | else: 260 | return [artist_list], playlist_obj['name'], {} 261 | 262 | 263 | if __name__ == '__main__': 264 | s2vo = Song2VecOperator(song2vec_model_path='../datas/[full]50d_20iter_10win_5min_song2vec.model', 265 | artist2vec_model_path='../datas/[full]50d_20iter_10win_5min_artist2vec.model') 266 | # res = s2vo.calc_song_artist_similar(positive_songs=[u'time machine', u'yellow', u'viva la vida'], 267 | # negative_songs=[], 268 | # positive_artists=[], 269 | # negative_artists=[], 270 | # artist_weight=1.0, topn=20) 271 | # for i in res: 272 | # print i[0], i[1] 273 | s2vo.cluster_song_in_playlist('3659853') 274 | # s2vo.cluster_artist_in_playlist('3659853') 275 | -------------------------------------------------------------------------------- /song2vec/rock_gensim.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | """ 4 | Created by jayvee on 17/2/16. 
https://github.com/JayveeHe
"""
import random

import gensim
import pickle
from gensim import corpora
from gensim.models import word2vec

from utils.cloudmusic_dao import CloudMusicDAO
from utils.logger_utils import data_process_logger


def prepare_song_dict(tag=''):
    """
    Iterate over the playlists in the DB to build the song2vec training data.

    Args:
        tag: tag string appended to the output file names.

    Returns:
        The gensim corpora.Dictionary built from the song-title sequences.
        Side effects: writes ../datas/song_dictionary_<tag>.dict and
        ../datas/songs_seq_<tag>.dat.
    """
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    print playlist_dao_inst.db_inst.find(
        {'trackCount': {'$gte': 3, '$lte': 1000}, 'playCount': {'$gte': 1}},
        {'tracks': 1, 'name': 1}).limit(100000).count()
    find_result = playlist_dao_inst.db_inst.find(
        {'trackCount': {'$gte': 3, '$lte': 1000}, 'playCount': {'$gte': 1}},
        {'tracks': 1, 'name': 1}).limit(100000)
    # Turn each playlist into a sequence of lower-cased song titles
    # (one playlist == one "sentence" for word2vec).
    total_song_set = []
    count = 0
    for item in find_result:
        data_process_logger.info('No.%s %s' % (count, item['name']))
        # Collect the song-title sequence of this playlist.
        song_seq = []
        for song in item['tracks']:
            sname = song['name']
            song_seq.append(sname.lower())
        total_song_set.append(song_seq)
        count += 1
    data_process_logger.info('start building dictionary')
    song_dictionary = corpora.Dictionary(total_song_set)
    print u'歌单数', song_dictionary.num_docs
    print u'歌曲数', song_dictionary.num_pos
    data_process_logger.info('start saving datas')
    song_dictionary.save('../datas/song_dictionary_%s.dict' % tag)
    pickle.dump(total_song_set, open('../datas/songs_seq_%s.dat' % tag, 'wb'))
    return song_dictionary


def train_song2vec_model(fout_path, input_datas=None, data_path=None, min_count=5, sorted_vocab=1, window=10, size=250,
                         iter_n=50):
    """
    Train the song2vec model and pickle it to fout_path.

    Args:
        fout_path: output path for the pickled model.
        input_datas: pre-loaded list of song-title sequences; when None,
            loaded from data_path.
        data_path: pickle file of sequences (used only if input_datas is None).
        min_count: word2vec min_count.
        sorted_vocab: word2vec sorted_vocab flag.
        window: word2vec context window.
        size: embedding dimensionality.
        iter_n: number of training epochs.

    Returns:
        None (the model is only written to disk).
        NOTE(review): the input is shuffled IN PLACE and truncated to the
        first 45000 playlists before training — confirm this sampling is
        intentional (train_artist2vec_model below does not do it).
    """
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    data_process_logger.info('start training')
    random.shuffle(input_datas)
    input_datas = input_datas[:45000]
    wv_model = gensim.models.Word2Vec(input_datas, min_count=min_count, sorted_vocab=sorted_vocab, window=window,
                                      size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        print 'model saved'


def prepare_artist_dict(tag=''):
    """
    Iterate over the playlists in the DB to build the artist2vec training data.

    Args:
        tag: tag string appended to the output file names.

    Returns:
        The gensim corpora.Dictionary built from the artist-name sequences.
        Side effects: writes ../datas/artists_dictionary_<tag>.dict and
        ../datas/artists_seq_<tag>.dat.
    """
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    # print playlist_dao_inst.db_inst.find(
    #     {'trackCount': {'$gte': 10, '$lte': 600}, 'playCount': {'$gte': 10}},
    #     {'name': 1}).limit(100000).count()
    find_result = playlist_dao_inst.db_inst.find(
        {'trackCount': {'$gte': 10, '$lte': 600}, 'playCount': {'$gte': 5}},
        {'tracks': 1, 'name': 1}).limit(100000)
    # Turn each playlist into a sequence of lower-cased artist names
    # (first artist of every track only).
    total_artists_set = []
    count = 0
    for item in find_result:
        data_process_logger.info('No.%s %s' % (count, item['name']))
        # Collect the artist-name sequence of this playlist.
        artists_seq = []
        for song in item['tracks']:
            sname = song['artists'][0]['name']
            artists_seq.append(sname.lower())
        total_artists_set.append(artists_seq)
        count += 1
    data_process_logger.info('start building dictionary')
    artist_dictionary = corpora.Dictionary(total_artists_set)
    print u'歌单数', artist_dictionary.num_docs
    try:
        print u'歌手数', len(artist_dictionary.token2id)
    except Exception, e:
        print 'error = %s' % e
    data_process_logger.info('start saving datas')
    artist_dictionary.save('../datas/artists_dictionary_%s.dict' % tag)
    pickle.dump(total_artists_set, open('../datas/artists_seq_%s.dat' % tag, 'wb'))
    return artist_dictionary


def train_artist2vec_model(fout_path, input_datas=None, data_path=None, min_count=5, sorted_vocab=1, window=10,
                           size=250,
                           iter_n=50):
    """
    Train the artist2vec model and pickle it to fout_path.

    Parameters mirror train_song2vec_model (no shuffling/truncation here).
    """
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    data_process_logger.info('start training')
    wv_model = gensim.models.Word2Vec(input_datas, min_count=min_count, sorted_vocab=sorted_vocab, window=window,
                                      size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        print 'model saved'


def train_artistsong2vec_model(fout_path, input_datas=None, data_path=None,
                               min_count=5, sorted_vocab=1, window=10,
                               size=250,
                               iter_n=50):
    """
    Train a combined model where each sequence interleaves the two fields of
    every input pair (presumably (artist, song) tuples — TODO confirm against
    the data producer).

    Parameters mirror train_song2vec_model.
    """
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    # Flatten each list of pairs into [a0, b0, a1, b1, ...].
    full_data = []
    for i in input_datas:
        tmp = []
        for j in i:
            tmp.append(j[0])
            tmp.append(j[1])
        full_data.append(tmp)
    data_process_logger.info('start training')
    wv_model = gensim.models.Word2Vec(full_data, min_count=min_count, sorted_vocab=sorted_vocab, window=window,
                                      size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        print 'model saved'


def test_song2vec():
    """Manual end-to-end check: train a small song2vec model, reload it and
    print a few similarity queries."""
    tag = 'full'
    # prepare_song_dict(tag=tag)
    min_count = 5
    sorted_vocab = 1
    window = 10
    size = 50
    iter_n = 20
    modelpath = '../datas/[%s_reduced]%sd_%siter_%swin_%smin_song2vec.model' % (tag, size, iter_n, window, min_count)
    train_song2vec_model(fout_path=modelpath, data_path='../datas/songs_seq_%s.dat' % tag,
                         min_count=min_count,
                         sorted_vocab=sorted_vocab, window=window,
                         size=size, iter_n=iter_n)
    print 'model params:\tag: %s\tnmin: %s\twin: %s\tsize: %s\titer_n: %s' % (tag, min_count, window, size, iter_n)
    with open(modelpath, 'rb') as fin:
        data_process_logger.info('loading')
        m = pickle.load(fin)
        data_process_logger.info('start predicting')
        s1, s2 = u'半岛铁盒', u'成都'.lower()
        print u'%s 与 %s 的相似度为: %.4f' % (s1, s2, m.similarity(s1, s2))
        s1, s2 = u'viva la vida', u'yellow'
        print u'%s 与 %s 的相似度为: %.4f' % (s1, s2, m.similarity(s1, s2))
        s1, s2 = u'夜空中最亮的星', u'南山南'
        print u'%s 与 %s 的相似度为: %.4f' % (s1, s2, m.similarity(s1, s2))
        s1, s2 = u'photograph', u'need you now'
        print u'%s 与 %s 的相似度为: %.4f' % (s1, s2, m.similarity(s1, s2))
        print '---------------'
        tsong = u'告白气球'
        print u'%s 最相似的歌曲:' % tsong
        for i in m.most_similar(tsong, topn=20):
            print i[0], i[1]
        print '---------------'
        tsong = u'晴天'.lower()
        print u'%s 最相似的歌曲:' % tsong
        for i in m.most_similar(tsong, topn=20):
            print i[0], i[1]
        print '---------------'
        tsong = u'are you ok'.lower()
        print u'%s 最相似的歌曲:' % tsong
        for i in m.most_similar(tsong, topn=20):
            print i[0], i[1]
        print '---------------'
        tsong = u'How To Save A Life - New Album Version'.lower()
        print u'%s 最相似的歌曲:' % tsong
        for i in m.most_similar(tsong, topn=20):
            print i[0], i[1]
        print '---------------'
        tsong = u'往南'.lower()
        print u'%s 最相似的歌曲:' % tsong
        for i in m.most_similar(tsong, topn=20):
            print i[0], i[1]
        print '=============='
        add_arr = [u'晴天', u'雨天', u'欧若拉']
        minus_arr = [u'说爱你']
        line = '+'.join(add_arr)
        line += '-' + '-'.join(minus_arr)
        print line
        for i in m.most_similar(positive=add_arr, negative=minus_arr):
            print i[0], i[1]


def test_artist2vec():
    """Manual end-to-end check for the artist2vec model (definition continues
    beyond this view)."""
    tag = 'full'
    min_count = 5
    sorted_vocab = 1
    window = 10
    size = 50
    iter_n = 20
    # prepare_artist_dict(tag=tag)
    modelpath = '../datas/[%s]%sd_%siter_%swin_%smin_artist2vec.model' % (tag, size, iter_n, window, min_count)
    print 'model params:\tag: %s\tnmin: %s\twin: %s\tsize: %s\titer_n: %s' % (tag, min_count, window, size, iter_n)
    #
train_artist2vec_model(fout_path=modelpath, data_path='../datas/artists_seq_%s.dat' % tag, 227 | # min_count=min_count, 228 | # sorted_vocab=sorted_vocab, window=window, 229 | # size=size, iter_n=iter_n) 230 | with open(modelpath, 'rb') as fin: 231 | m = pickle.load(fin) 232 | s1, s2 = u'周杰伦', u'王力宏'.lower() 233 | print u'%s 与 %s 的相似度为: %.4f' % (s1, s2, m.similarity(s1, s2)) 234 | s1, s2 = u'蔡依林', u'梁静茹' 235 | print u'%s 与 %s 的相似度为: %.4f' % (s1, s2, m.similarity(s1, s2)) 236 | s1, s2 = u'梁静茹', u'孙燕姿' 237 | print u'%s 与 %s 的相似度为: %.4f' % (s1, s2, m.similarity(s1, s2)) 238 | print '---------------' 239 | tsong = u'老狼' 240 | print u'%s 最相似的歌手:' % tsong 241 | for i in m.most_similar(tsong, topn=20): 242 | print i[0], i[1] 243 | print '---------------' 244 | tsong = u'周杰伦'.lower() 245 | print u'%s 最相似的歌手:' % tsong 246 | for i in m.most_similar(tsong, topn=20): 247 | print i[0], i[1] 248 | print '---------------' 249 | tsong = u'蔡依林'.lower() 250 | print u'%s 最相似的歌手:' % tsong 251 | for i in m.most_similar(tsong, topn=20): 252 | print i[0], i[1] 253 | print '---------------' 254 | tsong = u's.h.e'.lower() 255 | print u'%s 最相似的歌手:' % tsong 256 | for i in m.most_similar(tsong, topn=20): 257 | print i[0], i[1] 258 | print '---------------' 259 | tsong = u'spyair'.lower() 260 | print u'%s 最相似的歌手:' % tsong 261 | for i in m.most_similar(tsong, topn=20): 262 | print i[0], i[1] 263 | print '==============' 264 | # add_arr = [u'晴天', u'布拉格广场', u'去大理'] 265 | # minus_arr = [u'faded'.lower(), u'时间煮雨', u'爱的供养'] 266 | # line = '+'.join(add_arr) 267 | # line += '-' + '-'.join(minus_arr) 268 | # print line 269 | # for i in m.most_similar(positive=add_arr, negative=minus_arr): 270 | # print i[0], i[1] 271 | 272 | 273 | def prepare_song_artist_dict(tag=''): 274 | playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists') 275 | print playlist_dao_inst.db_inst.find( 276 | {'trackCount': {'$gte': 5, '$lte': 1000}, 'playCount': {'$gte': 5}}, 277 | {'tracks': 1, 'name': 
1}).limit(100000).count() 278 | find_result = playlist_dao_inst.db_inst.find( 279 | {'trackCount': {'$gte': 5, '$lte': 1000}, 'playCount': {'$gte': 5}}, 280 | {'tracks': 1, 'name': 1}).limit(100000) 281 | # 将歌单中的歌曲名组合成歌曲名序列 282 | total_song_artist_set = [] 283 | count = 0 284 | for item in find_result: 285 | data_process_logger.info('No.%s %s' % (count, item['name'])) 286 | # 保存歌单中的歌曲序列 287 | song_artist_seq = [] 288 | for song in item['tracks']: 289 | sname = song['name'] 290 | artist = song['artists'][0]['name'].lower() 291 | song_artist_seq.append((sname.lower(), artist)) 292 | total_song_artist_set.append(song_artist_seq) 293 | count += 1 294 | data_process_logger.info('start building dictionary') 295 | # song_dictionary = corpora.Dictionary(total_song_artist_set) 296 | # print u'歌单数', song_dictionary.num_docs 297 | # print u'歌曲数', song_dictionary.num_pos 298 | data_process_logger.info('start saving datas') 299 | # song_dictionary.save('../datas/song_artist_dictionary_%s.dict' % tag) 300 | pickle.dump(total_song_artist_set, open('../datas/songs_artists_seq_%s.dat' % tag, 'wb')) 301 | # return song_dictionary 302 | 303 | 304 | def test_artistsong2vec(): 305 | tag = 'full' 306 | min_count = 5 307 | sorted_vocab = 1 308 | window = 10 309 | size = 50 310 | iter_n = 20 311 | # prepare_artist_dict(tag=tag) 312 | modelpath = '../datas/[%s]%sd_%siter_%swin_%smin_artistsong2vec.model' % (tag, size, iter_n, window, min_count) 313 | print 'model params:\tag: %s\tnmin: %s\twin: %s\tsize: %s\titer_n: %s' % (tag, min_count, window, size, iter_n) 314 | # train_artistsong2vec_model(fout_path=modelpath, data_path='../datas/songs_artists_seq_%s.dat' % tag) 315 | with open(modelpath, 'rb') as fin: 316 | m = pickle.load(fin) 317 | s1, s2 = u'周杰伦', u'蔡依林'.lower() 318 | print u'%s 与 %s 的相似度为: %.4f' % (s1, s2, m.similarity(s1, s2)) 319 | s1, s2 = u'周杰伦', u'东风破' 320 | print u'%s 与 %s 的相似度为: %.4f' % (s1, s2, m.similarity(s1, s2)) 321 | s1, s2 = u'梁静茹', u'孙燕姿' 322 | print u'%s 与 %s 
的相似度为: %.4f' % (s1, s2, m.similarity(s1, s2)) 323 | print '---------------' 324 | tsong = u'你听得到' 325 | print u'%s 最相似的歌手:' % tsong 326 | for i in m.most_similar(tsong, topn=20): 327 | print i[0], i[1] 328 | print '---------------' 329 | tsong = u'周杰伦'.lower() 330 | print u'%s 最相似的歌手:' % tsong 331 | for i in m.most_similar(tsong, topn=20): 332 | print i[0], i[1] 333 | print '---------------' 334 | tsong = u'蔡依林'.lower() 335 | print u'%s 最相似的歌手:' % tsong 336 | for i in m.most_similar(tsong, topn=20): 337 | print i[0], i[1] 338 | print '---------------' 339 | tsong = u'雷军'.lower() 340 | print u'%s 最相似的歌手:' % tsong 341 | for i in m.most_similar(tsong, topn=20): 342 | print i[0], i[1] 343 | print '---------------' 344 | tsong = u'王力宏'.lower() 345 | print u'%s 最相似的歌手:' % tsong 346 | for i in m.most_similar_cosmul(tsong, topn=20): 347 | print i[0], i[1] 348 | print '==============' 349 | add_arr = [u'周杰伦', u'王力宏', u'王力宏'] 350 | minus_arr = [u'晴天', u'回到过去'] 351 | line = '+'.join(add_arr) 352 | line += '-' + '-'.join(minus_arr) 353 | print line 354 | for i in m.most_similar(positive=add_arr, negative=minus_arr): 355 | print i[0], i[1] 356 | 357 | 358 | if __name__ == '__main__': 359 | test_song2vec() 360 | # test_artist2vec() 361 | # prepare_song_artist_dict('full') 362 | # test_artistsong2vec() 363 | -------------------------------------------------------------------------------- /api_server/templates/demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | MusicTaster 6 | 7 | 8 | 9 | {# #} 10 | {# #} 11 | {# #} 12 | {# #} 13 | {# #} 14 | {# #} 15 | 16 | 17 | 18 | {# #} 19 | 20 | 33 | 34 | 35 | 36 |
37 |
38 |
39 |

40 | MusicTaster 41 |

42 | 43 |

44 | A demo based on Song2Vec and Artist2Vec 45 |

46 |

47 | 详情见博客《MusicTaster——一种Song2Vec和Artist2Vec的实践》 49 |

50 |

51 | 欢迎Star@Github https://github.com/JayveeHe/MusicTaster 53 |

54 |
55 |
56 |
57 | {#
#} 58 | {#
#} 59 | {#
#} 60 | {# 摘要句数:#} 61 | {#
#} 62 | {#
#} 63 | {# #} 64 | {#
#} 65 | {#
#} 66 | {# #} 67 | {#
#} 68 | {#
#} 69 | {#
#} 70 |
71 |
72 |
73 |
74 |
75 | 歌单聚类: 76 |
77 | 请输入网易云音乐的歌单地址 78 |
79 | 81 | 82 |
83 |
84 | 85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 | 歌曲风格: 97 |
98 | 找出与给定歌曲(可多个)的风格近似的其他歌曲 99 |
100 |
101 |
102 |
103 | 105 | 结果数: 106 |
107 | 108 |
109 |
110 |
111 |
112 | 145 | 176 |
177 | 392 | 441 | 442 | 443 | --------------------------------------------------------------------------------