├── crawler
│   ├── msn
│   │   ├── __init__.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   └── msn_spider.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── middlewares.py
│   ├── scrapy.cfg
│   ├── README.md
│   └── environment.yaml
├── .gitignore
├── prediction.zip
├── MSR License_Data.pdf
├── MICROSOFT MIND NEWS RECOMMENDATION CONTEST.pdf
├── README.md
└── evaluate.py

/crawler/msn/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
--------------------------------------------------------------------------------
/prediction.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msnews/MIND/HEAD/prediction.zip
--------------------------------------------------------------------------------
/MSR License_Data.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msnews/MIND/HEAD/MSR License_Data.pdf
--------------------------------------------------------------------------------
/MICROSOFT MIND NEWS RECOMMENDATION CONTEST.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msnews/MIND/HEAD/MICROSOFT MIND NEWS RECOMMENDATION CONTEST.pdf
--------------------------------------------------------------------------------
/crawler/msn/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MIND

MIND (Microsoft News Dataset) is a large-scale English dataset for news recommendation research. This repository provides the script to evaluate models and a sample submission for the MIND News Recommendation Challenge.
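For reference, `evaluate.py` reads a ground-truth file and a submission file line by line; each line is an impression id followed by a JSON list (click labels in the truth file, predicted ranks in the submission), and ranks are turned into scores as `1/rank` before AUC, MRR, nDCG@5 and nDCG@10 are computed. The sketch below scores one hypothetical impression with three candidate news items (the id, labels and ranks are made up for illustration only):

```python
import json
import numpy as np
from sklearn.metrics import roc_auc_score

# Hypothetical lines in the "<impression id> <JSON list>" format parsed by evaluate.py
truth_line = "1 [1,0,0]"       # click labels of the three candidates
prediction_line = "1 [1,3,2]"  # predicted ranks (1 = most likely to be clicked)

_, labels = truth_line.strip('\n').split()
_, ranks = prediction_line.strip('\n').split()
labels, ranks = json.loads(labels), json.loads(ranks)

y_true = np.array(labels, dtype='float32')
y_score = [1.0 / r for r in ranks]     # same rank-to-score mapping as evaluate.py
print(roc_auc_score(y_true, y_score))  # 1.0 for this toy impression
```

To score a full submission, `evaluate.py` expects an input directory containing `ref/truth.txt` and `res/prediction.txt`, and writes the averaged metrics to `scores.txt` in the output directory (`python evaluate.py <input_dir> <output_dir>`).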
--------------------------------------------------------------------------------
/crawler/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = msn.settings

[deploy]
#url = http://localhost:6800/
project = msn
--------------------------------------------------------------------------------
/crawler/msn/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class NewsItem(scrapy.Item):
    nid = scrapy.Field()
    body = scrapy.Field()
--------------------------------------------------------------------------------
/crawler/msn/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class TutorialPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/crawler/README.md:
--------------------------------------------------------------------------------
# MSN News Crawler

This is a crawler that crawls news features such as body, title, vert and subvert from the MSN news website, given several news URLs.

Built with Scrapy 2.0 and Python 3.7:

`https://docs.scrapy.org/en/latest/intro/tutorial.html`


The conda environment is exported to `environment.yaml`. To create the conda environment, please run:

```bash
conda env create -f environment.yaml
conda activate scrapy
```

Set the environment variable `MIND_NEWS_PATH` to the path of your `news.tsv`:

```bash
export MIND_NEWS_PATH=[your path]
```

Run the crawler:

```bash
scrapy crawl msn -o msn.json
```

The crawled URLs are read into the `start_urls` list of `msn/spiders/msn_spider.py`. You can add more URLs there.

To add more news features, please edit `items.py` and `msn_spider.py`, for example as sketched below.
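A minimal sketch, assuming a hypothetical `title` feature (the `og:title` XPath is an illustrative guess, not something taken from the archived MSN pages):

```python
import scrapy

# items.py: declare the extra field next to nid and body
class NewsItem(scrapy.Item):
    nid = scrapy.Field()
    body = scrapy.Field()
    title = scrapy.Field()  # hypothetical new feature

# msn_spider.py: inside parse(), fill the new field from the response, e.g.
#   item['title'] = response.xpath('//meta[@property="og:title"]/@content').get()
# (adjust the XPath to the actual markup of the pages you crawl)
```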
--------------------------------------------------------------------------------
/crawler/msn/spiders/msn_spider.py:
--------------------------------------------------------------------------------
import scrapy
from urllib.parse import unquote
from ..items import NewsItem
import os
import json

class MSNSpider(scrapy.Spider):
    name = "msn"
    allowed_domains = ["msn.com"]

    # The 6th column of each news.tsv line is the article URL.
    start_urls = []
    with open(os.environ["MIND_NEWS_PATH"], 'r') as f:
        for l in f:
            _, _, _, _, _, url, _, _ = l.strip('\n').split('\t')
            start_urls.append(url)

    # start_urls = [
    #     # ss
    #     "https://mind201910small.blob.core.windows.net/archive/AAGH0ET.html",
    #     # ar
    #     "https://mind201910small.blob.core.windows.net/archive/AABmf2I.html",
    #     # vi
    #     "https://mind201910small.blob.core.windows.net/archive/AAI33em.html"
    # ]

    def __init__(self):
        # doc_type.json maps each news id to its page type:
        # 'ar' (article), 'ss' (slideshow/gallery) or 'vi' (video)
        with open('./doc_type.json', 'r') as f:
            self.doc_type = json.load(f)

        super().__init__()

    def parse(self, response):

        url = unquote(response.url)
        item = NewsItem()
        # parse nid from the url and look up its doc type
        nid_type = self.parse_nid_from_url(item, url)

        # parse body from response
        self.parse_body(response, item, nid_type)

        yield item

    def parse_nid_from_url(self, item, url):
        # e.g. ".../archive/AAGH0ET.html" -> nid "AAGH0ET"
        item['nid'] = url.split('/')[-1].split('.')[-2]
        return self.doc_type[item['nid']]

    def parse_body(self, response, item, nid_type):

        # if metadata contains a description, it could be taken as the first sentence
        # body_desc = response.xpath('//meta[@name="description"]/@content')[0].extract()

        body = []

        # type1: ar-nid
        if nid_type == 'ar':
            body = response.xpath('//p/text()').getall()

        # type2: ss
        elif nid_type == 'ss':
            body = response.xpath('//div[@class="gallery-caption-text"]//text()').getall()

        # type3: vi
        elif nid_type == 'vi':
            body = response.xpath('//div[@class="video-description"]//text()').getall()

        item['body'] = body
--------------------------------------------------------------------------------
/crawler/environment.yaml:
--------------------------------------------------------------------------------
name: scrapy
channels:
  - conda-forge
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - attrs=19.3.0=py_0
  - automat=20.2.0=py_0
  - backcall=0.1.0=py37_0
  - bcrypt=3.1.7=py37h516909a_0
  - ca-certificates=2020.1.1=0
  - certifi=2019.11.28=py37_0
  - cffi=1.14.0=py37hd463f26_0
  - constantly=15.1.0=py_0
  - cryptography=2.8=py37h72c5cf5_1
  - cssselect=1.1.0=py_0
  - decorator=4.4.1=py_0
  - hyperlink=17.3.1=py_0
  - icu=64.2=he1b5a44_1
  - idna=2.9=py_1
  - importlib_metadata=1.5.0=py37_0
  - incremental=17.5.0=py_0
  - ipython=7.12.0=py37h5ca1d4c_0
  - ipython_genutils=0.2.0=py37_0
  - jedi=0.16.0=py37_0
  - ld_impl_linux-64=2.33.1=h53a641e_7
  - libedit=3.1.20181209=hc058e9b_0
  - libffi=3.2.1=hd88cf55_4
  - libgcc-ng=9.1.0=hdf63c60_0
  - libiconv=1.15=h516909a_1005
  - libstdcxx-ng=9.1.0=hdf63c60_0
  - libxml2=2.9.10=hee79883_0
  - libxslt=1.1.33=h31b3aaa_0
  - lxml=4.5.0=py37h7ec2d77_0
  - more-itertools=8.2.0=py_0
  - ncurses=6.2=he6710b0_0
  - openssl=1.1.1d=h7b6447c_4
  - packaging=20.1=py_0
  - parsel=1.5.2=py37_0
  - parso=0.6.1=py_0
  - pexpect=4.8.0=py37_0
  - pickleshare=0.7.5=py37_0
  - pip=20.0.2=py37_1
  - pluggy=0.12.0=py_0
  - prompt_toolkit=3.0.3=py_0
  - protego=0.1.16=py_0
  - ptyprocess=0.6.0=py37_0
  - py=1.8.1=py_0
  - pyasn1=0.4.8=py_0
  - pyasn1-modules=0.2.7=py_0
  - pycparser=2.20=py_0
  - pydispatcher=2.0.5=py_1
  - pygments=2.5.2=py_0
  - pyhamcrest=2.0.2=py_0
  - pyopenssl=19.1.0=py_1
  - pyparsing=2.4.6=py_0
  - pytest=5.3.5=py37_1
  - pytest-runner=5.2=py_0
  - python=3.7.6=h0371630_2
  - queuelib=1.5.0=py37_0
  - readline=7.0=h7b6447c_5
  - scrapy=2.0.0=py37_0
  - service_identity=18.1.0=py_0
  - setuptools=45.2.0=py37_0
  - six=1.14.0=py37_0
  - sqlite=3.31.1=h7b6447c_0
  - tk=8.6.8=hbc83047_0
  - traitlets=4.3.3=py37_0
  - twisted=19.10.0=py37h516909a_0
  - w3lib=1.20.0=py_0
  - wcwidth=0.1.8=py_0
  - wheel=0.34.2=py37_0
  - xz=5.2.4=h14c3975_4
  - zipp=3.1.0=py_0
  - zlib=1.2.11=h7b6447c_3
  - zope.interface=4.7.1=py37h516909a_0
prefix: /home/v-jinyi/.conda/envs/scrapy
--------------------------------------------------------------------------------
/crawler/msn/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for the msn project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'msn_news_spider'

SPIDER_MODULES = ['msn.spiders']
NEWSPIDER_MODULE = 'msn.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'msn (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'msn.middlewares.TutorialSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'msn.middlewares.TutorialDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'msn.pipelines.TutorialPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
REDIRECT_ENABLED = False
--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import sys, os, os.path
import numpy as np
import json
from sklearn.metrics import roc_auc_score

def dcg_score(y_true, y_score, k=10):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)


def ndcg_score(y_true, y_score, k=10):
    best = dcg_score(y_true, y_true, k)
    actual = dcg_score(y_true, y_score, k)
    return actual / best


def mrr_score(y_true, y_score):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order)
    rr_score = y_true / (np.arange(len(y_true)) + 1)
    return np.sum(rr_score) / np.sum(y_true)

def parse_line(l):
    impid, ranks = l.strip('\n').split()
    ranks = json.loads(ranks)
    return impid, ranks

def scoring(truth_f, sub_f):
    aucs = []
    mrrs = []
    ndcg5s = []
    ndcg10s = []

    line_index = 1
    for lt in truth_f:
        ls = sub_f.readline()
        impid, labels = parse_line(lt)

        # ignore masked impressions
        if labels == []:
            continue

        if ls == '':
            # missing line in the submission: default every candidate to rank 1
            sub_impid = impid
            sub_ranks = [1] * len(labels)
        else:
            try:
                sub_impid, sub_ranks = parse_line(ls)
            except Exception:
                raise ValueError("line-{}: Invalid Input Format!".format(line_index))

        if sub_impid != impid:
            raise ValueError("line-{}: Inconsistent Impression Id {} and {}".format(
                line_index,
                sub_impid,
                impid
            ))

        lt_len = float(len(labels))

        y_true = np.array(labels, dtype='float32')
        y_score = []
        for rank in sub_ranks:
            score_rslt = 1. / rank
            if score_rslt < 0 or score_rslt > 1:
                raise ValueError("Line-{}: ranks should be integers from 1 to {}".format(
                    line_index,
                    lt_len
                ))
            y_score.append(score_rslt)

        auc = roc_auc_score(y_true, y_score)
        mrr = mrr_score(y_true, y_score)
        ndcg5 = ndcg_score(y_true, y_score, 5)
        ndcg10 = ndcg_score(y_true, y_score, 10)

        aucs.append(auc)
        mrrs.append(mrr)
        ndcg5s.append(ndcg5)
        ndcg10s.append(ndcg10)

        line_index += 1

    return np.mean(aucs), np.mean(mrrs), np.mean(ndcg5s), np.mean(ndcg10s)


if __name__ == '__main__':
    input_dir = sys.argv[1]
    output_dir = sys.argv[2]

    submit_dir = os.path.join(input_dir, 'res')
    truth_dir = os.path.join(input_dir, 'ref')

    if not os.path.isdir(submit_dir):
        print("%s doesn't exist" % submit_dir)

    if os.path.isdir(submit_dir) and os.path.isdir(truth_dir):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        output_filename = os.path.join(output_dir, 'scores.txt')
        output_file = open(output_filename, 'w')

        truth_file = open(os.path.join(truth_dir, "truth.txt"), 'r')
        submission_answer_file = open(os.path.join(submit_dir, "prediction.txt"), 'r')

        auc, mrr, ndcg, ndcg10 = scoring(truth_file, submission_answer_file)

        output_file.write("AUC:{:.4f}\nMRR:{:.4f}\nnDCG@5:{:.4f}\nnDCG@10:{:.4f}".format(auc, mrr, ndcg, ndcg10))
        output_file.close()
--------------------------------------------------------------------------------
/crawler/msn/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class TutorialSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class TutorialDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------