├── crawler
│   ├── msn
│   │   ├── __init__.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   └── msn_spider.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── middlewares.py
│   ├── scrapy.cfg
│   ├── README.md
│   └── environment.yaml
├── .gitignore
├── prediction.zip
├── MSR License_Data.pdf
├── MICROSOFT MIND NEWS RECOMMENDATION CONTEST.pdf
├── README.md
└── evaluate.py

/crawler/msn/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
--------------------------------------------------------------------------------
/prediction.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msnews/MIND/HEAD/prediction.zip
--------------------------------------------------------------------------------
/MSR License_Data.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msnews/MIND/HEAD/MSR License_Data.pdf
--------------------------------------------------------------------------------
/MICROSOFT MIND NEWS RECOMMENDATION CONTEST.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msnews/MIND/HEAD/MICROSOFT MIND NEWS RECOMMENDATION CONTEST.pdf
--------------------------------------------------------------------------------
/crawler/msn/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MIND

MIND (Microsoft News Dataset) is a large-scale English dataset for news recommendation research. This repository provides the script to evaluate models and a sample submission for the MIND News Recommendation Challenge.
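For reference, `evaluate.py` reads a ground-truth file and a submission file line by line; each line is an impression id followed by a JSON list (click labels in the truth file, predicted ranks in the submission), and ranks are turned into scores as `1/rank` before AUC, MRR, nDCG@5 and nDCG@10 are computed. The sketch below scores one hypothetical impression with three candidate news items (the id, labels and ranks are made up for illustration only):

```python
import json
import numpy as np
from sklearn.metrics import roc_auc_score

# Hypothetical lines in the "<impression id> <JSON list>" format parsed by evaluate.py
truth_line = "1 [1,0,0]"       # click labels of the three candidates
prediction_line = "1 [1,3,2]"  # predicted ranks (1 = most likely to be clicked)

_, labels = truth_line.strip('\n').split()
_, ranks = prediction_line.strip('\n').split()
labels, ranks = json.loads(labels), json.loads(ranks)

y_true = np.array(labels, dtype='float32')
y_score = [1.0 / r for r in ranks]     # same rank-to-score mapping as evaluate.py
print(roc_auc_score(y_true, y_score))  # 1.0 for this toy impression
```

To score a full submission, `evaluate.py` expects an input directory containing `ref/truth.txt` and `res/prediction.txt`, and writes the averaged metrics to `scores.txt` in the output directory (`python evaluate.py <input_dir> <output_dir>`).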
--------------------------------------------------------------------------------
/crawler/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = msn.settings

[deploy]
#url = http://localhost:6800/
project = msn
--------------------------------------------------------------------------------
/crawler/msn/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class NewsItem(scrapy.Item):
    nid = scrapy.Field()
    body = scrapy.Field()
--------------------------------------------------------------------------------
/crawler/msn/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class TutorialPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/crawler/README.md:
--------------------------------------------------------------------------------
# MSN News Crawler

This is a crawler that crawls news features such as body, title, vert and subvert from the MSN news website, given several news URLs.

Built with Scrapy 2.0 and Python 3.7:

`https://docs.scrapy.org/en/latest/intro/tutorial.html`


The conda environment is exported to `environment.yaml`. To create the conda environment, please run:

```bash
conda env create -f environment.yaml
conda activate scrapy
```

Set the environment variable `MIND_NEWS_PATH` to the path of your `news.tsv`:

```bash
export MIND_NEWS_PATH=[your path]
```

Run the crawler:

```bash
scrapy crawl msn -o msn.json
```

The crawled URLs are read into the `start_urls` list of `msn/spiders/msn_spider.py`. You can add more URLs there.

To add more news features, please edit `items.py` and `msn_spider.py`, for example as sketched below.
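A minimal sketch, assuming a hypothetical `title` feature (the `og:title` XPath is an illustrative guess, not something taken from the archived MSN pages):

```python
import scrapy

# items.py: declare the extra field next to nid and body
class NewsItem(scrapy.Item):
    nid = scrapy.Field()
    body = scrapy.Field()
    title = scrapy.Field()  # hypothetical new feature

# msn_spider.py: inside parse(), fill the new field from the response, e.g.
#   item['title'] = response.xpath('//meta[@property="og:title"]/@content').get()
# (adjust the XPath to the actual markup of the pages you crawl)
```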
--------------------------------------------------------------------------------
/crawler/msn/spiders/msn_spider.py:
--------------------------------------------------------------------------------
import scrapy
from urllib.parse import unquote
from ..items import NewsItem
import os
import json

class MSNSpider(scrapy.Spider):
    name = "msn"
    allowed_domains = ["msn.com"]

    # The 6th column of each news.tsv line is the article URL.
    start_urls = []
    with open(os.environ["MIND_NEWS_PATH"], 'r') as f:
        for l in f:
            _, _, _, _, _, url, _, _ = l.strip('\n').split('\t')
            start_urls.append(url)

    # start_urls = [
    #     # ss
    #     "https://mind201910small.blob.core.windows.net/archive/AAGH0ET.html",
    #     # ar
    #     "https://mind201910small.blob.core.windows.net/archive/AABmf2I.html",
    #     # vi
    #     "https://mind201910small.blob.core.windows.net/archive/AAI33em.html"
    # ]

    def __init__(self):
        # doc_type.json maps each news id to its page type:
        # 'ar' (article), 'ss' (slideshow/gallery) or 'vi' (video)
        with open('./doc_type.json', 'r') as f:
            self.doc_type = json.load(f)

        super().__init__()

    def parse(self, response):

        url = unquote(response.url)
        item = NewsItem()
        # parse nid from the url and look up its doc type
        nid_type = self.parse_nid_from_url(item, url)

        # parse body from response
        self.parse_body(response, item, nid_type)

        yield item

    def parse_nid_from_url(self, item, url):
        # e.g. ".../archive/AAGH0ET.html" -> nid "AAGH0ET"
        item['nid'] = url.split('/')[-1].split('.')[-2]
        return self.doc_type[item['nid']]

    def parse_body(self, response, item, nid_type):

        # if metadata contains a description, it could be taken as the first sentence
        # body_desc = response.xpath('//meta[@name="description"]/@content')[0].extract()

        body = []

        # type1: ar-nid
        if nid_type == 'ar':
            body = response.xpath('//p/text()').getall()

        # type2: ss
        elif nid_type == 'ss':
            body = response.xpath('//div[@class="gallery-caption-text"]//text()').getall()

        # type3: vi
        elif nid_type == 'vi':
            body = response.xpath('//div[@class="video-description"]//text()').getall()

        item['body'] = body
--------------------------------------------------------------------------------
/crawler/environment.yaml:
--------------------------------------------------------------------------------
name: scrapy
channels:
  - conda-forge
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - attrs=19.3.0=py_0
  - automat=20.2.0=py_0
  - backcall=0.1.0=py37_0
  - bcrypt=3.1.7=py37h516909a_0
  - ca-certificates=2020.1.1=0
  - certifi=2019.11.28=py37_0
  - cffi=1.14.0=py37hd463f26_0
  - constantly=15.1.0=py_0
  - cryptography=2.8=py37h72c5cf5_1
  - cssselect=1.1.0=py_0
  - decorator=4.4.1=py_0
  - hyperlink=17.3.1=py_0
  - icu=64.2=he1b5a44_1
  - idna=2.9=py_1
  - importlib_metadata=1.5.0=py37_0
  - incremental=17.5.0=py_0
  - ipython=7.12.0=py37h5ca1d4c_0
  - ipython_genutils=0.2.0=py37_0
  - jedi=0.16.0=py37_0
  - ld_impl_linux-64=2.33.1=h53a641e_7
  - libedit=3.1.20181209=hc058e9b_0
  - libffi=3.2.1=hd88cf55_4
  - libgcc-ng=9.1.0=hdf63c60_0
  - libiconv=1.15=h516909a_1005
  - libstdcxx-ng=9.1.0=hdf63c60_0
  - libxml2=2.9.10=hee79883_0
  - libxslt=1.1.33=h31b3aaa_0
  - lxml=4.5.0=py37h7ec2d77_0
  - more-itertools=8.2.0=py_0
  - ncurses=6.2=he6710b0_0
  - openssl=1.1.1d=h7b6447c_4
  - packaging=20.1=py_0
  - parsel=1.5.2=py37_0
  - parso=0.6.1=py_0
  - pexpect=4.8.0=py37_0
  - pickleshare=0.7.5=py37_0
  - pip=20.0.2=py37_1
  - pluggy=0.12.0=py_0
  - prompt_toolkit=3.0.3=py_0
  - protego=0.1.16=py_0
  - ptyprocess=0.6.0=py37_0
  - py=1.8.1=py_0
  - pyasn1=0.4.8=py_0
  - pyasn1-modules=0.2.7=py_0
  - pycparser=2.20=py_0
  - pydispatcher=2.0.5=py_1
  - pygments=2.5.2=py_0
  - pyhamcrest=2.0.2=py_0
  - pyopenssl=19.1.0=py_1
  - pyparsing=2.4.6=py_0
  - pytest=5.3.5=py37_1
  - pytest-runner=5.2=py_0
  - python=3.7.6=h0371630_2
  - queuelib=1.5.0=py37_0
  - readline=7.0=h7b6447c_5
  - scrapy=2.0.0=py37_0
  - service_identity=18.1.0=py_0
  - setuptools=45.2.0=py37_0
  - six=1.14.0=py37_0
  - sqlite=3.31.1=h7b6447c_0
  - tk=8.6.8=hbc83047_0
  - traitlets=4.3.3=py37_0
  - twisted=19.10.0=py37h516909a_0
  - w3lib=1.20.0=py_0
  - wcwidth=0.1.8=py_0
  - wheel=0.34.2=py37_0
  - xz=5.2.4=h14c3975_4
  - zipp=3.1.0=py_0
  - zlib=1.2.11=h7b6447c_3
  - zope.interface=4.7.1=py37h516909a_0
prefix: /home/v-jinyi/.conda/envs/scrapy
--------------------------------------------------------------------------------
/crawler/msn/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for the msn project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'msn_news_spider'

SPIDER_MODULES = ['msn.spiders']
NEWSPIDER_MODULE = 'msn.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'msn (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'msn.middlewares.TutorialSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'msn.middlewares.TutorialDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'msn.pipelines.TutorialPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
REDIRECT_ENABLED = False
--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import sys, os, os.path
import numpy as np
import json
from sklearn.metrics import roc_auc_score

def dcg_score(y_true, y_score, k=10):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)


def ndcg_score(y_true, y_score, k=10):
    best = dcg_score(y_true, y_true, k)
    actual = dcg_score(y_true, y_score, k)
    return actual / best


def mrr_score(y_true, y_score):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order)
    rr_score = y_true / (np.arange(len(y_true)) + 1)
    return np.sum(rr_score) / np.sum(y_true)

def parse_line(l):
    impid, ranks = l.strip('\n').split()
    ranks = json.loads(ranks)
    return impid, ranks

def scoring(truth_f, sub_f):
    aucs = []
    mrrs = []
    ndcg5s = []
    ndcg10s = []

    line_index = 1
    for lt in truth_f:
        ls = sub_f.readline()
        impid, labels = parse_line(lt)

        # ignore masked impressions
        if labels == []:
            continue

        if ls == '':
            # missing line in the submission: default every candidate to rank 1
            sub_impid = impid
            sub_ranks = [1] * len(labels)
        else:
            try:
                sub_impid, sub_ranks = parse_line(ls)
            except Exception:
                raise ValueError("line-{}: Invalid Input Format!".format(line_index))

        if sub_impid != impid:
            raise ValueError("line-{}: Inconsistent Impression Id {} and {}".format(
                line_index,
                sub_impid,
                impid
            ))

        lt_len = float(len(labels))

        y_true = np.array(labels, dtype='float32')
        y_score = []
        for rank in sub_ranks:
            score_rslt = 1. / rank
            if score_rslt < 0 or score_rslt > 1:
                raise ValueError("Line-{}: ranks should be integers from 1 to {}".format(
                    line_index,
                    lt_len
                ))
            y_score.append(score_rslt)

        auc = roc_auc_score(y_true, y_score)
        mrr = mrr_score(y_true, y_score)
        ndcg5 = ndcg_score(y_true, y_score, 5)
        ndcg10 = ndcg_score(y_true, y_score, 10)

        aucs.append(auc)
        mrrs.append(mrr)
        ndcg5s.append(ndcg5)
        ndcg10s.append(ndcg10)

        line_index += 1

    return np.mean(aucs), np.mean(mrrs), np.mean(ndcg5s), np.mean(ndcg10s)


if __name__ == '__main__':
    input_dir = sys.argv[1]
    output_dir = sys.argv[2]

    submit_dir = os.path.join(input_dir, 'res')
    truth_dir = os.path.join(input_dir, 'ref')

    if not os.path.isdir(submit_dir):
        print("%s doesn't exist" % submit_dir)

    if os.path.isdir(submit_dir) and os.path.isdir(truth_dir):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        output_filename = os.path.join(output_dir, 'scores.txt')
        output_file = open(output_filename, 'w')

        truth_file = open(os.path.join(truth_dir, "truth.txt"), 'r')
        submission_answer_file = open(os.path.join(submit_dir, "prediction.txt"), 'r')

        auc, mrr, ndcg, ndcg10 = scoring(truth_file, submission_answer_file)

        output_file.write("AUC:{:.4f}\nMRR:{:.4f}\nnDCG@5:{:.4f}\nnDCG@10:{:.4f}".format(auc, mrr, ndcg, ndcg10))
        output_file.close()
--------------------------------------------------------------------------------
/crawler/msn/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class TutorialSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class TutorialDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------