├── doc
    ├── img1.png
    ├── img2.png
    ├── img3.png
    └── img4.png
├── scholar_server.py
├── README.md
└── scholar_to_rss.py


/doc/img1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/osnsyc/Scholar-to-RSS/HEAD/doc/img1.png


--------------------------------------------------------------------------------
/doc/img2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/osnsyc/Scholar-to-RSS/HEAD/doc/img2.png


--------------------------------------------------------------------------------
/doc/img3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/osnsyc/Scholar-to-RSS/HEAD/doc/img3.png


--------------------------------------------------------------------------------
/doc/img4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/osnsyc/Scholar-to-RSS/HEAD/doc/img4.png


--------------------------------------------------------------------------------
/scholar_server.py:
--------------------------------------------------------------------------------
 1 | import http.server
 2 | import socketserver
 3 | 
 4 | port = 9278
 5 | 
 6 | Handler = http.server.SimpleHTTPRequestHandler
 7 | 
 8 | with socketserver.TCPServer(("", port), Handler) as httpd:
 9 |     print(f"Serving at port {port}")
10 |     httpd.serve_forever()
11 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## Description
 2 | 
 3 | Use Outlook email to receive Google Scholar Alert emails and parse them into an RSS feed.
 4 | 
 5 | ## Changelog
 6 | 
 7 | - `2024-12-30`: You have to register for an [Azure](https://azure.microsoft.com/) account (free) to get started with Microsoft Entra ID.
 8 | 
 9 | ## Create an Azure account
10 | 
11 | - https://azure.microsoft.com/
12 | 
13 | ## MS_GRAPH token (Outlook)
14 | 
 15 | Create an app: https://entra.microsoft.com/#home
16 | 
17 | ![img1](./doc/img1.png)
 18 | ![img2](./doc/img2.png)
 19 | ![img3](./doc/img3.png)
 20 | ![img4](./doc/img4.png)
21 | 
22 | ## Git clone
23 | 
24 | ```shell
25 | git clone https://github.com/osnsyc/Scholar-to-RSS.git
26 | cd Scholar-to-RSS
27 | ```
28 | 
 29 | ```shell
30 | pip install beautifulsoup4 msal
31 | ```
32 | 
33 | ## Config 
34 | 
35 | ```ini
36 | # config.ini
 37 | [Config]
38 | APP_ID = 12345678-1234-1234-1234-1234567890
39 | 
40 | ```
41 | 
42 | ## Mail settings
43 | 
 44 | Set your Outlook address as the alert email in Google Scholar
 45 | 
 46 | **or**
 47 | 
 48 | Set up Gmail to forward Google Scholar alert emails to your Outlook address
49 | 
50 | ## Run
51 | 
 52 | ```shell
 53 | python scholar_to_rss.py
 54 | ```
 55 | On the first run, authorize the app with Microsoft Graph: open https://microsoft.com/devicelogin and enter the `user_code` printed in the terminal.
 56 | 
 57 | ```shell
 58 | python scholar_server.py
 59 | ```
60 | 
61 | ## RSS Subscription
62 | 
63 | `http://YOUR_HOST:9278/scholar.xml`


--------------------------------------------------------------------------------
/scholar_to_rss.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | import os
  4 | import time
  5 | import json
  6 | import requests
  7 | import configparser
  8 | from datetime import datetime, timedelta
  9 | from bs4 import BeautifulSoup
 10 | import msal
 11 | 
 12 | class Scholar2RSS:
 13 | 
 14 |     def __init__(self, APP_ID):
 15 |         self.APP_ID = APP_ID
 16 |         self.GRAPH_ENDPOINT = 'https://graph.microsoft.com/v1.0'
 17 |         self.SCOPES = ['Mail.ReadWrite']
 18 |         self.MS_API_TOKEN = './ms_graph_api_token.json'
 19 |         self.XML_PATH = './scholar.xml'
 20 | 
 21 |     def convert_to_timestamp(self, date_string):
 22 |         date_obj = datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %z")
 23 |         return date_obj.timestamp()
 24 | 
 25 |     def generate_access_token(self):
 26 |         # Save Session Token as a token file
 27 |         access_token_cache = msal.SerializableTokenCache()
 28 | 
 29 |         # read the token file
 30 |         if os.path.exists(self.MS_API_TOKEN):
 31 |             access_token_cache.deserialize(open(self.MS_API_TOKEN, "r").read())
 32 |             token_detail = json.load(open(self.MS_API_TOKEN,))
 33 |             token_detail_key = list(token_detail['AccessToken'].keys())[0]
 34 |             token_expiration = datetime.fromtimestamp(int(token_detail['AccessToken'][token_detail_key]['expires_on']))
 35 |             # if datetime.now() > token_expiration:
 36 |             #     os.remove(self.MS_API_TOKEN)
 37 |             #     access_token_cache = msal.SerializableTokenCache()
 38 | 
 39 |         # assign a SerializableTokenCache object to the client instance
 40 |         client = msal.PublicClientApplication(client_id=self.APP_ID, token_cache=access_token_cache)
 41 | 
 42 |         accounts = client.get_accounts()
 43 |         if accounts:
 44 |             # load the session
 45 |             token_response = client.acquire_token_silent(self.SCOPES, accounts[0])
 46 |         else:
 47 |             # authetnicate your accoutn as usual
 48 |             flow = client.initiate_device_flow(scopes=self.SCOPES)
 49 |             print('Open https://microsoft.com/devicelogin, user_code: ' + flow['user_code'])
 50 |             token_response = client.acquire_token_by_device_flow(flow)
 51 | 
 52 |         with open(self.MS_API_TOKEN, 'w') as _f:
 53 |             _f.write(access_token_cache.serialize())
 54 | 
 55 |         return token_response
 56 | 
 57 |     def get_mail(self):
 58 |         endpoint = self.GRAPH_ENDPOINT + '/me/messages'
 59 |         access_token = self.generate_access_token()
 60 |         headers = {'Authorization': 'Bearer ' + access_token['access_token']}
 61 |         request_body = {
 62 |             '$select': 'sender, subject, body',
 63 |             'filter': 'isRead eq false and from/emailAddress/address eq \'scholaralerts-noreply@google.com\''
 64 |         }
 65 | 
 66 |         response = requests.get(endpoint, headers=headers, params=request_body)
 67 |         if response.status_code == 200:
 68 |             content = json.loads(response.text)
 69 |             if content['value']:
 70 |                 return content['value']
 71 |             else:
 72 |                 print('No new mail.')
 73 |                 return None
 74 |         else:
 75 |             print(response.text)
 76 |             return None
 77 |         
 78 |     def mark_mail_as_read(self, id):
 79 |         access_token = self.generate_access_token()
 80 |         headers = {
 81 |             'Authorization': 'Bearer ' + access_token['access_token'],
 82 |             'Content-Type': 'application/json',
 83 |         }
 84 |         request_body = {'isRead': True}
 85 | 
 86 |         endpoint = self.GRAPH_ENDPOINT + '/me/messages/' + id
 87 |         response = requests.patch(endpoint, headers=headers, data=json.dumps(request_body))
 88 |         if response.status_code != 200:
 89 |             print(response.text)
 90 | 
 91 |     def update_xml_file(self, mail):
 92 |         # Create xml file
 93 |         if not os.path.exists(self.XML_PATH):
 94 |             content = '<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0"><channel>'\
 95 |                     + '<title><![CDATA[' + "Google Scholar Alert" + ']]></title>' \
 96 |                     + '<link>' + 'https://scholar.google.com/' + '</link>'\
 97 |                     + '<description><![CDATA[' + "Google Scholar" + ']]></description>' \
 98 |                     + '<language>zh-cn</language>' \
 99 |                     + '</channel></rss>'
100 |             with open(self.XML_PATH, 'w') as file:
101 |                 file.write(content)
102 | 
103 |         # Read xml file
104 |         xmlContent = ''
105 |         with open(self.XML_PATH, 'r') as file:
106 |             xmlContent = file.read()
107 |         xmlContent = BeautifulSoup(xmlContent.replace('link>','temptlink>'),'lxml')
108 | 
109 |         # parse mail content
110 |         mail_title = mail['subject']
111 | 
112 |         mail_content = BeautifulSoup(mail["body"]["content"], 'html.parser')
113 |         links, titles, authors, abstracts = [], [], [], []
114 |         subjects = mail_content.find_all('a', class_=lambda value: value and 'alrt_title' in value)
115 |         for subject in subjects:
116 |             links.append(subject.get('href'))
117 |             titles.append(subject.get_text(strip=True))
118 |         subjects = mail_content.select('div[style*="color:#006621"]')
119 |         for subject in subjects:
120 |             authors.append(subject.get_text(strip=True))
121 |         subjects = mail_content.find_all('div', class_=lambda value: value and 'alrt_sni' in value)
122 |         for subject in subjects:
123 |             abstracts.append(subject.get_text(strip=True))
124 | 
125 |         # Concreate new content
126 |         for index, link in enumerate(links):
127 |             elementContent = '<title><![CDATA[' + mail_title + '. ' + titles[index] + ']]></title>' \
128 |                             + '<description><![CDATA[' + "<p><b>" + authors[index] + "</b></p>" \
129 |                             + "<p>" + "<b>Abstract:</b>" + abstracts[index] + "</p>" + ']]></description>'\
130 |                             + '<temptlink>' + links[index] + '</temptlink>'\
131 |                             + '<pubDate>' + time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime(int(time.time()))) + '</pubDate>'
132 |             
133 |             parent = xmlContent.select_one('channel')
134 |             new_item = xmlContent.new_tag('item')
135 |             new_item.string = elementContent
136 |             parent.append(new_item)
137 |                             
138 |         xmlContent = BeautifulSoup(str(xmlContent.body.contents[0]).replace('&amp;','&').replace('&lt;','<').replace('&gt;','>'),'lxml')
139 |         items = xmlContent.find_all('item')
140 |         # sort <item> by pubDate
141 |         sorted_items = sorted(items, key=lambda x: self.convert_to_timestamp(x.select_one('pubDate').text), reverse=True)
142 | 
143 |         # remove <item> older than 2 weeks if there are more than 100 <item>
144 |         if len(sorted_items) > 100:
145 |             # get timestamp of 2 weeks ago
146 |             two_week_ago = datetime.now() - timedelta(days=14)
147 |             two_week_ago_timestamp = time.mktime(two_week_ago.timetuple())
148 |             for item in sorted_items.copy():
149 |                 pub_date = item.select_one('pubDate').text
150 |                 pub_date_timestamp = self.convert_to_timestamp(pub_date)
151 |                 if pub_date_timestamp < two_week_ago_timestamp:
152 |                     sorted_items.remove(item)
153 | 
154 |         # remove all <item> in xmlContent
155 |         items_in_xmlContent = xmlContent.find_all('item')
156 |         for item in items_in_xmlContent:
157 |             item.extract()
158 |         
159 |         # append sorted <item> to xmlContent
160 |         parent_element = xmlContent.find('channel')
161 |         for sorted_item in sorted_items:
162 |             parent_element.append(sorted_item)
163 | 
164 |         with open(self.XML_PATH, 'w') as f:
165 |             f.write(str(xmlContent.body.contents[0]).replace('&lt;','<').replace('&gt;','>').replace('temptlink','link'))
166 |         
def load_app_id(config):
    """Return APP_ID from a parsed ``config.ini``.

    The ``[Config]`` section is preferred; ``[Outlook]`` is also accepted
    because the project README documents that section name.

    Raises:
        configparser.NoOptionError: if neither section defines APP_ID.
    """
    for section in ('Config', 'Outlook'):
        if config.has_option(section, 'APP_ID'):
            return config.get(section, 'APP_ID')
    raise configparser.NoOptionError('APP_ID', 'Config')


def main():
    """Fetch unread alert mails, add them to the feed and mark them read."""
    config = configparser.ConfigParser()
    config.read('./config.ini')
    APP_ID = load_app_id(config)

    scholar2rss = Scholar2RSS(APP_ID)

    mails = scholar2rss.get_mail()
    if not mails:
        return

    # Mark each mail as read right after it is written to the feed, so a
    # failure on a later mail cannot duplicate earlier ones on the next run.
    for mail in mails:
        scholar2rss.update_xml_file(mail)
        scholar2rss.mark_mail_as_read(mail['id'])


if __name__ == '__main__':
    main()


--------------------------------------------------------------------------------