"""Serve the generated Google Scholar RSS feed over HTTP.

Run this from the directory that contains ``scholar.xml``; the feed is then
available at ``http://YOUR_HOST:9278/scholar.xml``.
"""
import http.server
import socketserver

port = 9278

# The only URL path this server answers.  ``SimpleHTTPRequestHandler`` serves
# the whole working directory by default, and scholar_to_rss.py writes its
# Microsoft Graph token cache (``./ms_graph_api_token.json``) and reads
# ``./config.ini`` relative to its own working directory.
# NOTE(review): assumes both scripts are started from the same directory, as
# the README's run instructions suggest.
FEED_PATH = "/scholar.xml"


def is_feed_request(request_path):
    """Return True when *request_path* addresses the RSS feed file.

    Any query string or fragment is ignored, so ``/scholar.xml?x=1`` still
    matches.  Every other path (including ``/``) is rejected.
    """
    path_only = request_path.split("?", 1)[0].split("#", 1)[0]
    return path_only == FEED_PATH


class Handler(http.server.SimpleHTTPRequestHandler):
    """Static file handler that exposes nothing except the feed."""

    def send_head(self):
        """Answer GET/HEAD for the feed; reply 404 to everything else."""
        # do_GET and do_HEAD both go through send_head, so one check
        # covers both verbs and also disables directory listings.
        if not is_feed_request(self.path):
            self.send_error(404, "File not found")
            return None
        return super().send_head()


class FeedServer(socketserver.TCPServer):
    """TCP server that can rebind its port right after a restart."""

    allow_reuse_address = True


def main():
    """Serve the feed until the process is interrupted."""
    with FeedServer(("", port), Handler) as httpd:
        print(f"Serving at port {port}")
        httpd.serve_forever()


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Convert unread Google Scholar alert mails in Outlook into an RSS feed.

The script reads ``APP_ID`` from ``./config.ini``, fetches unread alert mails
through Microsoft Graph, appends one feed item per paper to ``./scholar.xml``
and finally marks the processed mails as read.
"""
import configparser
import html
import json
import os
import time
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta

# Third-party packages (msal, requests, bs4) are imported inside the methods
# that use them, so the feed and configuration helpers stay importable when
# only the standard library is available.


class Scholar2RSS:
    """Fetch Google Scholar alert mails and maintain an RSS 2.0 feed file."""

    # Format used for <pubDate>; convert_to_timestamp parses the same format.
    PUB_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %z"
    # Sender address used to select alert mails.
    ALERT_SENDER = 'scholaralerts-noreply@google.com'
    # config.ini sections searched for APP_ID, in order.  'Config' is what
    # this script has always read; 'Outlook' is what the README shows.
    CONFIG_SECTIONS = ('Config', 'Outlook')
    # Seconds to wait for Microsoft Graph before giving up on a request.
    REQUEST_TIMEOUT = 30
    # Items older than MAX_ITEM_AGE are dropped once the feed holds more
    # than MAX_ITEMS entries.
    MAX_ITEMS = 100
    MAX_ITEM_AGE = timedelta(days=14)

    def __init__(self, APP_ID):
        """Store the application (client) id and the fixed paths/endpoints."""
        self.APP_ID = APP_ID
        self.GRAPH_ENDPOINT = 'https://graph.microsoft.com/v1.0'
        self.SCOPES = ['Mail.ReadWrite']
        self.MS_API_TOKEN = './ms_graph_api_token.json'
        self.XML_PATH = './scholar.xml'

    @classmethod
    def from_config(cls, path='./config.ini'):
        """Build an instance from the ``APP_ID`` stored in an ini file.

        The option is looked up in each section of ``CONFIG_SECTIONS``.

        Raises:
            configparser.NoSectionError / configparser.NoOptionError: when
                no searched section provides ``APP_ID``.
        """
        config = configparser.ConfigParser()
        config.read(path)
        for section in cls.CONFIG_SECTIONS:
            if config.has_option(section, 'APP_ID'):
                return cls(config.get(section, 'APP_ID'))
        # Nothing found: fail exactly like the plain lookup would.
        return cls(config.get(cls.CONFIG_SECTIONS[0], 'APP_ID'))

    def convert_to_timestamp(self, date_string):
        """Return the POSIX timestamp for a ``PUB_DATE_FORMAT`` date string."""
        date_obj = datetime.strptime(date_string, self.PUB_DATE_FORMAT)
        return date_obj.timestamp()

    def generate_access_token(self):
        """Return an MSAL token response containing ``access_token``.

        A cached account is refreshed silently when possible; otherwise the
        device-code flow is started and the user code is printed.  The token
        cache is written back to ``self.MS_API_TOKEN`` afterwards.

        Raises:
            RuntimeError: when the device flow cannot be started or no
                access token is obtained.
        """
        import msal

        token_cache = msal.SerializableTokenCache()
        if os.path.exists(self.MS_API_TOKEN):
            with open(self.MS_API_TOKEN, 'r', encoding='utf-8') as cache_file:
                token_cache.deserialize(cache_file.read())

        client = msal.PublicClientApplication(
            client_id=self.APP_ID, token_cache=token_cache)

        token_response = None
        accounts = client.get_accounts()
        if accounts:
            # Returns None when the cached tokens can no longer be used.
            token_response = client.acquire_token_silent(
                self.SCOPES, account=accounts[0])

        if not token_response:
            # No usable cached session: authenticate interactively.
            flow = client.initiate_device_flow(scopes=self.SCOPES)
            if 'user_code' not in flow:
                raise RuntimeError(
                    'Could not start device flow: ' + json.dumps(flow))
            print('Open https://microsoft.com/devicelogin, user_code: '
                  + flow['user_code'])
            token_response = client.acquire_token_by_device_flow(flow)

        with open(self.MS_API_TOKEN, 'w', encoding='utf-8') as cache_file:
            cache_file.write(token_cache.serialize())

        if 'access_token' not in token_response:
            raise RuntimeError(
                'Could not obtain an access token: '
                + str(token_response.get('error_description', token_response)))
        return token_response

    def get_mail(self):
        """Return the unread alert mails, or ``None`` when there are none.

        ``None`` is also returned (after printing the response body) when
        Microsoft Graph does not answer with HTTP 200.
        """
        import requests

        endpoint = self.GRAPH_ENDPOINT + '/me/messages'
        access_token = self.generate_access_token()
        headers = {'Authorization': 'Bearer ' + access_token['access_token']}
        # The '$' prefix is spelled out on both options because the v1.0
        # endpoint only treats it as optional for a subset of APIs.
        query = {
            '$select': 'sender,subject,body',
            '$filter': "isRead eq false and from/emailAddress/address eq '"
                       + self.ALERT_SENDER + "'",
        }

        response = requests.get(endpoint, headers=headers, params=query,
                                timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(response.text)
            return None

        messages = response.json().get('value')
        if not messages:
            print('No new mail.')
            return None
        return messages

    def mark_mail_as_read(self, id):
        """Set ``isRead`` on the message *id*; print the body on failure."""
        import requests

        access_token = self.generate_access_token()
        headers = {
            'Authorization': 'Bearer ' + access_token['access_token'],
            'Content-Type': 'application/json',
        }
        request_body = {'isRead': True}

        endpoint = self.GRAPH_ENDPOINT + '/me/messages/' + id
        response = requests.patch(endpoint, headers=headers,
                                  data=json.dumps(request_body),
                                  timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(response.text)

    def update_xml_file(self, mail):
        """Append every paper found in *mail* to the feed file.

        *mail* is a message dict as returned by ``get_mail``; its
        ``subject`` and ``body.content`` fields are read.
        """
        entries = self._parse_alert(mail['body']['content'])
        self._store_entries(mail['subject'], entries)

    def _parse_alert(self, body_html):
        """Extract ``(link, title, author, abstract)`` tuples from a mail body."""
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(body_html, 'html.parser')
        anchors = soup.find_all(
            'a', class_=lambda value: value and 'alrt_title' in value)
        authors = [
            node.get_text(strip=True)
            for node in soup.select('div[style*="color:#006621"]')
        ]
        abstracts = [
            node.get_text(strip=True)
            for node in soup.find_all(
                'div', class_=lambda value: value and 'alrt_sni' in value)
        ]

        def pick(values, index):
            # An alert may lack an author or snippet block for a paper.
            return values[index] if index < len(values) else ''

        return [
            (anchor.get('href') or '', anchor.get_text(strip=True),
             pick(authors, index), pick(abstracts, index))
            for index, anchor in enumerate(anchors)
        ]

    def _load_feed(self):
        """Return the feed's ``<rss>`` root, creating a skeleton if needed.

        An existing file that is not well-formed XML, or has no
        ``<channel>``, is moved to ``<XML_PATH>.bak`` and a new feed is
        started.
        """
        if os.path.exists(self.XML_PATH):
            try:
                root = ET.parse(self.XML_PATH).getroot()
            except ET.ParseError:
                root = None
            if root is not None and root.find('channel') is not None:
                return root
            backup_path = self.XML_PATH + '.bak'
            os.replace(self.XML_PATH, backup_path)
            print('Could not parse ' + self.XML_PATH
                  + '; moved it to ' + backup_path)

        root = ET.Element('rss', {'version': '2.0'})
        channel = ET.SubElement(root, 'channel')
        ET.SubElement(channel, 'title').text = 'Google Scholar Alert'
        ET.SubElement(channel, 'link').text = 'https://scholar.google.com/'
        ET.SubElement(channel, 'description').text = 'Google Scholar Alert'
        ET.SubElement(channel, 'language').text = 'zh-cn'
        return root

    def _item_timestamp(self, item):
        """Return the item's pubDate as a timestamp, or 0.0 if unusable."""
        try:
            return self.convert_to_timestamp(item.findtext('pubDate'))
        except (TypeError, ValueError):
            return 0.0

    def _store_entries(self, mail_title, entries):
        """Add *entries* to the feed, then sort, prune and rewrite the file."""
        root = self._load_feed()
        channel = root.find('channel')
        published = time.strftime(self.PUB_DATE_FORMAT, time.localtime())

        for link, title, author, abstract in entries:
            item = ET.SubElement(channel, 'item')
            ET.SubElement(item, 'title').text = mail_title + '. ' + title
            # The description is HTML, so the plain-text parts are escaped
            # for HTML here; ElementTree escapes the result for XML.
            ET.SubElement(item, 'description').text = (
                html.escape(author) + '<br><br>'
                + 'Abstract:' + html.escape(abstract) + '<br><br>')
            ET.SubElement(item, 'link').text = link
            ET.SubElement(item, 'pubDate').text = published

        # Newest first.
        items = sorted(channel.findall('item'),
                       key=self._item_timestamp, reverse=True)

        # Only prune by age once the feed has grown past MAX_ITEMS.
        if len(items) > self.MAX_ITEMS:
            cutoff = (datetime.now() - self.MAX_ITEM_AGE).timestamp()
            items = [item for item in items
                     if self._item_timestamp(item) >= cutoff]

        for stale in channel.findall('item'):
            channel.remove(stale)
        channel.extend(items)

        ET.ElementTree(root).write(
            self.XML_PATH, encoding='utf-8', xml_declaration=True)


def main():
    """Fetch unread alerts, add them to the feed and mark them as read."""
    scholar2rss = Scholar2RSS.from_config('./config.ini')

    mails = scholar2rss.get_mail()
    if not mails:
        return

    # Extract content from mails
    for mail in mails:
        scholar2rss.update_xml_file(mail)

    # Mark as read only after every mail has been written to the feed.
    for mail in mails:
        scholar2rss.mark_mail_as_read(mail['id'])


if __name__ == '__main__':
    main()