├── README.md ├── autoliterature ├── __init__.py ├── arxiv.py ├── autoliter.py ├── crossref.py ├── downloads.py ├── medbiorxiv.py ├── pdfs.py └── utils.py ├── doc └── autolter_example.ipynb ├── requirements.txt └── setup.py /README.md: -------------------------------------------------------------------------------- 1 | # autoLiterature 2 | **autoLiterature** is a Python-based command-line tool for automatic literature management. Inspired by [Mu Li](https://www.bilibili.com/video/BV1nA41157y4). 3 | 4 | 5 | **Recognition rules:** 6 | - Markers of the form `- {xxx}` are recognized automatically. 7 | - When a note file contains `- {paper_id}`, the paper's metadata is downloaded, **but not the PDF**. 8 | - When a note file contains `- {{paper_id}}`, both the paper's metadata and its PDF are downloaded. 9 | 10 | Note: `paper_id` can be the `doi` of a published paper, or the `arxiv_id`, `biorxiv_id`, or `medrxiv_id` of a preprint. 11 | 12 | ## Installation 13 | 1. Install with pip 14 | ```bash 15 | pip install autoliter 16 | # or 17 | pip3 install autoliter 18 | ``` 19 | 20 | 2. Install from source 21 | ```bash 22 | git clone https://github.com/WilmerWang/autoLiterature.git 23 | cd autoLiterature 24 | python setup.py install 25 | ``` 26 | 27 | ### Command-line options 28 | ```bash 29 | autoliter 30 | 31 | optional arguments: 32 | -h, --help show this help message and exit 33 | -i INPUT, --input INPUT 34 | The path to the note file or note file folder. 35 | -o OUTPUT, --output OUTPUT 36 | Folder path to save paper pdfs and images. NOTE: MUST BE FOLDER 37 | -p PROXY, --proxy PROXY 38 | The proxy. e.g. 127.0.0.1:1080 39 | -d, --delete Delete unreferenced attachments in notes. Use with caution, 40 | when used, -i must be a folder path including all notes 41 | -m MIGRATION, --migration MIGRATION 42 | the pdf folder path you want to reconnect to 43 | ``` 44 | 45 | ## Usage 46 | ### Basic usage 47 | Assume `input` is the folder containing the literature notes (md files) and `output` is the folder where PDFs will be saved. 48 | 49 | ```bash 50 | # Update all md files under the input folder 51 | autoliter -i input -o output 52 | 53 | # Update only the input/example.md file 54 | autoliter -i input/example.md -o output 55 | 56 | # -d is optional; when -i is a folder path, -d deletes PDFs in the output folder that are not referenced by any note 57 | autoliter -i input -o output -d 58 | ``` 59 | 60 | ### Migrating notes and PDF files 61 | When the notes or the PDF folder are moved, the PDF links inside the notes may become invalid. Use `-m` to re-link the PDF files to the notes. 62 | 63 | ```bash 64 | # Update all md files under the input folder 65 | autoliter -i input -m movedPDFs/ 66 | 67 | # Update only the input/example.md file 68 | autoliter -i input/example.md -m movedPDFs/ 69 | ``` 70 | 71 | For more details, see the [jupyter note](doc/autolter_example.ipynb) locally, or browse the doc folder on [github](https://github.com/WilmerWang/autoLiterature.git). 72 | 73 | ## License 74 | MIT -------------------------------------------------------------------------------- /autoliterature/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilmerwang/autoLiterature/22d0983b2693c8fc878308dab6358282e1023c58/autoliterature/__init__.py -------------------------------------------------------------------------------- /autoliterature/arxiv.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import requests 3 | from urllib.request import ProxyHandler 4 | import feedparser 5 | try: 6 | from urllib import quote 7 | except ImportError: 8 | from urllib.parse import quote 9 | from unidecode import unidecode 10 | 11 | from .crossref import crossrefInfo 12 | 13 | 14 | logging.basicConfig() 15 | logger = logging.getLogger('arxiv') 16 | logger.setLevel(logging.DEBUG) 17 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 18 | 19 | class arxivInfo(object): 20 | def __init__(self): 21 | self.base_url = "http://export.arxiv.org/api/query" 22 | 23 | def set_proxy_handler(self, proxy): 24 | """set proxy handler 25 | 26 |
Args: 27 | proxy (str): The proxy address, e.g. 127.0.0.1:1123 28 | 29 | Returns: 30 | A proxy handler object. 31 | """ 32 | proxy_handler = ProxyHandler({"http": f"http://{proxy}", 33 | "https": f"https://{proxy}"}) 34 | return proxy_handler 35 | 36 | 37 | def extract_json_info(self, item): 38 | """Extract bib json information from requests.get().json() 39 | 40 | Args: 41 | item (json object): obtained by requests.get().json() 42 | 43 | Returns: 44 | A dict containing the paper information. 45 | """ 46 | paper_url = item.link 47 | title = item.title 48 | journal = "arxiv" 49 | published = item.published.split("-") 50 | if len(published) > 1: 51 | year = published[0] 52 | else: 53 | year = ' ' 54 | 55 | authors = item.authors 56 | if len(authors) > 0: 57 | first_author = authors[0]["name"].split(" ") 58 | authors = " and ".join([author["name"] for author in authors]) 59 | else: 60 | first_author = authors 61 | authors = authors 62 | 63 | bib_dict = { 64 | "title": title, 65 | "author": authors, 66 | "journal": journal, 67 | "year": year, 68 | "url": paper_url, 69 | "pdf_link": item.link.replace("abs", "pdf")+".pdf", 70 | "cited_count": None 71 | } 72 | 73 | return bib_dict 74 | 75 | 76 | def get_info_by_arxivid(self, arxivId, handler=False): 77 | """Get the meta information by the given paper arxiv_id. 78 | 79 | Args: 80 | arxivId (str): The arxiv id 81 | handler (handler object): use proxy 82 | 83 | Returns: 84 | A dict containing the paper information. 85 | { 86 | "title": xxx, 87 | "author": xxx, 88 | "journal": xxx, 89 | etc 90 | } 91 | OR 92 | None 93 | """ 94 | 95 | params = "?search_query=id:"+quote(unidecode(arxivId)) 96 | 97 | try: 98 | if handler: 99 | result = feedparser.parse(self.base_url + params, handlers=[handler]) 100 | else: 101 | result = feedparser.parse(self.base_url + params) 102 | items = result.entries 103 | 104 | item = items[0] 105 | if "arxiv_doi" in item: 106 | doi = item["arxiv_doi"] 107 | 108 | crossref_info = crossrefInfo() 109 | if handler: 110 | crossref_info.set_proxy(proxy=handler.proxies["http"].split('//')[-1]) 111 | return crossref_info.get_info_by_doi(doi) 112 | else: 113 | return self.extract_json_info(item) 114 | except: 115 | logger.error("Failed to fetch info for arxiv id: {}".format(arxivId)) 116 | 117 | 118 | def get_info_by_title(self, title, field='ti'): 119 | """Get the meta information by the given paper title. 120 | 121 | Args: 122 | title (str): The paper title 123 | 124 | Returns: 125 | A dict containing the paper information.
126 | { 127 | "title": xxx, 128 | "author": xxx, 129 | "journal": xxx, 130 | etc 131 | } 132 | OR 133 | None 134 | OR 135 | A list [{}, {}, {}] 136 | """ 137 | params = "?search_query="+field+":"+quote(unidecode(title)) 138 | url = self.base_url + params 139 | try: 140 | result = feedparser.parse(url) 141 | items = result.entries 142 | print(len(items)) 143 | 144 | for i, item in enumerate(items): 145 | 146 | title_item = item.title 147 | try: 148 | title_item = title_item.decode("utf-8") 149 | except: 150 | pass 151 | 152 | item.title = title_item 153 | 154 | if title_item.lower() == title.lower(): 155 | return self.extract_json_info(item) 156 | 157 | items[i] = item 158 | 159 | return [self.extract_json_info(it) for it in items] 160 | except: 161 | logger.error("Failed to fetch info for title: {}".format(title)) 162 | 163 | 164 | if __name__ == "__main__": 165 | arxivId = "2208.05623" 166 | title = "Heterogeneous Graph Attention Network" 167 | 168 | arxiv_info = arxivInfo() 169 | handler = arxiv_info.set_proxy_handler(proxy="127.0.0.1:1123") 170 | 171 | bib_arxiv = arxiv_info.get_info_by_arxivid(arxivId, handler=handler) 172 | # bib_title = arxiv_info.get_info_by_title(title) 173 | 174 | print(bib_arxiv) 175 | print("\n") 176 | # print(bib_title) -------------------------------------------------------------------------------- /autoliterature/autoliter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import argparse 3 | import os 4 | 5 | from .utils import patternRecognizer, note_modified, get_pdf_paths, get_pdf_paths_from_notes, get_update_content, get_pdf_paths_from_notes_dict 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger('AutoLiter') 9 | logger.setLevel(logging.INFO) 10 | 11 | 12 | def set_args(): 13 | parser = argparse.ArgumentParser(description='autoLiterature') 14 | parser.add_argument('-i', '--input', required=True, type=str, default=None, 15 | help="The path to the note file or note file folder.") 16 | parser.add_argument('-o', '--output', type=str, default=None, 17 | help='Folder path to save paper pdfs and images. NOTE: MUST BE FOLDER') 18 | parser.add_argument('-p', '--proxy', type=str, default=None, 19 | help='The proxy. e.g. 127.0.0.1:1080') 20 | parser.add_argument('-d', '--delete', action='store_true', 21 | help='Delete unreferenced attachments in notes.
Use with caution, ' 22 | 'when used, -i must be a folder path including all notes') 23 | parser.add_argument('-m', '--migration', type=str, default=None, 24 | help="the pdf folder path you want to reconnect to") 25 | args = parser.parse_args() 26 | 27 | return args 28 | 29 | def check_args(): 30 | args = set_args() 31 | input_path = args.input 32 | output_path = args.output 33 | delete_bool = args.delete 34 | migration_path = args.migration 35 | proxy = args.proxy 36 | 37 | return input_path, output_path, delete_bool, proxy, migration_path 38 | 39 | 40 | def get_bib_and_pdf(note_file, output_path, proxy, paper_recognizer): 41 | 42 | pdfs_path = output_path 43 | if not os.path.exists(pdfs_path): 44 | os.makedirs(pdfs_path) 45 | 46 | with open(note_file, 'r') as f: 47 | content = f.read() 48 | 49 | m = paper_recognizer.findall(content) 50 | logger.info("需要下载的文献个数 - {}".format(len(m))) 51 | 52 | if not m: 53 | logger.info("未找到需要下载的文献, 文件 {} 未更新.".format(note_file)) 54 | else: 55 | # TODO add pd_online link in note file 56 | replace_dict = get_update_content(m, note_file, pdfs_path, proxy=proxy) 57 | 58 | return replace_dict 59 | 60 | 61 | def file_update(input_path, output_path, proxy, paper_recognizer): 62 | 63 | replace_dict = get_bib_and_pdf(input_path, output_path, 64 | proxy, paper_recognizer) 65 | 66 | if replace_dict: 67 | note_modified(paper_recognizer, input_path, **replace_dict) 68 | 69 | 70 | def main(): 71 | input_path, output_path, delete_bool, proxy, migration_path = check_args() 72 | 73 | if output_path: 74 | paper_recognizer = patternRecognizer(r'- \{.{3,}\}') 75 | 76 | if os.path.isfile(input_path): 77 | logger.info("正在更新文件 {}".format(input_path)) 78 | file_update(input_path, output_path, proxy, paper_recognizer) 79 | 80 | elif os.path.isdir(input_path): 81 | note_paths = [] 82 | for root, _, files in os.walk(input_path): 83 | for file in files: 84 | if file.lower().endswith('md') or file.lower().endswith('markdown'): 85 | note_paths.append(os.path.join(root, file)) 86 | for note_path in note_paths: 87 | logger.info("正在更新文件 {}".format(note_path)) 88 | file_update(note_path, output_path, proxy, paper_recognizer) 89 | else: 90 | logger.info("input path {} is not exists".format(input_path)) 91 | 92 | 93 | # Delete unreferenced attachments 94 | if delete_bool: 95 | if os.path.isfile(input_path): 96 | logger.info("若要删除笔记无关PDF实体, 输入的路径必须是笔记总文件夹!!!请谨慎使用该参数!!!") 97 | else: 98 | pdf_path_recognizer = patternRecognizer(r'\[pdf\]\(.{5,}\.pdf\)') 99 | pdf_paths_in_notes = get_pdf_paths_from_notes(input_path, pdf_path_recognizer) 100 | pdf_paths = get_pdf_paths(output_path) 101 | # TODO mac 和 win 之间路径可能会不同,“/” 和 “\\” 102 | pdf_paths_in_notes = [os.path.abspath(i).replace('\\', '/') for i in pdf_paths_in_notes] 103 | pdf_paths = [os.path.abspath(i).replace('\\', '/') for i in pdf_paths] 104 | 105 | removed_pdf_paths = list(set(pdf_paths) - set(pdf_paths_in_notes)) 106 | try: 107 | for pdf_p in removed_pdf_paths: 108 | os.remove(pdf_p) 109 | except: 110 | pass 111 | 112 | logger.info("已删除 {} 个PDF文件".format(len(removed_pdf_paths))) 113 | 114 | 115 | if migration_path: 116 | pdf_path_recognizer = patternRecognizer(r'\[pdf\]\(.{5,}\.pdf\)') 117 | 118 | pdf_paths = get_pdf_paths(migration_path) 119 | pdf_paths_in_notes = get_pdf_paths_from_notes_dict(input_path, pdf_path_recognizer) 120 | 121 | # match based on paper title 122 | matched_numb = 0 123 | pdf_paths_dict = {os.path.basename(i): i for i in pdf_paths} 124 | for md_file, pdf_paths_ in pdf_paths_in_notes.items(): 125 | 126 | 
pdf_paths_in_notes_dict = {os.path.basename(i): i for i in pdf_paths_} 127 | matched_pdfs = pdf_paths_dict.keys() & pdf_paths_in_notes_dict.keys() 128 | 129 | matched_numb += len(matched_pdfs) 130 | 131 | # os.path.relpath(pdf_path, note_file).split('/',1)[-1] 132 | replace_paths_dict = {} 133 | for matched in matched_pdfs: 134 | replaced_str = os.path.relpath(pdf_paths_dict[matched], md_file).split('/',1)[-1] 135 | replaced_str = "[pdf]({})".format(replaced_str) 136 | ori_str = "[pdf]({})".format(pdf_paths_in_notes_dict[matched]) 137 | replace_paths_dict[ori_str] = replaced_str 138 | 139 | if replace_paths_dict: 140 | note_modified(pdf_path_recognizer, md_file, **replace_paths_dict) 141 | 142 | logger.info("共匹配到 - {} - 个PDF文件".format(matched_numb)) 143 | 144 | 145 | if not output_path and not migration_path: 146 | logger.info("缺少关键参数 -o 或者 -m, 程序未运行, 请使用 -h 查看具体信息") 147 | 148 | 149 | if __name__ == "__main__": 150 | main() -------------------------------------------------------------------------------- /autoliterature/crossref.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import requests 3 | # 4 | # 1. get info by doi 5 | # 2. get info by title 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger('crossref') 9 | logger.setLevel(logging.DEBUG) 10 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 11 | 12 | class crossrefInfo(object): 13 | def __init__(self): 14 | self.sess = requests.Session() 15 | self.sess.headers = HEADERS 16 | self.base_url = "http://api.crossref.org/" 17 | 18 | def set_proxy(self, proxy=None): 19 | """set proxy for session 20 | 21 | Args: 22 | proxy (str): The proxy address, e.g. 127.0.0.1:1123 23 | Returns: 24 | None 25 | """ 26 | if proxy: 27 | self.sess.proxies = { 28 | "http": proxy, 29 | "https": proxy, } 30 | 31 | 32 | def extract_json_info(self, bib): 33 | """Extract bib json information from requests.get().json() 34 | 35 | Args: 36 | bib (json object): obtained by requests.get().json() 37 | 38 | Returns: 39 | A dict containing the paper information. 40 | """ 41 | pub_date = [str(i) for i in bib['published']["date-parts"][0]] 42 | pub_date = '-'.join(pub_date) 43 | 44 | if 'author' in bib.keys(): 45 | authors = ' and '.join([i["family"]+" "+i['given'] for i in bib['author'] if "family" in i.keys() and "given" in i.keys()]) 46 | else: 47 | authors = "No author" 48 | 49 | if 'short-container-title' in bib.keys(): 50 | try: 51 | journal = bib['short-container-title'][0] 52 | except: 53 | journal = "No journal" 54 | else: 55 | try: 56 | journal = bib['container-title'][0] 57 | except: 58 | journal = "No journal" 59 | 60 | bib_dict = { 61 | "title": bib['title'][0], 62 | "author": authors, 63 | "journal": journal, 64 | "year": pub_date, 65 | "url": bib["URL"], 66 | "pdf_link": bib["link"][0]["URL"], 67 | "cited_count": bib["is-referenced-by-count"] 68 | } 69 | 70 | return bib_dict 71 | 72 | 73 | def get_info_by_doi(self, doi): 74 | """Get the meta information by the given paper DOI number. 75 | 76 | Args: 77 | doi (str): The paper DOI number 78 | 79 | Returns: 80 | A dict containing the paper information.
81 | { 82 | "title": xxx, 83 | "author": xxx, 84 | "journal": xxx, 85 | etc 86 | } 87 | OR 88 | None 89 | """ 90 | url = "{}works/{}" 91 | url = url.format(self.base_url, doi) 92 | 93 | try: 94 | r = self.sess.get(url) 95 | 96 | bib = r.json()['message'] 97 | return self.extract_json_info(bib) 98 | 99 | except: 100 | logger.error("DOI: {} is error.".format(doi)) 101 | 102 | 103 | def get_info_by_title(self, title): 104 | """Get the meta information by the given paper title. 105 | 106 | Args: 107 | doi (str): The paper title 108 | 109 | Returns: 110 | A dict containing the paper information. 111 | { 112 | "title": xxx, 113 | "author": xxx, 114 | "journal": xxx, 115 | etc 116 | } 117 | OR 118 | None 119 | OR 120 | A list [{}, {}, {}] 121 | """ 122 | url = self.base_url + "works" 123 | params = {"query.bibliographic": title, "rows": 20} 124 | try: 125 | r = self.sess.get(url, params=params) 126 | items = r.json()["message"]["items"] 127 | 128 | for i, item in enumerate(items): 129 | 130 | title_item = item['title'][0] 131 | try: 132 | title_item = title_item.decode("utf-8") 133 | except: 134 | pass 135 | 136 | item["title"][0] = title_item 137 | 138 | if title_item.lower() == title.lower(): 139 | return self.extract_json_info(item) 140 | 141 | items[i] = item 142 | 143 | return [self.extract_json_info(it) for it in items] 144 | except: 145 | logger.error("Title: {} is error.".format(title)) 146 | 147 | 148 | if __name__ == "__main__": 149 | # doi = "10.1016/j.wneu.2012.11.074" 150 | # doi = "10.1093/cercor/bhac266" 151 | doi = "10.1038/s41467-022-29269-6" 152 | # title = "Heterogeneous Graph Attention Network" 153 | # title = "Learning to Copy Coherent Knowledge for Response Generation" 154 | 155 | crossref_info = crossrefInfo() 156 | crossref_info.set_proxy(proxy="127.0.1:1123") 157 | 158 | bib_doi = crossref_info.get_info_by_doi(doi) 159 | # bib_title = crossref_info.get_info_by_title(title) 160 | 161 | print(bib_doi) 162 | print("\n") 163 | # print(bib_title) 164 | -------------------------------------------------------------------------------- /autoliterature/downloads.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import os 4 | 5 | from .arxiv import arxivInfo 6 | from .crossref import crossrefInfo 7 | from .medbiorxiv import BMxivInfo 8 | from .pdfs import pdfDownload 9 | 10 | # log config 11 | logging.basicConfig() 12 | logger = logging.getLogger('Downloads') 13 | logger.setLevel(logging.INFO) 14 | 15 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 16 | 17 | 18 | 19 | def check_string(re_exp, str): 20 | res = re.match(re_exp, str) 21 | if res: 22 | return True 23 | else: 24 | return False 25 | 26 | def classify(identifier): 27 | """ 28 | Classify the type of paper_id: 29 | arxivId - arxivId 30 | doi - digital object identifier 31 | medbiorxivId - medrxiv or biorxiv id 32 | title - title 33 | """ 34 | if check_string(r'10\.(?!1101)[0-9]{4}/\.*', identifier): 35 | return 'doi' 36 | elif check_string(r'10\.1101/\.*', identifier): 37 | return "medbiorxivId" 38 | elif check_string(r'[0-9]{2}[0-1][0-9]\.[0-9]{3,}.*', identifier) or check_string(r'.*/[0-9]{2}[0-1][0-9]{4}', identifier): 39 | return 'arxivId' 40 | elif check_string(r'[a-zA-Z\d\.-/\s]*', identifier): 41 | return 'title' 42 | else: 43 | return "unrecognized" 44 | 45 | def get_paper_info_from_paperid(paper_id, proxy=None): 46 | id_type = classify(paper_id) 47 | 48 | if id_type == "doi": 49 | downloader = 
crossrefInfo() 50 | if proxy: 51 | downloader.set_proxy(proxy=proxy) 52 | bib_dict = downloader.get_info_by_doi(paper_id) 53 | 54 | elif id_type == "arxivId": 55 | downloader = arxivInfo() 56 | if proxy: 57 | downloader.set_proxy_handler(proxy=proxy) 58 | bib_dict = downloader.get_info_by_arxivid(paper_id) 59 | 60 | elif id_type == "medbiorxivId": 61 | downloader = BMxivInfo() 62 | if proxy: 63 | downloader.set_proxy(proxy=proxy) 64 | bib_dict = downloader.get_info_by_bmrxivid(paper_id) 65 | 66 | elif id_type == "title": 67 | pass 68 | else: 69 | pass 70 | 71 | try: 72 | return bib_dict 73 | except: 74 | pass 75 | 76 | 77 | def get_paper_pdf_from_paperid(paper_id, path, proxy=None, direct_url=None): 78 | pdf_downloader = pdfDownload() 79 | if proxy: 80 | pdf_downloader.set_proxy(proxy=proxy) 81 | 82 | if direct_url: 83 | content = pdf_downloader.get_pdf_from_direct_url(direct_url) 84 | if not content: 85 | content = pdf_downloader.get_pdf_from_sci_hub(paper_id) 86 | else: 87 | content = pdf_downloader.get_pdf_from_sci_hub(paper_id) 88 | 89 | try: 90 | if not os.path.exists(path.rsplit("/", 1)[0]): 91 | os.makedirs(path.rsplit("/", 1)[0]) 92 | pdf_downloader._save(content['pdf'], path) 93 | except: 94 | pass 95 | 96 | 97 | 98 | 99 | if __name__ == "__main__": 100 | doi = "10.1016/j.wneu.2012.11.074" 101 | arxiv_id = "2208.05623" 102 | medbiorxiv_id = "10.1101/2022.07.28.22277637" 103 | undefine_name = "sjsldjfnadijjsl;kjdjf" 104 | 105 | print(get_paper_info_from_paperid(doi)) 106 | print(get_paper_info_from_paperid(arxiv_id)) 107 | print(get_paper_info_from_paperid(medbiorxiv_id)) 108 | print(get_paper_info_from_paperid(undefine_name)) -------------------------------------------------------------------------------- /autoliterature/medbiorxiv.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import requests 3 | from bs4 import BeautifulSoup 4 | 5 | from .crossref import crossrefInfo 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger('biorxiv') 9 | logger.setLevel(logging.DEBUG) 10 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 11 | 12 | class BMxivInfo(object): 13 | def __init__(self): 14 | self.sess = requests.Session() 15 | self.sess.headers = HEADERS 16 | self.base_url = "https://api.biorxiv.org/details/" 17 | self.servers = ["biorxiv", "medrxiv"] 18 | 19 | 20 | def set_proxy(self, proxy=False): 21 | """set proxy for session 22 | 23 | Args: 24 | proxy (str): The proxy adress. e.g 127.0.1:1123 25 | Returns: 26 | None 27 | """ 28 | if proxy: 29 | self.sess.proxies = { 30 | "http": proxy, 31 | "https": proxy, } 32 | 33 | 34 | def extract_json_info(self, item): 35 | """Extract bib json information from requests.get().json() 36 | 37 | Args: 38 | item (json object): obtained by requests.get().json() 39 | 40 | Returns: 41 | A dict containing the paper information. 
42 | """ 43 | paper_url = f"https://www.biorxiv.org/content/{item['doi']}" 44 | title = item["title"] 45 | journal = item["server"] 46 | published = item["date"].split('-') 47 | if len(published) > 1: 48 | year = published[0] 49 | else: 50 | year = ' ' 51 | 52 | authors = item['authors'].split("; ") 53 | if len(authors) > 0: 54 | authors = " and ".join([author for author in authors]) 55 | else: 56 | authors = authors 57 | 58 | bib_dict = { 59 | "title": title, 60 | "author": authors, 61 | "journal": journal, 62 | "year": year, 63 | "url": paper_url, 64 | "pdf_link": f"{paper_url}.full.pdf", 65 | "cited_count": None 66 | } 67 | 68 | return bib_dict 69 | 70 | 71 | def get_info_by_bmrxivid(self, bmrxivid): 72 | """Get the meta information by the given paper biorxiv_id or medrxiv_id. 73 | 74 | Args: 75 | doi (str): The biorxiv or medrxiv Id 76 | 77 | Returns: 78 | A dict containing the paper information. 79 | { 80 | "title": xxx, 81 | "author": xxx, 82 | "journal": xxx, 83 | etc 84 | } 85 | OR 86 | None 87 | """ 88 | urls = [self.base_url + server + "/" + bmrxivid for server in self.servers] 89 | for url in urls: 90 | try: 91 | r = self.sess.get(url) 92 | 93 | bib = r.json()['collection'][-1] 94 | 95 | if "published" in bib.keys() and bib['published'] != "NA": 96 | doi = bib["published"] 97 | print(doi) 98 | crossref_info = crossrefInfo() 99 | if len(self.sess.proxies) > 0: 100 | crossref_info.set_proxy(self.sess.proxies['http'].split('//')[-1]) 101 | return crossref_info.get_info_by_doi(doi) 102 | 103 | return self.extract_json_info(bib) 104 | 105 | except: 106 | logger.error("DOI: {} is error.".format(bmrxivid)) 107 | 108 | 109 | def get_info_by_title(self, title): 110 | """Get the meta information by the given paper title. 111 | 112 | Args: 113 | doi (str): The paper title 114 | 115 | Returns: 116 | A dict containing the paper information. 
117 | { 118 | "title": xxx, 119 | "author": xxx, 120 | "journal": xxx, 121 | etc 122 | } 123 | OR 124 | None 125 | OR 126 | A list [{}, {}, {}] 127 | """ 128 | base_url = "https://www.biorxiv.org/search/{}%20jcode%3Amedrxiv%7C%7Cbiorxiv%20numresults%3A25%20\sort%3Arelevance-rank%20\format_result%3Astandard" 129 | query = title.replace(' ', '%252B') 130 | 131 | url = base_url.format(query) 132 | try: 133 | result = self.sess.get(url) 134 | soup = BeautifulSoup(result.content, "lxml") 135 | soup_items = soup.find_all("div",class_="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix") 136 | 137 | soup_dict = dict() 138 | for sp in soup_items: 139 | key = sp.find("a", class_="highwire-cite-linked-title").span.text 140 | value = sp.find("span", class_="highwire-cite-metadata-doi highwire-cite-metadata").text.split("org/")[-1].split("v")[0].replace(" ", "") 141 | soup_dict[key] = value 142 | 143 | for item_title, item_doi in soup_dict.items(): 144 | try: 145 | item_title = item_title.decode("utf-8") 146 | except: 147 | pass 148 | 149 | if item_title.lower() == title.lower(): 150 | return self.get_info_by_bmrxivid(item_doi) 151 | 152 | return [self.get_info_by_bmrxivid(it) for it in soup_dict.values()] 153 | except: 154 | logger.error("Title: {} is error.".format(title)) 155 | 156 | 157 | if __name__ == "__main__": 158 | 159 | arxivId = "10.1101/2022.07.28.22277637" 160 | # title = "Oxygen restriction induces a viable but non-culturable population in bacteria" 161 | # title = "A molecular atlas of the human postmenopausal fallopian tube and ovary from single-cell RNA and ATAC sequencing" 162 | # title = "Radiographic Assessment of Lung Edema (RALE) Scores are Highly Reproducible and Prognostic of Clinical Outcomes for Inpatients with COVID-19" 163 | # title = "Untargeted metabolomics of COVID-19 patient serum reveals potential prognostic markers of both severity and outcome" 164 | 165 | arxiv_info = BMxivInfo() 166 | arxiv_info.set_proxy(proxy="127.0.1:1123") 167 | 168 | bib_arxiv = arxiv_info.get_info_by_bmrxivid(arxivId) 169 | # bib_title = arxiv_info.get_info_by_title(title) 170 | 171 | print(bib_arxiv) 172 | print("\n") 173 | # print(bib_title) -------------------------------------------------------------------------------- /autoliterature/pdfs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import requests 3 | from urllib.parse import urlunsplit, urlsplit 4 | from bs4 import BeautifulSoup 5 | 6 | logging.basicConfig() 7 | logger = logging.getLogger('PDFs') 8 | logger.setLevel(logging.DEBUG) 9 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 10 | 11 | 12 | class pdfDownload(object): 13 | def __init__(self): 14 | self.sess = requests.Session() 15 | self.sess.headers = HEADERS 16 | 17 | def set_proxy(self, proxy=None): 18 | """set proxy for session 19 | 20 | Args: 21 | proxy (str): The proxy adress. e.g 127.0.1:1123 22 | Returns: 23 | None 24 | """ 25 | if proxy: 26 | self.sess.proxies = { 27 | "http": proxy, 28 | "https": proxy, } 29 | 30 | 31 | def _get_available_scihub_urls(self): 32 | ''' 33 | Finds available scihub urls via https://lovescihub.wordpress.com/ or 34 | https://sci-hub.now.sh/ 35 | ''' 36 | urls = [] 37 | res = self.sess.get('https://lovescihub.wordpress.com/') 38 | s = BeautifulSoup(res.content, 'html.parser') 39 | for a in s.find('div', class_="entry-content").find_all('a', href=True): 40 | if 'sci-hub.' 
in a['href']: 41 | urls.append(a['href']) 42 | return urls 43 | 44 | 45 | def fetch(self, url, auth=None): 46 | '''Fetch pdf 47 | 48 | Args: 49 | url (str): 50 | 51 | Returns: 52 | A dict OR None 53 | ''' 54 | try: 55 | r = self.sess.get(url, auth=auth) 56 | 57 | if r.headers["Content-Type"] != "application/pdf": 58 | logger.info("Failed to fetch pdf with url: {}".format(url)) 59 | else: 60 | return { 61 | 'pdf': r.content, 62 | 'url': url 63 | } 64 | except: 65 | logger.error("Failed to open url: {}".format(url)) 66 | 67 | 68 | def get_pdf_from_direct_url(self, url, auth=None): 69 | return self.fetch(url, auth=auth) 70 | 71 | 72 | def get_pdf_from_sci_hub(self, identifier, auth=None): 73 | '''Fetch pdf from sci-hub based on doi or url 74 | 75 | Args: 76 | identifier (str): DOI or url 77 | auth (tuple): ("user", "passwd") 78 | 79 | Returns: 80 | A dict OR None 81 | ''' 82 | for base_url in self._get_available_scihub_urls(): 83 | r = self.sess.get(base_url + '/' + identifier, auth=auth) 84 | soup = BeautifulSoup(r.content, 'html.parser') 85 | 86 | pdf_div_names = ['iframe', 'embed'] 87 | for pdf_div_name in pdf_div_names: 88 | pdf_div = soup.find(pdf_div_name) 89 | if pdf_div != None: 90 | break 91 | try: 92 | url_parts = urlsplit(pdf_div.get('src')) 93 | if url_parts[1]: 94 | if url_parts[0]: 95 | pdf_url = urlunsplit((url_parts[0], url_parts[1], url_parts[2], '', '')) 96 | else: 97 | pdf_url = urlunsplit(('https', url_parts[1], url_parts[2], '', '')) 98 | else: 99 | pdf_url = urlunsplit(('https', urlsplit(base_url)[1], url_parts[2], '', '')) 100 | 101 | return self.fetch(pdf_url, auth) 102 | except: 103 | pass 104 | 105 | logger.info("Failed to fetch pdf with all sci-hub urls") 106 | 107 | def _save(self, content, path): 108 | with open(path, "wb") as f: 109 | f.write(content) 110 | 111 | 112 | if __name__ == "__main__": 113 | doi = "10.1145/3308558.3313562" 114 | 115 | pdf_download = pdfDownload() 116 | pdf_download.set_proxy("127.0.1:1123") 117 | 118 | pdf_dict = pdf_download.get_pdf_from_sci_hub(doi) 119 | if pdf_dict: 120 | print(pdf_dict['url']) 121 | pdf_download.download(pdf_dict['pdf'] ,"/home/admin/tmp.pdf") 122 | 123 | # pdf_dict2 = pdf_download.get_pdf_from_direct_url("https://arxiv.org/pdf/2208.05419.pdf") 124 | # if pdf_dict2: 125 | # print(pdf_dict2['url']) 126 | # pdf_download.download(pdf_dict2['pdf'] ,"/home/admin/tmp2.pdf") 127 | 128 | -------------------------------------------------------------------------------- /autoliterature/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import re 4 | from tqdm import tqdm 5 | from .downloads import get_paper_info_from_paperid, get_paper_pdf_from_paperid 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger('utils') 9 | logger.setLevel(logging.INFO) 10 | 11 | 12 | class patternRecognizer(object): 13 | def __init__(self, regular_rule): 14 | self.pattern = re.compile(regular_rule) 15 | 16 | def match(self, string): 17 | return self.pattern.match(string) 18 | 19 | def findall(self, string): 20 | return self.pattern.findall(string) 21 | 22 | def multiple_replace(self, content, **replace_dict): 23 | def replace_(value): 24 | match = value.group() 25 | if match in replace_dict.keys(): 26 | return replace_dict[match] 27 | else: 28 | return match+" **Not Correct, Check it**" 29 | 30 | replace_content = self.pattern.sub(replace_, content) 31 | 32 | return replace_content 33 | 34 | 35 | def note_modified(pattern_recog, md_file, **replace_dict): 36 | with 
open(md_file, 'r') as f: 37 | content = f.read() 38 | 39 | replaced_content = pattern_recog.multiple_replace(content, **replace_dict) 40 | 41 | with open(md_file, 'w') as f: 42 | f.write(''.join(replaced_content)) 43 | 44 | 45 | def get_pdf_paths(pdf_root): 46 | pdf_paths = [] 47 | for root, _, files in os.walk(pdf_root): 48 | for file in files: 49 | if file.lower().endswith('.pdf'): 50 | pdf_paths.append(os.path.join(root, file)) 51 | 52 | return pdf_paths 53 | 54 | 55 | def get_pdf_paths_from_notes(md_root, reg): 56 | 57 | md_files = [] 58 | for root, _, files in os.walk(md_root): 59 | for file in files: 60 | if file.lower().endswith('md') or file.lower().endswith('markdown'): 61 | md_files.append(os.path.join(root, file)) 62 | 63 | pdf_paths_from_notes = [] 64 | for md_file in md_files: 65 | with open(md_file, 'r') as f: 66 | content = f.read() 67 | m = reg.findall(content) 68 | m = [i.split("(")[-1].split(')')[0] for i in m] 69 | pdf_paths_from_notes.extend(m) 70 | 71 | return pdf_paths_from_notes 72 | 73 | 74 | def get_pdf_paths_from_notes_dict(md_root, reg): 75 | pdf_paths_from_notes_dict = {} 76 | if os.path.isdir(md_root): 77 | md_files = [] 78 | for root, _, files in os.walk(md_root): 79 | for file in files: 80 | if file.lower().endswith('md') or file.lower().endswith('markdown'): 81 | md_files.append(os.path.join(root, file)) 82 | 83 | for md_file in md_files: 84 | with open(md_file, 'r') as f: 85 | content = f.read() 86 | m = reg.findall(content) 87 | m = [i.split("(")[-1].split(')')[0] for i in m] 88 | pdf_paths_from_notes_dict[md_file] = m 89 | else: 90 | with open(md_root, 'r') as f: 91 | content = f.read() 92 | m = reg.findall(content) 93 | m = [i.split("(")[-1].split(')')[0] for i in m] 94 | pdf_paths_from_notes_dict[md_root] = m 95 | 96 | return pdf_paths_from_notes_dict 97 | 98 | 99 | def classify_identifier(identifier): 100 | """Not need to download PDF file 101 | """ 102 | if identifier.endswith("}}"): 103 | return True 104 | else: 105 | return False 106 | 107 | 108 | def get_update_content(m, note_file, pdfs_path, proxy): 109 | 110 | replace_dict = dict() 111 | for literature in tqdm(m): 112 | pdf = classify_identifier(literature) 113 | 114 | literature_id = literature.split('{')[-1].split('}')[0] 115 | bib = get_paper_info_from_paperid(literature_id, proxy=proxy) 116 | 117 | try: 118 | pdf_name = '_'.join(bib['title'].split(' ')) + '.pdf' 119 | # rep specific symbol with '_' 120 | pdf_name = re.sub(r"[<>:\"/\\|?*\n\r\x00-\x1F\x7F']", '_', pdf_name) 121 | pdf_path = os.path.join(pdfs_path, pdf_name) 122 | 123 | if pdf: 124 | if not os.path.exists(pdf_path): 125 | get_paper_pdf_from_paperid(literature_id, pdf_path, direct_url=bib['pdf_link'], proxy=proxy) 126 | if not os.path.exists(pdf_path): 127 | get_paper_pdf_from_paperid(literature_id, pdf_path, proxy=proxy) 128 | 129 | if os.path.exists(pdf_path): 130 | replaced_literature = "- **{}**. {} et.al. **{}**, **{}**, ([pdf]({}))([link]({})).".format( 131 | bib['title'], bib["author"].split(" and ")[0], bib['journal'], 132 | bib['year'], os.path.relpath(pdf_path, note_file).split('/',1)[-1], 133 | bib['url']) 134 | else: 135 | replaced_literature = "- **{}**. {} et.al. 
**{}**, **{}**, ([link]({})).".format( 136 | bib['title'], bib["author"].split(" and ")[0], bib['journal'], 137 | bib['year'], bib['url'] 138 | ) 139 | replace_dict[literature] = replaced_literature 140 | except: 141 | logger.info("文献下载失败,已经跳过 {}".format(literature_id)) 142 | 143 | return replace_dict -------------------------------------------------------------------------------- /doc/autolter_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ec0def0a", 6 | "metadata": {}, 7 | "source": [ 8 | "# autoliter example\n", 9 | "该文件仅作演示使用,文中所有命令去掉`!`都是`bash`命令。比如`!pip list`在jupyter notebook中 == `pip list`在终端。\n", 10 | "\n", 11 | "## 准备\n", 12 | "### 安装 autoliter" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "id": "96404bec", 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "autoliter 0.1.2 /Users/wilmer/E/code/autoLiterature\r\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "! pip install autoliter \n", 31 | "! pip list | grep autoliter" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "4ef141af", 37 | "metadata": {}, 38 | "source": [ 39 | "### 代理环境\n", 40 | "由于arxiv,sci-hub等网站在国内可能连接不上,所以最好有一个可以连接外网的代理。\n", 41 | "- 如果个人PC用的clash软件,默认的端口是`7890`,那么我们就可以通过`127.0.0.1:7890`来使用autoliter\n", 42 | "- 如果用的其他服务的代理,找到服务器的代理端口,可以通过`服务器Ip:服务器Port`来使用autoliter\n", 43 | "\n", 44 | "测试一下代理环境:\n", 45 | "\n", 46 | "自行确定代理端口以及ip地址,然后浏览器查看能否连通google.com, 确保无误后再接着往下走。\n", 47 | "\n", 48 | "### 文档准备\n", 49 | "在目录下使用`note_example.md`作为笔记,那么`./`文件夹路径就可以认为是所有笔记的总文件夹路径。" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "id": "400ecc5d", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "- {10.1038/s41592-022-01549-5}\r\n", 63 | " - 文献Meta信息\r\n", 64 | "\r\n", 65 | "- {{2208.06175}}\r\n", 66 | " - 文献Meta信息和PDF文件\r\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "def note_init():\n", 72 | " # # 增加一个仅下载文献Meta信息,不下载文献PDF的标识 - {*}\n", 73 | " # !echo -e \"- {10.1038/s41592-022-01549-5}\\n - 文献Meta信息\\n\" > note_example.md\n", 74 | " # # 再追加一个既下载Meta信息,又下载PDF文件的标识 - {{*}}\n", 75 | " # !echo -e \"- {{2208.06175}}\\n - 文献Meta信息和PDF文件\" >> note_example.md\n", 76 | " with open(\"note_example.md\", 'w', encoding=\"UTF-8\") as f:\n", 77 | " f.write(\"- {10.1038/s41592-022-01549-5}\\n - 文献Meta信息\\n\\n\")\n", 78 | " f.write(\"- {{2208.06175}}\\n - 文献Meta信息和PDF文件\\n\")\n", 79 | "\n", 80 | " # 查看一下 note_example.md 中的内容\n", 81 | " !cat note_example.md\n", 82 | "note_init()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "4177cc3f", 88 | "metadata": {}, 89 | "source": [ 90 | "## autoliter使用\n", 91 | "在准备好以上工作之后,就可以演示如何使用`autoliter`了。当然,以上准备工作你也可以用其他方式完成。\n", 92 | "\n", 93 | "### 常规使用\n", 94 | "#### 下载更新文献笔记" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 3, 100 | "id": "74b0cf75", 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "INFO:AutoLiter:正在更新文件 note_example.md\n", 108 | "INFO:AutoLiter:需要下载的文献个数 - 2\n", 109 | "100%|█████████████████████████████████████████████| 2/2 [00:17<00:00, 8.79s/it]\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "# 更新 note_example.md\n", 115 | "!autoliter -i note_example.md -o pdfs -p \"127.0.0.1:7890\"" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | 
"execution_count": 4, 121 | "id": "ca08aef3", 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "- **Functional ultrasound localization microscopy reveals brain-wide neurovascular activity on a microscopic scale**. Renaudin Noémi et.al. **Nat Methods**, **2022-8**, ([link](http://dx.doi.org/10.1038/s41592-022-01549-5)).\r\n", 129 | " - 文献Meta信息\r\n", 130 | "\r\n", 131 | "- **The Weighting Game: Evaluating Quality of Explainability Methods**. Lassi Raatikainen et.al. **arxiv**, **2022**, ([pdf](pdfs/The_Weighting_Game:_Evaluating_Quality_of_Explainability_Methods.pdf))([link](http://arxiv.org/abs/2208.06175v1)).\r\n", 132 | " - 文献Meta信息和PDF文件\r\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "# 查看更新的笔记\n", 138 | "!cat note_example.md" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 5, 144 | "id": "7d9fc63b", 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "The_Weighting_Game:_Evaluating_Quality_of_Explainability_Methods.pdf\r\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "# 查看下载的PDF文件\n", 157 | "!ls pdfs" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "id": "c1cdf12d", 163 | "metadata": {}, 164 | "source": [ 165 | "#### -d 的使用" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 6, 171 | "id": "00b68ddc", 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "name": "stdout", 176 | "output_type": "stream", 177 | "text": [ 178 | "\r\n" 179 | ] 180 | } 181 | ], 182 | "source": [ 183 | "# 初始化文献笔记,这时候文献内pdf链接已经被删除了,但是PDF文件还未删除。可以通过-d来同步\n", 184 | "## 先初始化笔记为空\n", 185 | "!echo \"\" > note_example.md\n", 186 | "!cat note_example.md" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 7, 192 | "id": "d96e5914", 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "INFO:AutoLiter:正在更新文件 ./note_example.md\r\n", 200 | "INFO:AutoLiter:需要下载的文献个数 - 0\r\n", 201 | "INFO:AutoLiter:未找到需要下载的文献, 文件 ./note_example.md 未更新.\r\n", 202 | "INFO:AutoLiter:若要删除笔记无关PDF实体, 输入的路径必须是笔记总文件夹!!!请谨慎使用该参数!!!\r\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "## 再通过 -m 更新 (更新的时候,怕误删其他笔记里有用的链接,因此-i必须是note总文件夹路径)\n", 208 | "!autoliter -i ./note_example.md -o pdfs -p \"127.0.0.1:7890\" -d" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 8, 214 | "id": "e8224635", 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "INFO:AutoLiter:正在更新文件 ./note_example.md\r\n", 222 | "INFO:AutoLiter:需要下载的文献个数 - 0\r\n", 223 | "INFO:AutoLiter:未找到需要下载的文献, 文件 ./note_example.md 未更新.\r\n", 224 | "INFO:AutoLiter:已删除 1 个PDF文件\r\n" 225 | ] 226 | } 227 | ], 228 | "source": [ 229 | "!autoliter -i ./ -o pdfs -d" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "id": "5fe845ae", 235 | "metadata": {}, 236 | "source": [ 237 | "### 文件迁移\n", 238 | "当移动PDF文件夹或者note文件的位置后,note文件内容中关于pdf的链接就变的不可用了。这时候可以用`-m`来解决这个问题\n", 239 | "\n", 240 | "首先先下载几个PDF文件" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 9, 246 | "id": "de62c125", 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "- {10.1038/s41592-022-01549-5}\n", 254 | " - 文献Meta信息\n", 255 | "\n", 256 | "- {{2208.06175}}\n", 257 | " - 文献Meta信息和PDF文件\n", 258 | 
"INFO:AutoLiter:正在更新文件 ./note_example.md\n", 259 | "INFO:AutoLiter:需要下载的文献个数 - 2\n", 260 | "100%|█████████████████████████████████████████████| 2/2 [00:17<00:00, 8.84s/it]\n" 261 | ] 262 | } 263 | ], 264 | "source": [ 265 | "note_init()\n", 266 | "!autoliter -i ./ -o pdfs/ -p \"127.0.0.1:7890\"" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 10, 272 | "id": "5251e0fe", 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "- **The Weighting Game: Evaluating Quality of Explainability Methods**. Lassi Raatikainen et.al. **arxiv**, **2022**, ([pdf](pdfs/The_Weighting_Game:_Evaluating_Quality_of_Explainability_Methods.pdf))([link](http://arxiv.org/abs/2208.06175v1)).\r\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "# 查看笔记中的pdf路径\n", 285 | "!cat note_example.md | grep pdf" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 11, 291 | "id": "a2285b64", 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "INFO:AutoLiter:共匹配到 - 1 - 个PDF文件\r\n" 299 | ] 300 | } 301 | ], 302 | "source": [ 303 | "# 移动PDF文件夹\n", 304 | "!mv pdfs/ movedPdfs\n", 305 | "\n", 306 | "# 然后重新链接笔记和 movedPdfs文件夹\n", 307 | "!autoliter -i ./ -m movedPdfs" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 12, 313 | "id": "114d6219", 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "- **The Weighting Game: Evaluating Quality of Explainability Methods**. Lassi Raatikainen et.al. **arxiv**, **2022**, ([pdf](movedPdfs/pdfs/The_Weighting_Game:_Evaluating_Quality_of_Explainability_Methods.pdf))([link](http://arxiv.org/abs/2208.06175v1)).\r\n" 321 | ] 322 | } 323 | ], 324 | "source": [ 325 | "# 查看从新开始链接的文件\n", 326 | "!cat note_example.md | grep pdf" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "id": "92518814", 332 | "metadata": {}, 333 | "source": [ 334 | "## 其它\n", 335 | "关于`-p`代理这个参数,如果人在国外,每次都不用使用,是最方便的。\n", 336 | "\n", 337 | "不然可以在.zashrc (macos) 文件中写入\n", 338 | "```\n", 339 | "# add proxy\n", 340 | "alias setproxy=\"export http_proxy=http://127.0.0.1:7890; export https_proxy=http://127.0.0.1:7890\"\n", 341 | "alias unsetproxy=\"unset http_proxy; unset https_proxy\"\n", 342 | "# add proxy\n", 343 | "```\n", 344 | "这样每次可以在使用`autoliter`之前使用`setproxy`使终端http走代理。然后`autoliter`中`-p`就不用每次都加了。\n", 345 | "\n", 346 | "比如\n", 347 | "```bash \n", 348 | "setproxy\n", 349 | "autoliter -i ./ -o pdfs\n", 350 | "```" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "id": "0f7344ac", 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [] 360 | } 361 | ], 362 | "metadata": { 363 | "kernelspec": { 364 | "display_name": "Python 3 (ipykernel)", 365 | "language": "python", 366 | "name": "python3" 367 | }, 368 | "language_info": { 369 | "codemirror_mode": { 370 | "name": "ipython", 371 | "version": 3 372 | }, 373 | "file_extension": ".py", 374 | "mimetype": "text/x-python", 375 | "name": "python", 376 | "nbconvert_exporter": "python", 377 | "pygments_lexer": "ipython3", 378 | "version": "3.8.13" 379 | } 380 | }, 381 | "nbformat": 4, 382 | "nbformat_minor": 5 383 | } 384 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.11.1 2 | 
feedparser>=6.0.10 3 | urllib3>=1.26.11 4 | requests>=2.28.1 5 | tqdm>=4.64.0 6 | Unidecode>=1.3.4 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('README.md', 'r', encoding='UTF-8') as f: 4 | README_MD = f.read() 5 | 6 | setup( 7 | name="autoliter", 8 | version="0.1.3", 9 | description=" Helps you manage your literature notes", 10 | long_description=README_MD, 11 | long_description_content_type='text/markdown', 12 | url="https://github.com/WilmerWang/autoLiterature", 13 | classifiers=[ 14 | "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", 15 | "Intended Audience :: Science/Research", 16 | "Programming Language :: Python :: 3", 17 | "Topic :: Text Processing :: Markup", 18 | ], 19 | install_requires=["beautifulsoup4>=4.11.1", "feedparser>=6.0.10", 20 | "urllib3>=1.26.11","requests>=2.28.1", 21 | "tqdm>=4.64.0", "Unidecode>=1.3.4"], 22 | entry_points={ 23 | "console_scripts": [ 24 | "autoliter = autoliterature.autoliter:main", 25 | ] 26 | }, 27 | packages=find_packages(), 28 | license="AGPLv3", 29 | author="Wilmer Wang", 30 | author_email="wangwei0206@foxmail.com", 31 | download_url="https://github.com/WilmerWang/autoLiterature/archive/refs/tags/v0.1.3.tar.gz", 32 | keywords=["bibtex", "arxiv", "doi", "science", "scientific-journals"], 33 | ) 34 | --------------------------------------------------------------------------------
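
A minimal usage sketch (hypothetical, not one of the repository files above): the `autoliter` CLI is a thin wrapper around the library functions in `autoliterature/downloads.py` and `autoliterature/utils.py`, and those functions can be driven directly from Python with the same `- {id}` / `- {{id}}` marker convention. The sketch below assumes the package is importable as `autoliterature`; the note text, proxy value, and `pdfs` output folder are made-up placeholders.

```python
# Minimal sketch: drive the library directly instead of going through the autoliter CLI.
# Assumes the package is importable as `autoliterature`; the note text, proxy value,
# and pdf_dir below are made-up placeholders.
import re

from autoliterature.downloads import (get_paper_info_from_paperid,
                                      get_paper_pdf_from_paperid)

note = """
- {10.1038/s41467-022-29269-6}
- {{2208.05623}}
"""

proxy = None      # e.g. "127.0.0.1:1080" when arxiv/sci-hub need a proxy
pdf_dir = "pdfs"  # plays the role of the CLI's -o folder

# Same marker convention as the CLI: {id} downloads metadata only, {{id}} also fetches the PDF.
for marker in re.findall(r'- \{.{3,}\}', note):
    want_pdf = marker.endswith("}}")
    paper_id = marker.split('{')[-1].split('}')[0]

    bib = get_paper_info_from_paperid(paper_id, proxy=proxy)
    if not bib:
        print("No metadata found for", paper_id)
        continue
    print(bib["title"], "|", bib["journal"], "|", bib["year"])

    if want_pdf:
        pdf_name = "_".join(bib["title"].split()) + ".pdf"
        # same filename sanitizing as utils.get_update_content
        pdf_name = re.sub(r"[<>:\"/\\|?*\n\r\x00-\x1F\x7F']", "_", pdf_name)
        get_paper_pdf_from_paperid(paper_id, pdf_dir + "/" + pdf_name,
                                   proxy=proxy, direct_url=bib.get("pdf_link"))
```

This mirrors what `get_update_content` in `utils.py` does before it rewrites a note file with formatted bibliography lines and relative PDF links.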