├── README.md ├── autoliterature ├── __init__.py ├── arxiv.py ├── autoliter.py ├── crossref.py ├── downloads.py ├── medbiorxiv.py ├── pdfs.py └── utils.py ├── doc └── autolter_example.ipynb ├── requirements.txt └── setup.py /README.md: -------------------------------------------------------------------------------- 1 | # autoLiterature 2 | **autoLiterature** is a Python-based command-line tool for automatic literature management. Inspired by [Mu Li](https://www.bilibili.com/video/BV1nA41157y4). 3 | 4 | 5 | **Recognition rules:** 6 | - Markers of the form `- {xxx}` are recognized automatically. 7 | - When a note file contains `- {paper_id}`, the paper's metadata is downloaded, **but not the PDF**. 8 | - When a note file contains `- {{paper_id}}`, both the paper's metadata and its PDF are downloaded. 9 | 10 | Note: `paper_id` can be the `doi` of a published paper, or the `arxiv_id`, `biorxiv_id`, or `medrxiv_id` of a preprint. 11 | 12 | ## Installation 13 | 1. Install with pip 14 | ```bash 15 | pip install autoliter 16 | # or 17 | pip3 install autoliter 18 | ``` 19 | 20 | 2. Install from source 21 | ```bash 22 | git clone https://github.com/WilmerWang/autoLiterature.git 23 | cd autoLiterature 24 | python setup.py install 25 | ``` 26 | 27 | ### Command-line options 28 | ```bash 29 | autoliter 30 | 31 | optional arguments: 32 | -h, --help show this help message and exit 33 | -i INPUT, --input INPUT 34 | The path to the note file or note file folder. 35 | -o OUTPUT, --output OUTPUT 36 | Folder path to save paper pdfs and images. NOTE: MUST BE FOLDER 37 | -p PROXY, --proxy PROXY 38 | The proxy. e.g. 127.0.0.1:1080 39 | -d, --delete Delete unreferenced attachments in notes. Use with caution, 40 | when used, -i must be a folder path including all notes 41 | -m MIGRATION, --migration MIGRATION 42 | the pdf folder path you want to reconnect to 43 | ``` 44 | 45 | ## Usage 46 | ### Basic usage 47 | Assume `input` is the folder containing the literature notes (md files) and `output` is the folder where PDFs will be saved. 48 | 49 | ```bash 50 | # Update all md files under the input folder 51 | autoliter -i input -o output 52 | 53 | # Update only the input/example.md file 54 | autoliter -i input/example.md -o output 55 | 56 | # -d is optional; when -i is a folder path, -d deletes PDFs in the output folder that are not referenced by any note 57 | autoliter -i input -o output -d 58 | ``` 59 | 60 | ### Migrating notes and PDF files 61 | When the notes or the PDF folder are moved, the PDF links inside the notes may become invalid. Use `-m` to re-link the PDF files to the notes. 62 | 63 | ```bash 64 | # Update all md files under the input folder 65 | autoliter -i input -m movedPDFs/ 66 | 67 | # Update only the input/example.md file 68 | autoliter -i input/example.md -m movedPDFs/ 69 | ``` 70 | 71 | For more details, see the [jupyter note](doc/autolter_example.ipynb) locally, or browse the doc folder on [github](https://github.com/WilmerWang/autoLiterature.git). 72 | 73 | ## License 74 | MIT -------------------------------------------------------------------------------- /autoliterature/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilmerwang/autoLiterature/22d0983b2693c8fc878308dab6358282e1023c58/autoliterature/__init__.py -------------------------------------------------------------------------------- /autoliterature/arxiv.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import requests 3 | from urllib.request import ProxyHandler 4 | import feedparser 5 | try: 6 | from urllib import quote 7 | except ImportError: 8 | from urllib.parse import quote 9 | from unidecode import unidecode 10 | 11 | from .crossref import crossrefInfo 12 | 13 | 14 | logging.basicConfig() 15 | logger = logging.getLogger('arxiv') 16 | logger.setLevel(logging.DEBUG) 17 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 18 | 19 | class arxivInfo(object): 20 | def __init__(self): 21 | self.base_url = "http://export.arxiv.org/api/query" 22 | 23 | def set_proxy_handler(self, proxy): 24 | """set proxy handler 25 | 26 |
Args: 27 | proxy (str): The proxy address, e.g. 127.0.0.1:1123 28 | 29 | Returns: 30 | A proxy handler object. 31 | """ 32 | proxy_handler = ProxyHandler({"http": f"http://{proxy}", 33 | "https": f"https://{proxy}"}) 34 | return proxy_handler 35 | 36 | 37 | def extract_json_info(self, item): 38 | """Extract bib json information from requests.get().json() 39 | 40 | Args: 41 | item (json object): obtained by requests.get().json() 42 | 43 | Returns: 44 | A dict containing the paper information. 45 | """ 46 | paper_url = item.link 47 | title = item.title 48 | journal = "arxiv" 49 | published = item.published.split("-") 50 | if len(published) > 1: 51 | year = published[0] 52 | else: 53 | year = ' ' 54 | 55 | authors = item.authors 56 | if len(authors) > 0: 57 | first_author = authors[0]["name"].split(" ") 58 | authors = " and ".join([author["name"] for author in authors]) 59 | else: 60 | first_author = authors 61 | authors = authors 62 | 63 | bib_dict = { 64 | "title": title, 65 | "author": authors, 66 | "journal": journal, 67 | "year": year, 68 | "url": paper_url, 69 | "pdf_link": item.link.replace("abs", "pdf")+".pdf", 70 | "cited_count": None 71 | } 72 | 73 | return bib_dict 74 | 75 | 76 | def get_info_by_arxivid(self, arxivId, handler=False): 77 | """Get the meta information by the given paper arxiv_id. 78 | 79 | Args: 80 | arxivId (str): The arxiv id 81 | handler (handler object): use proxy 82 | 83 | Returns: 84 | A dict containing the paper information. 85 | { 86 | "title": xxx, 87 | "author": xxx, 88 | "journal": xxx, 89 | etc 90 | } 91 | OR 92 | None 93 | """ 94 | 95 | params = "?search_query=id:"+quote(unidecode(arxivId)) 96 | 97 | try: 98 | if handler: 99 | result = feedparser.parse(self.base_url + params, handlers=[handler]) 100 | else: 101 | result = feedparser.parse(self.base_url + params) 102 | items = result.entries 103 | 104 | item = items[0] 105 | if "arxiv_doi" in item: 106 | doi = item["arxiv_doi"] 107 | 108 | crossref_info = crossrefInfo() 109 | if handler: 110 | crossref_info.set_proxy(proxy=handler.proxies["http"].split('//')[-1]) 111 | return crossref_info.get_info_by_doi(doi) 112 | else: 113 | return self.extract_json_info(item) 114 | except: 115 | logger.error("Failed to fetch info for arxiv id: {}".format(arxivId)) 116 | 117 | 118 | def get_info_by_title(self, title, field='ti'): 119 | """Get the meta information by the given paper title. 120 | 121 | Args: 122 | title (str): The paper title 123 | 124 | Returns: 125 | A dict containing the paper information.
126 | { 127 | "title": xxx, 128 | "author": xxx, 129 | "journal": xxx, 130 | etc 131 | } 132 | OR 133 | None 134 | OR 135 | A list [{}, {}, {}] 136 | """ 137 | params = "?search_query="+field+":"+quote(unidecode(title)) 138 | url = self.base_url + params 139 | try: 140 | result = feedparser.parse(url) 141 | items = result.entries 142 | print(len(items)) 143 | 144 | for i, item in enumerate(items): 145 | 146 | title_item = item.title 147 | try: 148 | title_item = title_item.decode("utf-8") 149 | except: 150 | pass 151 | 152 | item.title = title_item 153 | 154 | if title_item.lower() == title.lower(): 155 | return self.extract_json_info(item) 156 | 157 | items[i] = item 158 | 159 | return [self.extract_json_info(it) for it in items] 160 | except: 161 | logger.error("Failed to fetch info for title: {}".format(title)) 162 | 163 | 164 | if __name__ == "__main__": 165 | arxivId = "2208.05623" 166 | title = "Heterogeneous Graph Attention Network" 167 | 168 | arxiv_info = arxivInfo() 169 | handler = arxiv_info.set_proxy_handler(proxy="127.0.0.1:1123") 170 | 171 | bib_arxiv = arxiv_info.get_info_by_arxivid(arxivId, handler=handler) 172 | # bib_title = arxiv_info.get_info_by_title(title) 173 | 174 | print(bib_arxiv) 175 | print("\n") 176 | # print(bib_title) -------------------------------------------------------------------------------- /autoliterature/autoliter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import argparse 3 | import os 4 | 5 | from .utils import patternRecognizer, note_modified, get_pdf_paths, get_pdf_paths_from_notes, get_update_content, get_pdf_paths_from_notes_dict 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger('AutoLiter') 9 | logger.setLevel(logging.INFO) 10 | 11 | 12 | def set_args(): 13 | parser = argparse.ArgumentParser(description='autoLiterature') 14 | parser.add_argument('-i', '--input', required=True, type=str, default=None, 15 | help="The path to the note file or note file folder.") 16 | parser.add_argument('-o', '--output', type=str, default=None, 17 | help='Folder path to save paper pdfs and images. NOTE: MUST BE FOLDER') 18 | parser.add_argument('-p', '--proxy', type=str, default=None, 19 | help='The proxy. e.g. 127.0.0.1:1080') 20 | parser.add_argument('-d', '--delete', action='store_true', 21 | help='Delete unreferenced attachments in notes.
Use with caution, ' 22 | 'when used, -i must be a folder path including all notes') 23 | parser.add_argument('-m', '--migration', type=str, default=None, 24 | help="the pdf folder path you want to reconnect to") 25 | args = parser.parse_args() 26 | 27 | return args 28 | 29 | def check_args(): 30 | args = set_args() 31 | input_path = args.input 32 | output_path = args.output 33 | delete_bool = args.delete 34 | migration_path = args.migration 35 | proxy = args.proxy 36 | 37 | return input_path, output_path, delete_bool, proxy, migration_path 38 | 39 | 40 | def get_bib_and_pdf(note_file, output_path, proxy, paper_recognizer): 41 | 42 | pdfs_path = output_path 43 | if not os.path.exists(pdfs_path): 44 | os.makedirs(pdfs_path) 45 | 46 | with open(note_file, 'r') as f: 47 | content = f.read() 48 | 49 | m = paper_recognizer.findall(content) 50 | logger.info("需要下载的文献个数 - {}".format(len(m))) 51 | 52 | if not m: 53 | logger.info("未找到需要下载的文献, 文件 {} 未更新.".format(note_file)) 54 | else: 55 | # TODO add pd_online link in note file 56 | replace_dict = get_update_content(m, note_file, pdfs_path, proxy=proxy) 57 | 58 | return replace_dict 59 | 60 | 61 | def file_update(input_path, output_path, proxy, paper_recognizer): 62 | 63 | replace_dict = get_bib_and_pdf(input_path, output_path, 64 | proxy, paper_recognizer) 65 | 66 | if replace_dict: 67 | note_modified(paper_recognizer, input_path, **replace_dict) 68 | 69 | 70 | def main(): 71 | input_path, output_path, delete_bool, proxy, migration_path = check_args() 72 | 73 | if output_path: 74 | paper_recognizer = patternRecognizer(r'- \{.{3,}\}') 75 | 76 | if os.path.isfile(input_path): 77 | logger.info("正在更新文件 {}".format(input_path)) 78 | file_update(input_path, output_path, proxy, paper_recognizer) 79 | 80 | elif os.path.isdir(input_path): 81 | note_paths = [] 82 | for root, _, files in os.walk(input_path): 83 | for file in files: 84 | if file.lower().endswith('md') or file.lower().endswith('markdown'): 85 | note_paths.append(os.path.join(root, file)) 86 | for note_path in note_paths: 87 | logger.info("正在更新文件 {}".format(note_path)) 88 | file_update(note_path, output_path, proxy, paper_recognizer) 89 | else: 90 | logger.info("input path {} is not exists".format(input_path)) 91 | 92 | 93 | # Delete unreferenced attachments 94 | if delete_bool: 95 | if os.path.isfile(input_path): 96 | logger.info("若要删除笔记无关PDF实体, 输入的路径必须是笔记总文件夹!!!请谨慎使用该参数!!!") 97 | else: 98 | pdf_path_recognizer = patternRecognizer(r'\[pdf\]\(.{5,}\.pdf\)') 99 | pdf_paths_in_notes = get_pdf_paths_from_notes(input_path, pdf_path_recognizer) 100 | pdf_paths = get_pdf_paths(output_path) 101 | # TODO mac 和 win 之间路径可能会不同,“/” 和 “\\” 102 | pdf_paths_in_notes = [os.path.abspath(i).replace('\\', '/') for i in pdf_paths_in_notes] 103 | pdf_paths = [os.path.abspath(i).replace('\\', '/') for i in pdf_paths] 104 | 105 | removed_pdf_paths = list(set(pdf_paths) - set(pdf_paths_in_notes)) 106 | try: 107 | for pdf_p in removed_pdf_paths: 108 | os.remove(pdf_p) 109 | except: 110 | pass 111 | 112 | logger.info("已删除 {} 个PDF文件".format(len(removed_pdf_paths))) 113 | 114 | 115 | if migration_path: 116 | pdf_path_recognizer = patternRecognizer(r'\[pdf\]\(.{5,}\.pdf\)') 117 | 118 | pdf_paths = get_pdf_paths(migration_path) 119 | pdf_paths_in_notes = get_pdf_paths_from_notes_dict(input_path, pdf_path_recognizer) 120 | 121 | # match based on paper title 122 | matched_numb = 0 123 | pdf_paths_dict = {os.path.basename(i): i for i in pdf_paths} 124 | for md_file, pdf_paths_ in pdf_paths_in_notes.items(): 125 | 126 | 
pdf_paths_in_notes_dict = {os.path.basename(i): i for i in pdf_paths_} 127 | matched_pdfs = pdf_paths_dict.keys() & pdf_paths_in_notes_dict.keys() 128 | 129 | matched_numb += len(matched_pdfs) 130 | 131 | # os.path.relpath(pdf_path, note_file).split('/',1)[-1] 132 | replace_paths_dict = {} 133 | for matched in matched_pdfs: 134 | replaced_str = os.path.relpath(pdf_paths_dict[matched], md_file).split('/',1)[-1] 135 | replaced_str = "[pdf]({})".format(replaced_str) 136 | ori_str = "[pdf]({})".format(pdf_paths_in_notes_dict[matched]) 137 | replace_paths_dict[ori_str] = replaced_str 138 | 139 | if replace_paths_dict: 140 | note_modified(pdf_path_recognizer, md_file, **replace_paths_dict) 141 | 142 | logger.info("共匹配到 - {} - 个PDF文件".format(matched_numb)) 143 | 144 | 145 | if not output_path and not migration_path: 146 | logger.info("缺少关键参数 -o 或者 -m, 程序未运行, 请使用 -h 查看具体信息") 147 | 148 | 149 | if __name__ == "__main__": 150 | main() -------------------------------------------------------------------------------- /autoliterature/crossref.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import requests 3 | # 4 | # 1. get info by doi 5 | # 2. get info by title 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger('crossref') 9 | logger.setLevel(logging.DEBUG) 10 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 11 | 12 | class crossrefInfo(object): 13 | def __init__(self): 14 | self.sess = requests.Session() 15 | self.sess.headers = HEADERS 16 | self.base_url = "http://api.crossref.org/" 17 | 18 | def set_proxy(self, proxy=None): 19 | """set proxy for session 20 | 21 | Args: 22 | proxy (str): The proxy address, e.g. 127.0.0.1:1123 23 | Returns: 24 | None 25 | """ 26 | if proxy: 27 | self.sess.proxies = { 28 | "http": proxy, 29 | "https": proxy, } 30 | 31 | 32 | def extract_json_info(self, bib): 33 | """Extract bib json information from requests.get().json() 34 | 35 | Args: 36 | bib (json object): obtained by requests.get().json() 37 | 38 | Returns: 39 | A dict containing the paper information. 40 | """ 41 | pub_date = [str(i) for i in bib['published']["date-parts"][0]] 42 | pub_date = '-'.join(pub_date) 43 | 44 | if 'author' in bib.keys(): 45 | authors = ' and '.join([i["family"]+" "+i['given'] for i in bib['author'] if "family" in i.keys() and "given" in i.keys()]) 46 | else: 47 | authors = "No author" 48 | 49 | if 'short-container-title' in bib.keys(): 50 | try: 51 | journal = bib['short-container-title'][0] 52 | except: 53 | journal = "No journal" 54 | else: 55 | try: 56 | journal = bib['container-title'][0] 57 | except: 58 | journal = "No journal" 59 | 60 | bib_dict = { 61 | "title": bib['title'][0], 62 | "author": authors, 63 | "journal": journal, 64 | "year": pub_date, 65 | "url": bib["URL"], 66 | "pdf_link": bib["link"][0]["URL"], 67 | "cited_count": bib["is-referenced-by-count"] 68 | } 69 | 70 | return bib_dict 71 | 72 | 73 | def get_info_by_doi(self, doi): 74 | """Get the meta information by the given paper DOI number. 75 | 76 | Args: 77 | doi (str): The paper DOI number 78 | 79 | Returns: 80 | A dict containing the paper information.
81 | { 82 | "title": xxx, 83 | "author": xxx, 84 | "journal": xxx, 85 | etc 86 | } 87 | OR 88 | None 89 | """ 90 | url = "{}works/{}" 91 | url = url.format(self.base_url, doi) 92 | 93 | try: 94 | r = self.sess.get(url) 95 | 96 | bib = r.json()['message'] 97 | return self.extract_json_info(bib) 98 | 99 | except: 100 | logger.error("DOI: {} is error.".format(doi)) 101 | 102 | 103 | def get_info_by_title(self, title): 104 | """Get the meta information by the given paper title. 105 | 106 | Args: 107 | doi (str): The paper title 108 | 109 | Returns: 110 | A dict containing the paper information. 111 | { 112 | "title": xxx, 113 | "author": xxx, 114 | "journal": xxx, 115 | etc 116 | } 117 | OR 118 | None 119 | OR 120 | A list [{}, {}, {}] 121 | """ 122 | url = self.base_url + "works" 123 | params = {"query.bibliographic": title, "rows": 20} 124 | try: 125 | r = self.sess.get(url, params=params) 126 | items = r.json()["message"]["items"] 127 | 128 | for i, item in enumerate(items): 129 | 130 | title_item = item['title'][0] 131 | try: 132 | title_item = title_item.decode("utf-8") 133 | except: 134 | pass 135 | 136 | item["title"][0] = title_item 137 | 138 | if title_item.lower() == title.lower(): 139 | return self.extract_json_info(item) 140 | 141 | items[i] = item 142 | 143 | return [self.extract_json_info(it) for it in items] 144 | except: 145 | logger.error("Title: {} is error.".format(title)) 146 | 147 | 148 | if __name__ == "__main__": 149 | # doi = "10.1016/j.wneu.2012.11.074" 150 | # doi = "10.1093/cercor/bhac266" 151 | doi = "10.1038/s41467-022-29269-6" 152 | # title = "Heterogeneous Graph Attention Network" 153 | # title = "Learning to Copy Coherent Knowledge for Response Generation" 154 | 155 | crossref_info = crossrefInfo() 156 | crossref_info.set_proxy(proxy="127.0.1:1123") 157 | 158 | bib_doi = crossref_info.get_info_by_doi(doi) 159 | # bib_title = crossref_info.get_info_by_title(title) 160 | 161 | print(bib_doi) 162 | print("\n") 163 | # print(bib_title) 164 | -------------------------------------------------------------------------------- /autoliterature/downloads.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import os 4 | 5 | from .arxiv import arxivInfo 6 | from .crossref import crossrefInfo 7 | from .medbiorxiv import BMxivInfo 8 | from .pdfs import pdfDownload 9 | 10 | # log config 11 | logging.basicConfig() 12 | logger = logging.getLogger('Downloads') 13 | logger.setLevel(logging.INFO) 14 | 15 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 16 | 17 | 18 | 19 | def check_string(re_exp, str): 20 | res = re.match(re_exp, str) 21 | if res: 22 | return True 23 | else: 24 | return False 25 | 26 | def classify(identifier): 27 | """ 28 | Classify the type of paper_id: 29 | arxivId - arxivId 30 | doi - digital object identifier 31 | medbiorxivId - medrxiv or biorxiv id 32 | title - title 33 | """ 34 | if check_string(r'10\.(?!1101)[0-9]{4}/\.*', identifier): 35 | return 'doi' 36 | elif check_string(r'10\.1101/\.*', identifier): 37 | return "medbiorxivId" 38 | elif check_string(r'[0-9]{2}[0-1][0-9]\.[0-9]{3,}.*', identifier) or check_string(r'.*/[0-9]{2}[0-1][0-9]{4}', identifier): 39 | return 'arxivId' 40 | elif check_string(r'[a-zA-Z\d\.-/\s]*', identifier): 41 | return 'title' 42 | else: 43 | return "unrecognized" 44 | 45 | def get_paper_info_from_paperid(paper_id, proxy=None): 46 | id_type = classify(paper_id) 47 | 48 | if id_type == "doi": 49 | downloader = 
crossrefInfo() 50 | if proxy: 51 | downloader.set_proxy(proxy=proxy) 52 | bib_dict = downloader.get_info_by_doi(paper_id) 53 | 54 | elif id_type == "arxivId": 55 | downloader = arxivInfo() 56 | if proxy: 57 | downloader.set_proxy_handler(proxy=proxy) 58 | bib_dict = downloader.get_info_by_arxivid(paper_id) 59 | 60 | elif id_type == "medbiorxivId": 61 | downloader = BMxivInfo() 62 | if proxy: 63 | downloader.set_proxy(proxy=proxy) 64 | bib_dict = downloader.get_info_by_bmrxivid(paper_id) 65 | 66 | elif id_type == "title": 67 | pass 68 | else: 69 | pass 70 | 71 | try: 72 | return bib_dict 73 | except: 74 | pass 75 | 76 | 77 | def get_paper_pdf_from_paperid(paper_id, path, proxy=None, direct_url=None): 78 | pdf_downloader = pdfDownload() 79 | if proxy: 80 | pdf_downloader.set_proxy(proxy=proxy) 81 | 82 | if direct_url: 83 | content = pdf_downloader.get_pdf_from_direct_url(direct_url) 84 | if not content: 85 | content = pdf_downloader.get_pdf_from_sci_hub(paper_id) 86 | else: 87 | content = pdf_downloader.get_pdf_from_sci_hub(paper_id) 88 | 89 | try: 90 | if not os.path.exists(path.rsplit("/", 1)[0]): 91 | os.makedirs(path.rsplit("/", 1)[0]) 92 | pdf_downloader._save(content['pdf'], path) 93 | except: 94 | pass 95 | 96 | 97 | 98 | 99 | if __name__ == "__main__": 100 | doi = "10.1016/j.wneu.2012.11.074" 101 | arxiv_id = "2208.05623" 102 | medbiorxiv_id = "10.1101/2022.07.28.22277637" 103 | undefine_name = "sjsldjfnadijjsl;kjdjf" 104 | 105 | print(get_paper_info_from_paperid(doi)) 106 | print(get_paper_info_from_paperid(arxiv_id)) 107 | print(get_paper_info_from_paperid(medbiorxiv_id)) 108 | print(get_paper_info_from_paperid(undefine_name)) -------------------------------------------------------------------------------- /autoliterature/medbiorxiv.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import requests 3 | from bs4 import BeautifulSoup 4 | 5 | from .crossref import crossrefInfo 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger('biorxiv') 9 | logger.setLevel(logging.DEBUG) 10 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 11 | 12 | class BMxivInfo(object): 13 | def __init__(self): 14 | self.sess = requests.Session() 15 | self.sess.headers = HEADERS 16 | self.base_url = "https://api.biorxiv.org/details/" 17 | self.servers = ["biorxiv", "medrxiv"] 18 | 19 | 20 | def set_proxy(self, proxy=False): 21 | """set proxy for session 22 | 23 | Args: 24 | proxy (str): The proxy adress. e.g 127.0.1:1123 25 | Returns: 26 | None 27 | """ 28 | if proxy: 29 | self.sess.proxies = { 30 | "http": proxy, 31 | "https": proxy, } 32 | 33 | 34 | def extract_json_info(self, item): 35 | """Extract bib json information from requests.get().json() 36 | 37 | Args: 38 | item (json object): obtained by requests.get().json() 39 | 40 | Returns: 41 | A dict containing the paper information. 
42 | """ 43 | paper_url = f"https://www.biorxiv.org/content/{item['doi']}" 44 | title = item["title"] 45 | journal = item["server"] 46 | published = item["date"].split('-') 47 | if len(published) > 1: 48 | year = published[0] 49 | else: 50 | year = ' ' 51 | 52 | authors = item['authors'].split("; ") 53 | if len(authors) > 0: 54 | authors = " and ".join([author for author in authors]) 55 | else: 56 | authors = authors 57 | 58 | bib_dict = { 59 | "title": title, 60 | "author": authors, 61 | "journal": journal, 62 | "year": year, 63 | "url": paper_url, 64 | "pdf_link": f"{paper_url}.full.pdf", 65 | "cited_count": None 66 | } 67 | 68 | return bib_dict 69 | 70 | 71 | def get_info_by_bmrxivid(self, bmrxivid): 72 | """Get the meta information by the given paper biorxiv_id or medrxiv_id. 73 | 74 | Args: 75 | doi (str): The biorxiv or medrxiv Id 76 | 77 | Returns: 78 | A dict containing the paper information. 79 | { 80 | "title": xxx, 81 | "author": xxx, 82 | "journal": xxx, 83 | etc 84 | } 85 | OR 86 | None 87 | """ 88 | urls = [self.base_url + server + "/" + bmrxivid for server in self.servers] 89 | for url in urls: 90 | try: 91 | r = self.sess.get(url) 92 | 93 | bib = r.json()['collection'][-1] 94 | 95 | if "published" in bib.keys() and bib['published'] != "NA": 96 | doi = bib["published"] 97 | print(doi) 98 | crossref_info = crossrefInfo() 99 | if len(self.sess.proxies) > 0: 100 | crossref_info.set_proxy(self.sess.proxies['http'].split('//')[-1]) 101 | return crossref_info.get_info_by_doi(doi) 102 | 103 | return self.extract_json_info(bib) 104 | 105 | except: 106 | logger.error("DOI: {} is error.".format(bmrxivid)) 107 | 108 | 109 | def get_info_by_title(self, title): 110 | """Get the meta information by the given paper title. 111 | 112 | Args: 113 | doi (str): The paper title 114 | 115 | Returns: 116 | A dict containing the paper information. 
117 | { 118 | "title": xxx, 119 | "author": xxx, 120 | "journal": xxx, 121 | etc 122 | } 123 | OR 124 | None 125 | OR 126 | A list [{}, {}, {}] 127 | """ 128 | base_url = "https://www.biorxiv.org/search/{}%20jcode%3Amedrxiv%7C%7Cbiorxiv%20numresults%3A25%20\sort%3Arelevance-rank%20\format_result%3Astandard" 129 | query = title.replace(' ', '%252B') 130 | 131 | url = base_url.format(query) 132 | try: 133 | result = self.sess.get(url) 134 | soup = BeautifulSoup(result.content, "lxml") 135 | soup_items = soup.find_all("div",class_="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix") 136 | 137 | soup_dict = dict() 138 | for sp in soup_items: 139 | key = sp.find("a", class_="highwire-cite-linked-title").span.text 140 | value = sp.find("span", class_="highwire-cite-metadata-doi highwire-cite-metadata").text.split("org/")[-1].split("v")[0].replace(" ", "") 141 | soup_dict[key] = value 142 | 143 | for item_title, item_doi in soup_dict.items(): 144 | try: 145 | item_title = item_title.decode("utf-8") 146 | except: 147 | pass 148 | 149 | if item_title.lower() == title.lower(): 150 | return self.get_info_by_bmrxivid(item_doi) 151 | 152 | return [self.get_info_by_bmrxivid(it) for it in soup_dict.values()] 153 | except: 154 | logger.error("Title: {} is error.".format(title)) 155 | 156 | 157 | if __name__ == "__main__": 158 | 159 | arxivId = "10.1101/2022.07.28.22277637" 160 | # title = "Oxygen restriction induces a viable but non-culturable population in bacteria" 161 | # title = "A molecular atlas of the human postmenopausal fallopian tube and ovary from single-cell RNA and ATAC sequencing" 162 | # title = "Radiographic Assessment of Lung Edema (RALE) Scores are Highly Reproducible and Prognostic of Clinical Outcomes for Inpatients with COVID-19" 163 | # title = "Untargeted metabolomics of COVID-19 patient serum reveals potential prognostic markers of both severity and outcome" 164 | 165 | arxiv_info = BMxivInfo() 166 | arxiv_info.set_proxy(proxy="127.0.1:1123") 167 | 168 | bib_arxiv = arxiv_info.get_info_by_bmrxivid(arxivId) 169 | # bib_title = arxiv_info.get_info_by_title(title) 170 | 171 | print(bib_arxiv) 172 | print("\n") 173 | # print(bib_title) -------------------------------------------------------------------------------- /autoliterature/pdfs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import requests 3 | from urllib.parse import urlunsplit, urlsplit 4 | from bs4 import BeautifulSoup 5 | 6 | logging.basicConfig() 7 | logger = logging.getLogger('PDFs') 8 | logger.setLevel(logging.DEBUG) 9 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 10 | 11 | 12 | class pdfDownload(object): 13 | def __init__(self): 14 | self.sess = requests.Session() 15 | self.sess.headers = HEADERS 16 | 17 | def set_proxy(self, proxy=None): 18 | """set proxy for session 19 | 20 | Args: 21 | proxy (str): The proxy adress. e.g 127.0.1:1123 22 | Returns: 23 | None 24 | """ 25 | if proxy: 26 | self.sess.proxies = { 27 | "http": proxy, 28 | "https": proxy, } 29 | 30 | 31 | def _get_available_scihub_urls(self): 32 | ''' 33 | Finds available scihub urls via https://lovescihub.wordpress.com/ or 34 | https://sci-hub.now.sh/ 35 | ''' 36 | urls = [] 37 | res = self.sess.get('https://lovescihub.wordpress.com/') 38 | s = BeautifulSoup(res.content, 'html.parser') 39 | for a in s.find('div', class_="entry-content").find_all('a', href=True): 40 | if 'sci-hub.' 
in a['href']: 41 | urls.append(a['href']) 42 | return urls 43 | 44 | 45 | def fetch(self, url, auth=None): 46 | '''Fetch pdf 47 | 48 | Args: 49 | url (str): 50 | 51 | Returns: 52 | A dict OR None 53 | ''' 54 | try: 55 | r = self.sess.get(url, auth=auth) 56 | 57 | if r.headers["Content-Type"] != "application/pdf": 58 | logger.info("Failed to fetch pdf with url: {}".format(url)) 59 | else: 60 | return { 61 | 'pdf': r.content, 62 | 'url': url 63 | } 64 | except: 65 | logger.error("Failed to open url: {}".format(url)) 66 | 67 | 68 | def get_pdf_from_direct_url(self, url, auth=None): 69 | return self.fetch(url, auth=auth) 70 | 71 | 72 | def get_pdf_from_sci_hub(self, identifier, auth=None): 73 | '''Fetch pdf from sci-hub based on doi or url 74 | 75 | Args: 76 | identifier (str): DOI or url 77 | auth (tuple): ("user", "passwd") 78 | 79 | Returns: 80 | A dict OR None 81 | ''' 82 | for base_url in self._get_available_scihub_urls(): 83 | r = self.sess.get(base_url + '/' + identifier, auth=auth) 84 | soup = BeautifulSoup(r.content, 'html.parser') 85 | 86 | pdf_div_names = ['iframe', 'embed'] 87 | for pdf_div_name in pdf_div_names: 88 | pdf_div = soup.find(pdf_div_name) 89 | if pdf_div != None: 90 | break 91 | try: 92 | url_parts = urlsplit(pdf_div.get('src')) 93 | if url_parts[1]: 94 | if url_parts[0]: 95 | pdf_url = urlunsplit((url_parts[0], url_parts[1], url_parts[2], '', '')) 96 | else: 97 | pdf_url = urlunsplit(('https', url_parts[1], url_parts[2], '', '')) 98 | else: 99 | pdf_url = urlunsplit(('https', urlsplit(base_url)[1], url_parts[2], '', '')) 100 | 101 | return self.fetch(pdf_url, auth) 102 | except: 103 | pass 104 | 105 | logger.info("Failed to fetch pdf with all sci-hub urls") 106 | 107 | def _save(self, content, path): 108 | with open(path, "wb") as f: 109 | f.write(content) 110 | 111 | 112 | if __name__ == "__main__": 113 | doi = "10.1145/3308558.3313562" 114 | 115 | pdf_download = pdfDownload() 116 | pdf_download.set_proxy("127.0.1:1123") 117 | 118 | pdf_dict = pdf_download.get_pdf_from_sci_hub(doi) 119 | if pdf_dict: 120 | print(pdf_dict['url']) 121 | pdf_download.download(pdf_dict['pdf'] ,"/home/admin/tmp.pdf") 122 | 123 | # pdf_dict2 = pdf_download.get_pdf_from_direct_url("https://arxiv.org/pdf/2208.05419.pdf") 124 | # if pdf_dict2: 125 | # print(pdf_dict2['url']) 126 | # pdf_download.download(pdf_dict2['pdf'] ,"/home/admin/tmp2.pdf") 127 | 128 | -------------------------------------------------------------------------------- /autoliterature/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import re 4 | from tqdm import tqdm 5 | from .downloads import get_paper_info_from_paperid, get_paper_pdf_from_paperid 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger('utils') 9 | logger.setLevel(logging.INFO) 10 | 11 | 12 | class patternRecognizer(object): 13 | def __init__(self, regular_rule): 14 | self.pattern = re.compile(regular_rule) 15 | 16 | def match(self, string): 17 | return self.pattern.match(string) 18 | 19 | def findall(self, string): 20 | return self.pattern.findall(string) 21 | 22 | def multiple_replace(self, content, **replace_dict): 23 | def replace_(value): 24 | match = value.group() 25 | if match in replace_dict.keys(): 26 | return replace_dict[match] 27 | else: 28 | return match+" **Not Correct, Check it**" 29 | 30 | replace_content = self.pattern.sub(replace_, content) 31 | 32 | return replace_content 33 | 34 | 35 | def note_modified(pattern_recog, md_file, **replace_dict): 36 | with 
open(md_file, 'r') as f: 37 | content = f.read() 38 | 39 | replaced_content = pattern_recog.multiple_replace(content, **replace_dict) 40 | 41 | with open(md_file, 'w') as f: 42 | f.write(''.join(replaced_content)) 43 | 44 | 45 | def get_pdf_paths(pdf_root): 46 | pdf_paths = [] 47 | for root, _, files in os.walk(pdf_root): 48 | for file in files: 49 | if file.lower().endswith('.pdf'): 50 | pdf_paths.append(os.path.join(root, file)) 51 | 52 | return pdf_paths 53 | 54 | 55 | def get_pdf_paths_from_notes(md_root, reg): 56 | 57 | md_files = [] 58 | for root, _, files in os.walk(md_root): 59 | for file in files: 60 | if file.lower().endswith('md') or file.lower().endswith('markdown'): 61 | md_files.append(os.path.join(root, file)) 62 | 63 | pdf_paths_from_notes = [] 64 | for md_file in md_files: 65 | with open(md_file, 'r') as f: 66 | content = f.read() 67 | m = reg.findall(content) 68 | m = [i.split("(")[-1].split(')')[0] for i in m] 69 | pdf_paths_from_notes.extend(m) 70 | 71 | return pdf_paths_from_notes 72 | 73 | 74 | def get_pdf_paths_from_notes_dict(md_root, reg): 75 | pdf_paths_from_notes_dict = {} 76 | if os.path.isdir(md_root): 77 | md_files = [] 78 | for root, _, files in os.walk(md_root): 79 | for file in files: 80 | if file.lower().endswith('md') or file.lower().endswith('markdown'): 81 | md_files.append(os.path.join(root, file)) 82 | 83 | for md_file in md_files: 84 | with open(md_file, 'r') as f: 85 | content = f.read() 86 | m = reg.findall(content) 87 | m = [i.split("(")[-1].split(')')[0] for i in m] 88 | pdf_paths_from_notes_dict[md_file] = m 89 | else: 90 | with open(md_root, 'r') as f: 91 | content = f.read() 92 | m = reg.findall(content) 93 | m = [i.split("(")[-1].split(')')[0] for i in m] 94 | pdf_paths_from_notes_dict[md_root] = m 95 | 96 | return pdf_paths_from_notes_dict 97 | 98 | 99 | def classify_identifier(identifier): 100 | """Not need to download PDF file 101 | """ 102 | if identifier.endswith("}}"): 103 | return True 104 | else: 105 | return False 106 | 107 | 108 | def get_update_content(m, note_file, pdfs_path, proxy): 109 | 110 | replace_dict = dict() 111 | for literature in tqdm(m): 112 | pdf = classify_identifier(literature) 113 | 114 | literature_id = literature.split('{')[-1].split('}')[0] 115 | bib = get_paper_info_from_paperid(literature_id, proxy=proxy) 116 | 117 | try: 118 | pdf_name = '_'.join(bib['title'].split(' ')) + '.pdf' 119 | # rep specific symbol with '_' 120 | pdf_name = re.sub(r"[<>:\"/\\|?*\n\r\x00-\x1F\x7F']", '_', pdf_name) 121 | pdf_path = os.path.join(pdfs_path, pdf_name) 122 | 123 | if pdf: 124 | if not os.path.exists(pdf_path): 125 | get_paper_pdf_from_paperid(literature_id, pdf_path, direct_url=bib['pdf_link'], proxy=proxy) 126 | if not os.path.exists(pdf_path): 127 | get_paper_pdf_from_paperid(literature_id, pdf_path, proxy=proxy) 128 | 129 | if os.path.exists(pdf_path): 130 | replaced_literature = "- **{}**. {} et.al. **{}**, **{}**, ([pdf]({}))([link]({})).".format( 131 | bib['title'], bib["author"].split(" and ")[0], bib['journal'], 132 | bib['year'], os.path.relpath(pdf_path, note_file).split('/',1)[-1], 133 | bib['url']) 134 | else: 135 | replaced_literature = "- **{}**. {} et.al. 
**{}**, **{}**, ([link]({})).".format( 136 | bib['title'], bib["author"].split(" and ")[0], bib['journal'], 137 | bib['year'], bib['url'] 138 | ) 139 | replace_dict[literature] = replaced_literature 140 | except: 141 | logger.info("文献下载失败,已经跳过 {}".format(literature_id)) 142 | 143 | return replace_dict -------------------------------------------------------------------------------- /doc/autolter_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ec0def0a", 6 | "metadata": {}, 7 | "source": [ 8 | "# autoliter example\n", 9 | "该文件仅作演示使用,文中所有命令去掉`!`都是`bash`命令。比如`!pip list`在jupyter notebook中 == `pip list`在终端。\n", 10 | "\n", 11 | "## 准备\n", 12 | "### 安装 autoliter" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "id": "96404bec", 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "autoliter 0.1.2 /Users/wilmer/E/code/autoLiterature\r\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "! pip install autoliter \n", 31 | "! pip list | grep autoliter" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "4ef141af", 37 | "metadata": {}, 38 | "source": [ 39 | "### 代理环境\n", 40 | "由于arxiv,sci-hub等网站在国内可能连接不上,所以最好有一个可以连接外网的代理。\n", 41 | "- 如果个人PC用的clash软件,默认的端口是`7890`,那么我们就可以通过`127.0.0.1:7890`来使用autoliter\n", 42 | "- 如果用的其他服务的代理,找到服务器的代理端口,可以通过`服务器Ip:服务器Port`来使用autoliter\n", 43 | "\n", 44 | "测试一下代理环境:\n", 45 | "\n", 46 | "自行确定代理端口以及ip地址,然后浏览器查看能否连通google.com, 确保无误后再接着往下走。\n", 47 | "\n", 48 | "### 文档准备\n", 49 | "在目录下使用`note_example.md`作为笔记,那么`./`文件夹路径就可以认为是所有笔记的总文件夹路径。" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "id": "400ecc5d", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "- {10.1038/s41592-022-01549-5}\r\n", 63 | " - 文献Meta信息\r\n", 64 | "\r\n", 65 | "- {{2208.06175}}\r\n", 66 | " - 文献Meta信息和PDF文件\r\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "def note_init():\n", 72 | " # # 增加一个仅下载文献Meta信息,不下载文献PDF的标识 - {*}\n", 73 | " # !echo -e \"- {10.1038/s41592-022-01549-5}\\n - 文献Meta信息\\n\" > note_example.md\n", 74 | " # # 再追加一个既下载Meta信息,又下载PDF文件的标识 - {{*}}\n", 75 | " # !echo -e \"- {{2208.06175}}\\n - 文献Meta信息和PDF文件\" >> note_example.md\n", 76 | " with open(\"note_example.md\", 'w', encoding=\"UTF-8\") as f:\n", 77 | " f.write(\"- {10.1038/s41592-022-01549-5}\\n - 文献Meta信息\\n\\n\")\n", 78 | " f.write(\"- {{2208.06175}}\\n - 文献Meta信息和PDF文件\\n\")\n", 79 | "\n", 80 | " # 查看一下 note_example.md 中的内容\n", 81 | " !cat note_example.md\n", 82 | "note_init()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "4177cc3f", 88 | "metadata": {}, 89 | "source": [ 90 | "## autoliter使用\n", 91 | "在准备好以上工作之后,就可以演示如何使用`autoliter`了。当然,以上准备工作你也可以用其他方式完成。\n", 92 | "\n", 93 | "### 常规使用\n", 94 | "#### 下载更新文献笔记" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 3, 100 | "id": "74b0cf75", 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "INFO:AutoLiter:正在更新文件 note_example.md\n", 108 | "INFO:AutoLiter:需要下载的文献个数 - 2\n", 109 | "100%|█████████████████████████████████████████████| 2/2 [00:17<00:00, 8.79s/it]\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "# 更新 note_example.md\n", 115 | "!autoliter -i note_example.md -o pdfs -p \"127.0.0.1:7890\"" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | 
"execution_count": 4, 121 | "id": "ca08aef3", 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "- **Functional ultrasound localization microscopy reveals brain-wide neurovascular activity on a microscopic scale**. Renaudin Noémi et.al. **Nat Methods**, **2022-8**, ([link](http://dx.doi.org/10.1038/s41592-022-01549-5)).\r\n", 129 | " - 文献Meta信息\r\n", 130 | "\r\n", 131 | "- **The Weighting Game: Evaluating Quality of Explainability Methods**. Lassi Raatikainen et.al. **arxiv**, **2022**, ([pdf](pdfs/The_Weighting_Game:_Evaluating_Quality_of_Explainability_Methods.pdf))([link](http://arxiv.org/abs/2208.06175v1)).\r\n", 132 | " - 文献Meta信息和PDF文件\r\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "# 查看更新的笔记\n", 138 | "!cat note_example.md" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 5, 144 | "id": "7d9fc63b", 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "The_Weighting_Game:_Evaluating_Quality_of_Explainability_Methods.pdf\r\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "# 查看下载的PDF文件\n", 157 | "!ls pdfs" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "id": "c1cdf12d", 163 | "metadata": {}, 164 | "source": [ 165 | "#### -d 的使用" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 6, 171 | "id": "00b68ddc", 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "name": "stdout", 176 | "output_type": "stream", 177 | "text": [ 178 | "\r\n" 179 | ] 180 | } 181 | ], 182 | "source": [ 183 | "# 初始化文献笔记,这时候文献内pdf链接已经被删除了,但是PDF文件还未删除。可以通过-d来同步\n", 184 | "## 先初始化笔记为空\n", 185 | "!echo \"\" > note_example.md\n", 186 | "!cat note_example.md" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 7, 192 | "id": "d96e5914", 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "INFO:AutoLiter:正在更新文件 ./note_example.md\r\n", 200 | "INFO:AutoLiter:需要下载的文献个数 - 0\r\n", 201 | "INFO:AutoLiter:未找到需要下载的文献, 文件 ./note_example.md 未更新.\r\n", 202 | "INFO:AutoLiter:若要删除笔记无关PDF实体, 输入的路径必须是笔记总文件夹!!!请谨慎使用该参数!!!\r\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "## 再通过 -m 更新 (更新的时候,怕误删其他笔记里有用的链接,因此-i必须是note总文件夹路径)\n", 208 | "!autoliter -i ./note_example.md -o pdfs -p \"127.0.0.1:7890\" -d" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 8, 214 | "id": "e8224635", 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "INFO:AutoLiter:正在更新文件 ./note_example.md\r\n", 222 | "INFO:AutoLiter:需要下载的文献个数 - 0\r\n", 223 | "INFO:AutoLiter:未找到需要下载的文献, 文件 ./note_example.md 未更新.\r\n", 224 | "INFO:AutoLiter:已删除 1 个PDF文件\r\n" 225 | ] 226 | } 227 | ], 228 | "source": [ 229 | "!autoliter -i ./ -o pdfs -d" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "id": "5fe845ae", 235 | "metadata": {}, 236 | "source": [ 237 | "### 文件迁移\n", 238 | "当移动PDF文件夹或者note文件的位置后,note文件内容中关于pdf的链接就变的不可用了。这时候可以用`-m`来解决这个问题\n", 239 | "\n", 240 | "首先先下载几个PDF文件" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 9, 246 | "id": "de62c125", 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "- {10.1038/s41592-022-01549-5}\n", 254 | " - 文献Meta信息\n", 255 | "\n", 256 | "- {{2208.06175}}\n", 257 | " - 文献Meta信息和PDF文件\n", 258 | 
"INFO:AutoLiter:正在更新文件 ./note_example.md\n", 259 | "INFO:AutoLiter:需要下载的文献个数 - 2\n", 260 | "100%|█████████████████████████████████████████████| 2/2 [00:17<00:00, 8.84s/it]\n" 261 | ] 262 | } 263 | ], 264 | "source": [ 265 | "note_init()\n", 266 | "!autoliter -i ./ -o pdfs/ -p \"127.0.0.1:7890\"" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 10, 272 | "id": "5251e0fe", 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "- **The Weighting Game: Evaluating Quality of Explainability Methods**. Lassi Raatikainen et.al. **arxiv**, **2022**, ([pdf](pdfs/The_Weighting_Game:_Evaluating_Quality_of_Explainability_Methods.pdf))([link](http://arxiv.org/abs/2208.06175v1)).\r\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "# 查看笔记中的pdf路径\n", 285 | "!cat note_example.md | grep pdf" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 11, 291 | "id": "a2285b64", 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "INFO:AutoLiter:共匹配到 - 1 - 个PDF文件\r\n" 299 | ] 300 | } 301 | ], 302 | "source": [ 303 | "# 移动PDF文件夹\n", 304 | "!mv pdfs/ movedPdfs\n", 305 | "\n", 306 | "# 然后重新链接笔记和 movedPdfs文件夹\n", 307 | "!autoliter -i ./ -m movedPdfs" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 12, 313 | "id": "114d6219", 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "- **The Weighting Game: Evaluating Quality of Explainability Methods**. Lassi Raatikainen et.al. **arxiv**, **2022**, ([pdf](movedPdfs/pdfs/The_Weighting_Game:_Evaluating_Quality_of_Explainability_Methods.pdf))([link](http://arxiv.org/abs/2208.06175v1)).\r\n" 321 | ] 322 | } 323 | ], 324 | "source": [ 325 | "# 查看从新开始链接的文件\n", 326 | "!cat note_example.md | grep pdf" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "id": "92518814", 332 | "metadata": {}, 333 | "source": [ 334 | "## 其它\n", 335 | "关于`-p`代理这个参数,如果人在国外,每次都不用使用,是最方便的。\n", 336 | "\n", 337 | "不然可以在.zashrc (macos) 文件中写入\n", 338 | "```\n", 339 | "# add proxy\n", 340 | "alias setproxy=\"export http_proxy=http://127.0.0.1:7890; export https_proxy=http://127.0.0.1:7890\"\n", 341 | "alias unsetproxy=\"unset http_proxy; unset https_proxy\"\n", 342 | "# add proxy\n", 343 | "```\n", 344 | "这样每次可以在使用`autoliter`之前使用`setproxy`使终端http走代理。然后`autoliter`中`-p`就不用每次都加了。\n", 345 | "\n", 346 | "比如\n", 347 | "```bash \n", 348 | "setproxy\n", 349 | "autoliter -i ./ -o pdfs\n", 350 | "```" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "id": "0f7344ac", 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [] 360 | } 361 | ], 362 | "metadata": { 363 | "kernelspec": { 364 | "display_name": "Python 3 (ipykernel)", 365 | "language": "python", 366 | "name": "python3" 367 | }, 368 | "language_info": { 369 | "codemirror_mode": { 370 | "name": "ipython", 371 | "version": 3 372 | }, 373 | "file_extension": ".py", 374 | "mimetype": "text/x-python", 375 | "name": "python", 376 | "nbconvert_exporter": "python", 377 | "pygments_lexer": "ipython3", 378 | "version": "3.8.13" 379 | } 380 | }, 381 | "nbformat": 4, 382 | "nbformat_minor": 5 383 | } 384 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.11.1 2 | 
feedparser>=6.0.10 3 | urllib3>=1.26.11 4 | requests>=2.28.1 5 | tqdm>=4.64.0 6 | Unidecode>=1.3.4 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('README.md', 'r', encoding='UTF-8') as f: 4 | README_MD = f.read() 5 | 6 | setup( 7 | name="autoliter", 8 | version="0.1.3", 9 | description=" Helps you manage your literature notes", 10 | long_description=README_MD, 11 | long_description_content_type='text/markdown', 12 | url="https://github.com/WilmerWang/autoLiterature", 13 | classifiers=[ 14 | "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", 15 | "Intended Audience :: Science/Research", 16 | "Programming Language :: Python :: 3", 17 | "Topic :: Text Processing :: Markup", 18 | ], 19 | install_requires=["beautifulsoup4>=4.11.1", "feedparser>=6.0.10", 20 | "urllib3>=1.26.11","requests>=2.28.1", 21 | "tqdm>=4.64.0", "Unidecode>=1.3.4"], 22 | entry_points={ 23 | "console_scripts": [ 24 | "autoliter = autoliterature.autoliter:main", 25 | ] 26 | }, 27 | packages=find_packages(), 28 | license="AGPLv3", 29 | author="Wilmer Wang", 30 | author_email="wangwei0206@foxmail.com", 31 | download_url="https://github.com/WilmerWang/autoLiterature/archive/refs/tags/v0.1.3.tar.gz", 32 | keywords=["bibtex", "arxiv", "doi", "science", "scientific-journals"], 33 | ) 34 | --------------------------------------------------------------------------------
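
A minimal usage sketch (hypothetical, not one of the repository files above): the `autoliter` CLI is a thin wrapper around the library functions in `autoliterature/downloads.py` and `autoliterature/utils.py`, and those functions can be driven directly from Python with the same `- {id}` / `- {{id}}` marker convention. The sketch below assumes the package is importable as `autoliterature`; the note text, proxy value, and `pdfs` output folder are made-up placeholders.

```python
# Minimal sketch: drive the library directly instead of going through the autoliter CLI.
# Assumes the package is importable as `autoliterature`; the note text, proxy value,
# and pdf_dir below are made-up placeholders.
import re

from autoliterature.downloads import (get_paper_info_from_paperid,
                                      get_paper_pdf_from_paperid)

note = """
- {10.1038/s41467-022-29269-6}
- {{2208.05623}}
"""

proxy = None      # e.g. "127.0.0.1:1080" when arxiv/sci-hub need a proxy
pdf_dir = "pdfs"  # plays the role of the CLI's -o folder

# Same marker convention as the CLI: {id} downloads metadata only, {{id}} also fetches the PDF.
for marker in re.findall(r'- \{.{3,}\}', note):
    want_pdf = marker.endswith("}}")
    paper_id = marker.split('{')[-1].split('}')[0]

    bib = get_paper_info_from_paperid(paper_id, proxy=proxy)
    if not bib:
        print("No metadata found for", paper_id)
        continue
    print(bib["title"], "|", bib["journal"], "|", bib["year"])

    if want_pdf:
        pdf_name = "_".join(bib["title"].split()) + ".pdf"
        # same filename sanitizing as utils.get_update_content
        pdf_name = re.sub(r"[<>:\"/\\|?*\n\r\x00-\x1F\x7F']", "_", pdf_name)
        get_paper_pdf_from_paperid(paper_id, pdf_dir + "/" + pdf_name,
                                   proxy=proxy, direct_url=bib.get("pdf_link"))
```

This mirrors what `get_update_content` in `utils.py` does before it rewrites a note file with formatted bibliography lines and relative PDF links.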