├── .DS_Store ├── .gitignore ├── LICENSE ├── README.md ├── build ├── .DS_Store └── lib │ └── easy_literature │ ├── DBLP.py │ ├── GoogleScholar.py │ ├── Scholarly.py │ ├── __init__.py │ ├── arxiv.py │ ├── crossref.py │ ├── dblp_source.py │ ├── dlbp.py │ ├── downloads.py │ ├── easyliter.py │ ├── medbiorxiv.py │ ├── pdfs.py │ └── utils.py ├── easy_literature ├── DBLP.py ├── GoogleScholar.py ├── Scholarly.py ├── __init__.py ├── arxiv.py ├── crossref.py ├── dblp_source.py ├── downloads.py ├── easyliter.py ├── medbiorxiv.py ├── pdfs.py └── utils.py ├── easyliter.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt ├── entry_points.txt ├── requires.txt └── top_level.txt ├── figures ├── .DS_Store └── demo.png ├── requirements.txt └── setup.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JinjieNi/EasyLiterature/84ee3e8731430756c3b464d5906c8a1c4378e862/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Jinjie Ni 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EasyLiterature 2 | **EasyLiterature** is a Python-based command line tool for automatic literature management. Welcome star or contribute! 3 | 4 | Simply list the paper titles (or ids) you want to read in a markdown file and it will automatically `collect and refine its information in the markdown file`, `download the pdf to your local machine`, and `link the pdf to your paper in the markdown file`. 
You can keep your notes within the pdfs and mds on your local machine or cloud drive forever. 5 | 6 |
7 | 8 | **A demo of the entries in your markdown note:** 9 | 10 | ![demo](figures/demo.png) 11 | 12 |
13 | 14 | Inspired by [Mu Li](https://www.bilibili.com/video/BV1nA41157y4), adapted from [autoLiterature](https://github.com/wilmerwang/autoLiterature). 15 | Compared to autoLiterature, **EasyLiterature** is much easier to use and supports a wider range of features, such as `title-based paper match`, `paper search and download on Google Scholar and DBLP` (the two main sites for scholars), `citation statistics`, `manual information update assistant`, etc. **EasyLiterature covers almost all papers thanks to the support of Google Scholar and DBLP!** 16 | 17 | ___ 18 | 19 | **中文版介绍:** 20 | 21 | **EasyLiterature** 是一个基于python的命令行文件管理工具,永久开源,欢迎star或contribute。 22 | 23 | 之前沐神(李沐)做过一期视频讲如何阅读文献和整理,我觉得讲得非常好,[链接](https://www.bilibili.com/video/BV1nA41157y4)。EasyLiterature基本基于沐神所述的这一流程实现,并丰富了其他功能。 24 | 25 | 简单来说,在 Markdown 文件中简单列出想要阅读的论文标题(或ID),它会自动收集并在Markdown文件中完善相关信息,下载论文的PDF到本地机器,并将PDF链接到Markdown文件中的论文。通过这样的流程,我们可以实现永久保存实时编辑的论文PDF和Markdown中的笔记,无论是在本地机器还是云端,并且方便论文一站式分类和管理。 26 | 27 |
28 | 29 | **markdown文件中的论文信息条目(示意):** 30 | 31 | ![demo](figures/demo.png) 32 | 33 |
34 | 35 | 与之前的实现相比,EasyLiterature兼容之前实现的所有功能,并且支持更多功能,比如:1. 基于标题的论文匹配;2. Google Scholar和DBLP(全球两大主要paper数据库)的论文搜索和下载;3. 引用统计;4. 手动信息更新助手;5. 容错搜索匹配;等等。之前的实现由于数据库的限制,很多文章都找不到。**EasyLiterature得益于增加了Google Scholar和DBLP的支持,几乎覆盖了所有论文!** 36 | 37 |

38 | 39 | ## 1. A Simple Usage Example (一个简单的使用示例) 40 | 1. Have Python (preferably >= 3.7) installed on your local machine. 41 | 2. Run `pip install easyliter` in your command line to install. 42 | 3. Prepare your markdown note file (e.g., `Note.md`).
**Attention:** You may need to download a markdown editor to create/edit this file. I am using [Typora](https://typora.io/), which is not totally free. You can also choose other alternatives. 43 | 4. List the formatted paper titles in your markdown note file according to Section 4 below (Recognition Rules), e.g.,
44 | \- {{BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.}}
45 | \- {{Xlnet: Generalized autoregressive pretraining for language understanding.}}
46 | **(pay attention to the space after ‘\-’)** 47 | 5. Create a folder to store the downloaded pdfs (e.g., `PDFs/`). 48 | 6. Run `easyliter -i <path to your md file> -o <path to your pdf folder>`. 49 |
(Replace `<path to your md file>` with the actual path to your markdown note file, and `<path to your pdf folder>` with the actual path to your pdf folder) 50 |
e.g., `easyliter -i "/home/Note.md" -o "/home/PDFs"` 51 | 7. You should be able to see the updated information and the downloaded pdf files if no error is reported. 52 | 8. This is a simple and common use case. For other features, please read the sections below carefully and follow the instructions. 53 | 54 |
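For reference, here is a minimal sketch of what one entry may look like before and after running the command above. It follows the entry template easyliter writes (title, first author, venue, year, citation count, pdf link, paper link); the author, venue, citation count, file name, and link shown are illustrative placeholders, not guaranteed output:

```
Before:
- {{BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.}}

After (illustrative values):
- **BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding**. Devlin et.al. **NAACL**, **2019**, **Number of Citations: **NNNN, ([pdf](PDFs/BERT_Pre-training_of_Deep_Bidirectional_Transformers_for_Language_Understanding.pdf))([link](https://arxiv.org/abs/1810.04805)).
```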
55 | 56 | **中文版示例** 57 | 58 | 1. 在您的本地机器上安装 Python(版本 >= 3.7)。 59 | 2. 在命令行中运行 `pip install easyliter` 进行安装。 60 | 3. 准备您的 markdown 笔记文件(例如,`Note.md`)。
**注意**: 您需要下载一个 markdown 编辑器来创建/编辑此文件。我使用的是[Typora](https://typora.io/),它不是完全免费的。您也可以选择其他替代产品。 61 | 4. 根据下面第4节(识别规则)在您的 markdown 笔记文件中列出格式化的论文标题。例如:
62 | \- {{BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.}}
63 | \- {{Xlnet: Generalized autoregressive pretraining for language understanding.}}
64 | **(注意‘\-’后的空格)** 65 | 5. 创建一个文件夹来存储下载的 pdf 文件(例如,`PDFs/`)。 66 | 6. 运行 `easyliter -i <您的 md 文件路径> -o <您的 pdf 文件夹路径>`。 67 |
**注意**:将 `<您的 md 文件路径>` 替换为您 markdown 笔记文件的实际路径,将 `<您的 pdf 文件夹路径>` 替换为您 pdf 文件夹的实际路径。 68 |
例如:`easyliter -i "/home/Note.md" -o "/home/PDFs"` 69 | 7. 如果没有报错,您应该能够看到更新的信息和下载的 pdf 文件。 70 | 8. 这是一个简单、常用的使用案例。有关其他功能或使用情形,请仔细阅读以下部分并按照说明操作。 71 | 72 | ## 2. Install (安装) 73 | ### pip install 74 | ```bash 75 | pip install easyliter 76 | or 77 | pip3 install easyliter 78 | ``` 79 | 80 | ### install from source(to get the up-to-date version) 81 | ```bash 82 | git clone https://github.com/Psycoy/EasyLiterature.git 83 | cd EasyLiterature 84 | pip install -e . 85 | ``` 86 | 87 | ## 3. Arguments(使用参数) 88 | ```bash 89 | easyliter 90 | 91 | optional arguments: 92 | 93 | -h, --help show this help message and exit 94 | 95 | -i INPUT, --input INPUT 96 | The path to the note file or note file folder. 97 | 98 | -o OUTPUT, --output OUTPUT 99 | Folder path to save paper pdfs and images. NOTE: MUST BE FOLDER. 100 | 101 | -p PROXY, --proxy PROXY 102 | The proxy. e.g. 127.0.0.1:1080. If this argument is specified, the google scholar will automatically use a free proxy (not necessarily using the specified proxy address). To use other proxies for google scholar, specify the -gp option. If you want to set up the proxies mannually, change the behaviour in GoogleScholar.set_proxy(). See more at https://scholarly.readthedocs.io/en/stable/ProxyGenerator.html. 103 | 104 | -gp GPROXY_MODE, --gproxy_mode GPROXY_MODE 105 | The proxy type used for scholarly. e.g., free, single, Scraper. (Note: 1. will automatically choose a free proxy address to use, which is free, but may not be fast. 2. will use the proxy address you specify. 3. is not free to use and need to buy the api key.). 106 | 107 | -d, --delete 108 | Delete unreferenced attachments in notes. Use with caution, when used, -i must be a folder path including all notes. 109 | 110 | -m MIGRATION, --migration MIGRATION 111 | The pdf folder path you want to reconnect to. 112 | ``` 113 | 114 | 115 | ## 4. Recognition Rules (识别规则): 116 | - If the notes file contains `- {paper_id}`, it will download the information of that literature, but not the PDF. 117 | - If the notes file contains `- {{paper_id}}`, it will download both the information of that literature and the PDF. 118 | 119 | - Note: `paper_id` supports `article title`, published articles' `doi`, and pre-published articles' `arvix_id`, `biorvix_id`, and `medrvix_id`. It will try all the possible sources online. 120 | 121 | ___ 122 | 123 | - 当笔记文件中包含 `- {paper_id}`时候,会下载该文献的信息,不下载PDF。 124 | - 当笔记文件中包含 `- {{paper_id}}`时候,会下载该文献的信息,以及PDF。 125 | 126 | - 注意:`paper_id` 支持`文章标题`,已发表文章的`doi`, 预发布文章的`arvix_id`, `biorvix_id`, `medrvix_id`。EasyLiterature会从多个数据库自动识别需要收集和下载的论文,几乎覆盖所有目前存在的论文。 127 | 128 | 129 | ## 5. Usage(使用) 130 | ### 5.1. Basic Usage(基本使用) 131 | Assuming `input` is the folder path of the literature notes (.md files) and `output` is the folder path where you want to save the PDFs. 132 | 133 | 假设`input`为文献笔记(md文件)的文件夹路径,`output`为要保存PDF的文件夹路径。 134 | 135 | ```bash 136 | # Update all md files in the input folder 137 | # 更新input文件夹下所有md文件 138 | easyliter -i input -o output 139 | 140 | # Only update the input/example.md file 141 | # 仅更新input/example.md文件 142 | easyliter -i input/example.md -o output 143 | 144 | # -d is an optional flag, when -i is a folder path, using -d will delete unrelated pdf files in the PDF folder from the literature notes content 145 | # -d 是个可选项,当 -i 是文件夹路径时候,使用 -d 会删除PDF文件夹下和文献笔记内容无关的pdf文件 146 | easyliter -i input -o output -d 147 | ``` 148 | 149 | ### 5.2. 
Migrating Notes and PDF Files(笔记和pdf文件的迁移) 150 | When you need to move the literature notes or the PDF folder, the links to the PDFs in the literature notes might become unusable. You can use `-m` to re-link the PDF files with the literature notes. 151 | 152 | 当要移动文献笔记或者PDF文件夹的时候,文献笔记中的PDF链接可能会变的无法使用。可以使用`-m`来重新关联PDF文件和文献笔记。 153 | 154 | ```bash 155 | # Update all md files in the input folder 156 | # 更新input文件夹下所有md文件 157 | easyliter -i input -m movedPDFs/ 158 | 159 | # Only update the input/example.md file 160 | # 仅更新input/example.md文件 161 | easyliter -i input/example.md -m movedPDFs/ 162 | ``` 163 | 164 | ## 6. Note (注意事项) 165 | 166 | 1. For users from China mainland, the Google Scholar feature may need a VPN to get it work (the citation function is based on the Google Scholar). If you don't have a VPN, some features may be lost. 167 | 168 | - 对于来自中国大陆的用户,Google Scholar相关功能可能需要 VPN 才能正常工作(引用功能基于 Google scholar)。如果没有挂VPN,某些功能可能会丢失,但不完全影响使用。 169 | 170 | 2. If your Google Scholar is not working (usually caused by too frequent requests of the Google Scholar API), try to set a proxy for it. Check out the help for `-p` and `-gp` options using `easyliter -h`. See more at the 'Using proxies' section of https://scholarly.readthedocs.io/en/stable/quickstart.html. 171 | 172 | - 如果Google Scholar 无法使用(通常由于对Google Scholar API的访问过于频繁),尝试为其设置代理。使用 easyliter -h 查看 -p 和 -gp 选项的帮助信息来设置代理。详见 https://scholarly.readthedocs.io/en/stable/quickstart.html 的 Using proxies部分。 173 | -------------------------------------------------------------------------------- /build/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JinjieNi/EasyLiterature/84ee3e8731430756c3b464d5906c8a1c4378e862/build/.DS_Store -------------------------------------------------------------------------------- /build/lib/easy_literature/DBLP.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from urllib.request import ProxyHandler 3 | from . import dblp_source as dblp 4 | import pandas as pd 5 | 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger('DBLP') 9 | logger.setLevel(logging.DEBUG) 10 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 11 | 12 | class DBLPInfo(object): 13 | 14 | def set_proxy(self, proxy_address = None): 15 | """set proxy handler 16 | 17 | Aargs: 18 | proxy (str): proxy (str): The proxy adress. e.g 127.0.1:1123 19 | 20 | Returns: 21 | A proxy handler object. 22 | """ 23 | pass 24 | 25 | 26 | def extract_json_info(self, item): 27 | """Extract bib json information from requests.get().json() 28 | 29 | Args: 30 | item (json object): obtained by requests.get().json() 31 | 32 | Returns: 33 | A dict containing the paper information. 
34 | """ 35 | trial_num = 0 36 | while trial_num<10: 37 | trial_num+=1 38 | try: 39 | results = dblp.search([item]) 40 | break 41 | except: 42 | if trial_num == 10: 43 | results = pd.DataFrame({'A' : []}) 44 | else: 45 | pass 46 | 47 | 48 | 49 | if not results.empty: 50 | if 'CoRR' in [str(venue) for venue in results['Where']]: 51 | journal = 'CoRR' 52 | for venue in results['Where']: 53 | if str(venue) != 'CoRR': 54 | journal = str(venue) 55 | break 56 | 57 | str(results['Where']) 58 | bib_dict = { 59 | "title": str(results['Title'][0]), 60 | "author": ' and '.join([str(Entry) for Entry in results['Authors'][0]]), 61 | "journal": journal, 62 | "year": str(results['Year'][0]), 63 | "url": str(results['Link'][0]), 64 | "pdf_link": None, 65 | "cited_count": None 66 | } 67 | else: 68 | bib_dict = None 69 | return bib_dict 70 | 71 | 72 | def get_info_by_title(self, title): 73 | """Get the meta information by the given paper title. 74 | 75 | Args: 76 | doi (str): The paper title 77 | 78 | Returns: 79 | A dict containing the paper information. 80 | { 81 | "title": xxx, 82 | "author": xxx, 83 | "journal": xxx, 84 | etc 85 | } 86 | OR 87 | None 88 | OR 89 | A list [{}, {}, {}] 90 | """ 91 | return self.extract_json_info(title) 92 | 93 | 94 | if __name__ == "__main__": 95 | # arxivId = "2208.05623" 96 | # title = "Heterogeneous Graph Attention Network" 97 | 98 | # gscholar_info = GscholarInfo() 99 | # gscholar_info.set_proxy(proxy_name='single') 100 | 101 | # bib_arxiv = gscholar_info.get_info_by_title(title) 102 | # # bib_title = arxiv_info.get_info_by_title(title) 103 | 104 | # print(bib_arxiv) 105 | # print("\n") 106 | # # print(bib_title) 107 | results = dblp.search(["Finetunedlanguage models are zero-shot learners"]) 108 | 109 | print(results) -------------------------------------------------------------------------------- /build/lib/easy_literature/GoogleScholar.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from scholarly import scholarly, ProxyGenerator 3 | 4 | 5 | logging.basicConfig() 6 | logger = logging.getLogger('GoogleScholar') 7 | logger.setLevel(logging.DEBUG) 8 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 9 | 10 | class GscholarInfo(object): 11 | 12 | def set_proxy(self, proxy_name = "free", proxy_address = None): 13 | """set proxy handler 14 | 15 | Aargs: 16 | proxy (str): proxy (str): The proxy adress. e.g 127.0.1:1123 17 | 18 | Returns: 19 | A proxy handler object. 20 | """ 21 | # TODO find a better proxy strategy 22 | if proxy_address: 23 | sucess = False 24 | pg = ProxyGenerator() 25 | if proxy_name == "free": 26 | sucess = pg.FreeProxies() 27 | elif proxy_name == "single": 28 | sucess = pg.SingleProxy(http = proxy_address, https = proxy_address) 29 | elif proxy_name == "Scraper": 30 | sucess = pg.ScraperAPI('a44bd5be9f56b1be9d6e40116ea4b440') 31 | logger.info(f'Scholarly using {proxy_name} proxy.') 32 | logger.info(f'Proxy setup sucess: {sucess}.') 33 | scholarly.use_proxy(pg) 34 | 35 | 36 | def extract_json_info(self, item): 37 | """Extract bib json information from requests.get().json() 38 | 39 | Args: 40 | item (json object): obtained by requests.get().json() 41 | 42 | Returns: 43 | A dict containing the paper information. 
44 | """ 45 | bib_dict = None 46 | trial_num = 0 47 | 48 | while trial_num<9: 49 | try: 50 | trial_num+=1 51 | pubs_iter = scholarly.search_pubs(item) 52 | dictinfo = next(pubs_iter) 53 | # logger.info(dictinfo) 54 | bib_dict = { 55 | "title": dictinfo['bib']['title'].replace('\n', ''), 56 | "author": ' and '.join(dictinfo['bib']['author']), 57 | "journal": dictinfo['bib']['venue'], 58 | "year": dictinfo['bib']['pub_year'], 59 | "url": dictinfo['pub_url'], 60 | "pdf_link": dictinfo['eprint_url'], 61 | "cited_count": dictinfo['num_citations'] 62 | } 63 | break 64 | except: 65 | pass 66 | 67 | return bib_dict 68 | 69 | 70 | 71 | def get_info_by_title(self, title): 72 | """Get the meta information by the given paper title. 73 | 74 | Args: 75 | doi (str): The paper title 76 | 77 | Returns: 78 | A dict containing the paper information. 79 | { 80 | "title": xxx, 81 | "author": xxx, 82 | "journal": xxx, 83 | etc 84 | } 85 | OR 86 | None 87 | OR 88 | A list [{}, {}, {}] 89 | """ 90 | return self.extract_json_info(title) 91 | 92 | 93 | if __name__ == "__main__": 94 | arxivId = "2208.05623" 95 | title = "Heterogeneous Graph Attention Network" 96 | 97 | gscholar_info = GscholarInfo() 98 | gscholar_info.set_proxy(proxy_name='free') 99 | 100 | bib_arxiv = gscholar_info.get_info_by_title(title) 101 | # bib_title = arxiv_info.get_info_by_title(title) 102 | 103 | print(bib_arxiv) 104 | print("\n") 105 | # print(bib_title) -------------------------------------------------------------------------------- /build/lib/easy_literature/Scholarly.py: -------------------------------------------------------------------------------- 1 | import json 2 | from scholarly import scholarly 3 | from scholarly import ProxyGenerator 4 | 5 | # Set up a ProxyGenerator object to use free proxies 6 | # This needs to be done only once per session 7 | pg = ProxyGenerator() 8 | 9 | sucess = pg.FreeProxies() 10 | # print(f'Proxy setup sucess: {sucess}.') 11 | scholarly.use_proxy(pg) 12 | 13 | # will paginate to the next page by default 14 | pubs_iter = scholarly.search_pubs("1810.04805") 15 | 16 | 17 | print(json.dumps(next(pubs_iter), indent=2)) 18 | -------------------------------------------------------------------------------- /build/lib/easy_literature/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JinjieNi/EasyLiterature/84ee3e8731430756c3b464d5906c8a1c4378e862/build/lib/easy_literature/__init__.py -------------------------------------------------------------------------------- /build/lib/easy_literature/arxiv.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from urllib.request import ProxyHandler 3 | import feedparser 4 | try: 5 | from urllib import quote 6 | except ImportError: 7 | from urllib.parse import quote 8 | from unidecode import unidecode 9 | 10 | from .crossref import crossrefInfo 11 | 12 | 13 | logging.basicConfig() 14 | logger = logging.getLogger('arxiv') 15 | logger.setLevel(logging.DEBUG) 16 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 17 | 18 | class arxivInfo(object): 19 | def __init__(self): 20 | self.base_url = "http://export.arxiv.org/api/query" 21 | 22 | def set_proxy_handler(self, proxy): 23 | """set proxy handler 24 | 25 | Aargs: 26 | proxy (str): proxy (str): The proxy adress. e.g 127.0.1:1123 27 | 28 | Returns: 29 | A proxy handler object. 
30 | """ 31 | proxy_handler = ProxyHandler({"http": f"http://{proxy}", 32 | "https": f"https://{proxy}"}) 33 | return proxy_handler 34 | 35 | 36 | def extract_json_info(self, item): 37 | """Extract bib json information from requests.get().json() 38 | 39 | Args: 40 | item (json object): obtained by requests.get().json() 41 | 42 | Returns: 43 | A dict containing the paper information. 44 | """ 45 | paper_url = item.link 46 | title = item.title 47 | journal = "arxiv" 48 | published = item.published.split("-") 49 | if len(published) > 1: 50 | year = published[0] 51 | else: 52 | year = ' ' 53 | 54 | authors = item.authors 55 | if len(authors) > 0: 56 | first_author = authors[0]["name"].split(" ") 57 | authors = " and ".join([author["name"] for author in authors]) 58 | else: 59 | first_author = authors 60 | authors = authors 61 | 62 | bib_dict = { 63 | "title": title, 64 | "author": authors, 65 | "journal": journal, 66 | "year": year, 67 | "url": paper_url, 68 | "pdf_link": item.link.replace("abs", "pdf")+".pdf", 69 | "cited_count": None 70 | } 71 | 72 | return bib_dict 73 | 74 | 75 | def get_info_by_arxivid(self, arxivId, handler=False): 76 | """Get the meta information by the given paper arxiv_id. 77 | 78 | Args: 79 | doi (str): The arxiv Id 80 | handler (handler object): use proxy 81 | 82 | Returns: 83 | A dict containing the paper information. 84 | { 85 | "title": xxx, 86 | "author": xxx, 87 | "journal": xxx, 88 | etc 89 | } 90 | OR 91 | None 92 | """ 93 | 94 | params = "?search_query=id:"+quote(unidecode(arxivId)) 95 | 96 | try: 97 | if handler: 98 | result = feedparser.parse(self.base_url + params, handlers=[handler]) 99 | else: 100 | result = feedparser.parse(self.base_url + params) 101 | items = result.entries 102 | 103 | item = items[0] 104 | if "arxiv_doi" in item: 105 | doi = item["arxiv_doi"] 106 | 107 | crossref_info = crossrefInfo() 108 | if handler: 109 | crossref_info.set_proxy(proxy=handler.proxies["http"].split('//')[-1]) 110 | return crossref_info.get_info_by_doi(doi) 111 | else: 112 | return self.extract_json_info(item) 113 | except: 114 | logger.error("DOI: {} is error.".format(arxivId)) 115 | 116 | 117 | def get_info_by_title(self, title, field='ti'): 118 | """Get the meta information by the given paper title. 119 | 120 | Args: 121 | doi (str): The paper title 122 | 123 | Returns: 124 | A dict containing the paper information. 
125 | { 126 | "title": xxx, 127 | "author": xxx, 128 | "journal": xxx, 129 | etc 130 | } 131 | OR 132 | None 133 | OR 134 | A list [{}, {}, {}] 135 | """ 136 | params = "?search_query="+field+":"+quote(unidecode(title)) 137 | url = self.base_url + params 138 | try: 139 | result = feedparser.parse(url) 140 | items = result.entries 141 | print(len(items)) 142 | 143 | for i, item in enumerate(items): 144 | 145 | title_item = item.title 146 | try: 147 | title_item = title_item.decode("utf-8") 148 | except: 149 | pass 150 | 151 | item.title = title_item 152 | 153 | if title_item.lower() == title.lower(): 154 | return self.extract_json_info(item) 155 | 156 | items[i] = item 157 | 158 | return [self.extract_json_info(it) for it in items] 159 | except: 160 | logger.error("Title: {} is error.".format(title)) 161 | 162 | 163 | if __name__ == "__main__": 164 | arxivId = "2208.05623" 165 | title = "Heterogeneous Graph Attention Network" 166 | 167 | arxiv_info = arxivInfo() 168 | arxiv_info.set_proxy_handler(proxy="127.0.1:1123") 169 | 170 | bib_arxiv = arxiv_info.get_info_by_arxivid(arxivId) 171 | 172 | print(bib_arxiv) 173 | print("\n") -------------------------------------------------------------------------------- /build/lib/easy_literature/crossref.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import requests 3 | # 4 | # 1. get info by doi 5 | # 2. get info by title 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger('crossref') 9 | logger.setLevel(logging.DEBUG) 10 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 11 | 12 | class crossrefInfo(object): 13 | def __init__(self): 14 | self.sess = requests.Session() 15 | self.sess.headers = HEADERS 16 | self.base_url = "http://api.crossref.org/" 17 | 18 | def set_proxy(self, proxy=None): 19 | """set proxy for session 20 | 21 | Args: 22 | proxy (str): The proxy adress. e.g 127.0.1:1123 23 | Returns: 24 | None 25 | """ 26 | if proxy: 27 | self.sess.proxies = { 28 | "http": proxy, 29 | "https": proxy, } 30 | 31 | 32 | def extract_json_info(self, bib): 33 | """Extract bib json information from requests.get().json() 34 | 35 | Args: 36 | bib (json object): obtained by requests.get().json() 37 | 38 | Returns: 39 | A dict containing the paper information. 40 | """ 41 | pub_date = [str(i) for i in bib['published']["date-parts"][0]] 42 | pub_date = '-'.join(pub_date) 43 | 44 | if 'author' in bib.keys(): 45 | authors = ' and '.join([i["family"]+" "+i['given'] for i in bib['author'] if "family" and "given" in i.keys()]) 46 | else: 47 | authors = "No author" 48 | 49 | if 'short-container-title' in bib.keys(): 50 | try: 51 | journal = bib['short-container-title'][0] 52 | except: 53 | journal = "No journal" 54 | else: 55 | try: 56 | journal = bib['container-title'][0] 57 | except: 58 | journal = "No journal" 59 | 60 | bib_dict = { 61 | "title": bib['title'][0], 62 | "author": authors, 63 | "journal": journal, 64 | "year": pub_date, 65 | "url": bib["URL"], 66 | "pdf_link": bib["link"][0]["URL"], 67 | "cited_count": bib["is-referenced-by-count"] 68 | } 69 | 70 | return bib_dict 71 | 72 | 73 | def get_info_by_doi(self, doi): 74 | """Get the meta information by the given paper DOI number. 75 | 76 | Args: 77 | doi (str): The paper DOI number 78 | 79 | Returns: 80 | A dict containing the paper information. 
81 | { 82 | "title": xxx, 83 | "author": xxx, 84 | "journal": xxx, 85 | etc 86 | } 87 | OR 88 | None 89 | """ 90 | url = "{}works/{}" 91 | url = url.format(self.base_url, doi) 92 | 93 | try: 94 | r = self.sess.get(url) 95 | 96 | bib = r.json()['message'] 97 | return self.extract_json_info(bib) 98 | 99 | except: 100 | logger.error("DOI: {} is error.".format(doi)) 101 | 102 | 103 | def get_info_by_title(self, title): 104 | """Get the meta information by the given paper title. 105 | 106 | Args: 107 | doi (str): The paper title 108 | 109 | Returns: 110 | A dict containing the paper information. 111 | { 112 | "title": xxx, 113 | "author": xxx, 114 | "journal": xxx, 115 | etc 116 | } 117 | OR 118 | None 119 | OR 120 | A list [{}, {}, {}] 121 | """ 122 | url = self.base_url + "works" 123 | params = {"query.bibliographic": title, "rows": 20} 124 | try: 125 | r = self.sess.get(url, params=params) 126 | items = r.json()["message"]["items"] 127 | 128 | for i, item in enumerate(items): 129 | 130 | title_item = item['title'][0] 131 | try: 132 | title_item = title_item.decode("utf-8") 133 | except: 134 | pass 135 | 136 | item["title"][0] = title_item 137 | 138 | if title_item.lower() == title.lower(): 139 | return self.extract_json_info(item) 140 | 141 | items[i] = item 142 | 143 | return [self.extract_json_info(it) for it in items] 144 | except: 145 | logger.error("Title: {} is error.".format(title)) 146 | 147 | 148 | if __name__ == "__main__": 149 | # doi = "10.1016/j.wneu.2012.11.074" 150 | # doi = "10.1093/cercor/bhac266" 151 | doi = "10.1038/s41467-022-29269-6" 152 | # title = "Heterogeneous Graph Attention Network" 153 | # title = "Learning to Copy Coherent Knowledge for Response Generation" 154 | 155 | crossref_info = crossrefInfo() 156 | crossref_info.set_proxy(proxy="127.0.1:1123") 157 | 158 | bib_doi = crossref_info.get_info_by_doi(doi) 159 | # bib_title = crossref_info.get_info_by_title(title) 160 | 161 | print(bib_doi) 162 | print("\n") 163 | # print(bib_title) 164 | -------------------------------------------------------------------------------- /build/lib/easy_literature/dblp_source.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import pandas as pd 3 | import requests 4 | 5 | #options 6 | STRINGS_FOR_TEST = ["Collaborative Writing"] 7 | DBLP_BASE_URL = 'http://dblp.uni-trier.de/' 8 | PUB_SEARCH_URL = DBLP_BASE_URL + "search/publ/" 9 | 10 | 11 | def query_db(pub_string=STRINGS_FOR_TEST): 12 | ''' 13 | returns the BeautifulSoup object of a query to DBLP 14 | 15 | :param pub_string: A list of strings of keywords 16 | :return: BeautifulSoup: A BeautifulSoup Object 17 | ''' 18 | resp = requests.get(PUB_SEARCH_URL, params={'q':pub_string}) 19 | return BeautifulSoup(resp.content) 20 | 21 | def get_pub_data(pub): 22 | ''' 23 | Extracts the information about a publication from a BeautifulSoup object 24 | 25 | :param pub: A BeautifulSoup Object with Publication Information 26 | :return: dict: All Information of this Publication 27 | ''' 28 | ptype = 'nothing' 29 | link = 'nothing' 30 | authors = [] 31 | title = 'nothing' 32 | where = 'nothing' 33 | 34 | if 'year' in pub.get('class'): 35 | # year is not always scrapable, except for this case. 
Might be done more elegantly 36 | return int(pub.contents[0]) 37 | else: 38 | ptype = pub.attrs.get('class')[1] 39 | for content_item in pub.contents: 40 | class_of_content_item = content_item.attrs.get('class', [0]) 41 | if 'data' in class_of_content_item: 42 | for author in content_item.findAll('span', attrs={"itemprop": "author"}): 43 | authors.append(author.text) 44 | title = content_item.find('span', attrs={"class": "title"}).text 45 | for where_data in content_item.findAll('span', attrs={"itemprop": "isPartOf"}): 46 | found_where = where_data.find('span', attrs={"itemprop": "name"}) 47 | if found_where: 48 | where = found_where.text 49 | elif 'publ' in class_of_content_item: 50 | link = content_item.contents[0].find('a').attrs.get('href', "nothing") 51 | 52 | return {'Type': ptype, 53 | 'Link': link, 54 | 'Authors': authors, 55 | 'Title': title, 56 | 'Where': where} 57 | 58 | def search(search_string=STRINGS_FOR_TEST): 59 | ''' 60 | returns the information found in a search query to dblp as a pandas dataframe. 61 | Shows the following information: 62 | - Authors 63 | - Link to Publication 64 | - Title 65 | - Type (Article, Proceedings etc.) 66 | - Where it was published 67 | - Year of publication 68 | :param search_string: A List of Strings of Keywords, that should be searched for 69 | :return: pd.DataFrame: A Dataframe with all data 70 | ''' 71 | soup = query_db(search_string) 72 | pub_list_raw = soup.find("ul", attrs={"class": "publ-list"}) 73 | 74 | pub_list_data = [] 75 | curr_year = 0 76 | for child in pub_list_raw.children: 77 | pub_data = get_pub_data(child) 78 | if type(pub_data) == int: 79 | curr_year = pub_data 80 | else: 81 | pub_data['Year'] = curr_year 82 | pub_list_data.append(pub_data) 83 | 84 | return pd.DataFrame(pub_list_data) 85 | -------------------------------------------------------------------------------- /build/lib/easy_literature/dlbp.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import pandas as pd 3 | import requests 4 | 5 | #options 6 | STRINGS_FOR_TEST = ["Collaborative Writing"] 7 | DBLP_BASE_URL = 'http://dblp.uni-trier.de/' 8 | PUB_SEARCH_URL = DBLP_BASE_URL + "search/publ/" 9 | 10 | 11 | def query_db(pub_string=STRINGS_FOR_TEST): 12 | ''' 13 | returns the BeautifulSoup object of a query to DBLP 14 | 15 | :param pub_string: A list of strings of keywords 16 | :return: BeautifulSoup: A BeautifulSoup Object 17 | ''' 18 | resp = requests.get(PUB_SEARCH_URL, params={'q':pub_string}) 19 | return BeautifulSoup(resp.content) 20 | 21 | def get_pub_data(pub): 22 | ''' 23 | Extracts the information about a publication from a BeautifulSoup object 24 | 25 | :param pub: A BeautifulSoup Object with Publication Information 26 | :return: dict: All Information of this Publication 27 | ''' 28 | ptype = 'nothing' 29 | link = 'nothing' 30 | authors = [] 31 | title = 'nothing' 32 | where = 'nothing' 33 | 34 | if 'year' in pub.get('class'): 35 | # year is not always scrapable, except for this case. 
Might be done more elegantly 36 | return int(pub.contents[0]) 37 | else: 38 | ptype = pub.attrs.get('class')[1] 39 | for content_item in pub.contents: 40 | class_of_content_item = content_item.attrs.get('class', [0]) 41 | if 'data' in class_of_content_item: 42 | for author in content_item.findAll('span', attrs={"itemprop": "author"}): 43 | authors.append(author.text) 44 | title = content_item.find('span', attrs={"class": "title"}).text 45 | for where_data in content_item.findAll('span', attrs={"itemprop": "isPartOf"}): 46 | found_where = where_data.find('span', attrs={"itemprop": "name"}) 47 | if found_where: 48 | where = found_where.text 49 | elif 'publ' in class_of_content_item: 50 | link = content_item.contents[0].find('a').attrs.get('href', "nothing") 51 | 52 | return {'Type': ptype, 53 | 'Link': link, 54 | 'Authors': authors, 55 | 'Title': title, 56 | 'Where': where} 57 | 58 | def search(search_string=STRINGS_FOR_TEST): 59 | ''' 60 | returns the information found in a search query to dblp as a pandas dataframe. 61 | Shows the following information: 62 | - Authors 63 | - Link to Publication 64 | - Title 65 | - Type (Article, Proceedings etc.) 66 | - Where it was published 67 | - Year of publication 68 | :param search_string: A List of Strings of Keywords, that should be searched for 69 | :return: pd.DataFrame: A Dataframe with all data 70 | ''' 71 | soup = query_db(search_string) 72 | pub_list_raw = soup.find("ul", attrs={"class": "publ-list"}) 73 | 74 | pub_list_data = [] 75 | curr_year = 0 76 | for child in pub_list_raw.children: 77 | pub_data = get_pub_data(child) 78 | if type(pub_data) == int: 79 | curr_year = pub_data 80 | else: 81 | pub_data['Year'] = curr_year 82 | pub_list_data.append(pub_data) 83 | 84 | return pd.DataFrame(pub_list_data) 85 | -------------------------------------------------------------------------------- /build/lib/easy_literature/downloads.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import os 4 | import platform 5 | 6 | from .arxiv import arxivInfo 7 | from .crossref import crossrefInfo 8 | from .medbiorxiv import BMxivInfo 9 | from .GoogleScholar import GscholarInfo 10 | from .DBLP import DBLPInfo 11 | from .pdfs import pdfDownload 12 | 13 | # log config 14 | logging.basicConfig() 15 | logger = logging.getLogger('Downloads') 16 | logger.setLevel(logging.INFO) 17 | 18 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 19 | 20 | 21 | 22 | def check_string(re_exp, str): 23 | res = re.match(re_exp, str) 24 | if res: 25 | return True 26 | else: 27 | return False 28 | 29 | def classify(identifier): 30 | """ 31 | Classify the type of paper_id: 32 | arxivId - arxivId 33 | doi - digital object identifier 34 | medbiorxivId - medrxiv or biorxiv id 35 | title - title 36 | """ 37 | if check_string(r'10\.(?!1101)[0-9]{4}/\.*', identifier): 38 | return 'doi' 39 | elif check_string(r'10\.1101/\.*', identifier): 40 | return "medbiorxivId" 41 | elif check_string(r'[0-9]{2}[0-1][0-9]\.[0-9]{3,}.*', identifier) or check_string(r'.*/[0-9]{2}[0-1][0-9]{4}', identifier): 42 | return 'arxivId' 43 | elif check_string(r'[a-zA-Z\d\.-/\s]*', identifier): 44 | return 'title' 45 | else: 46 | return "unrecognized" 47 | 48 | def get_paper_info_from_paperid(paper_id, proxy=None, gproxy_mode='free'): 49 | id_type = classify(paper_id) 50 | 51 | if id_type == "doi": 52 | logger.info('ID type: doi.') 53 | downloader = crossrefInfo() 54 | if proxy: 55 | 
downloader.set_proxy(proxy=proxy) 56 | bib_dict = downloader.get_info_by_doi(paper_id) 57 | 58 | elif id_type == "arxivId": 59 | logger.info('ID type: arixiv.') 60 | downloader = arxivInfo() 61 | if proxy: 62 | downloader.set_proxy_handler(proxy=proxy) 63 | bib_dict = downloader.get_info_by_arxivid(paper_id) 64 | 65 | elif id_type == "medbiorxivId": 66 | logger.info('ID type: medbiorxivId.') 67 | downloader = BMxivInfo() 68 | if proxy: 69 | downloader.set_proxy(proxy=proxy) 70 | bib_dict = downloader.get_info_by_bmrxivid(paper_id) 71 | 72 | elif id_type == "title": 73 | logger.info('ID type: title.') 74 | downloader1 = GscholarInfo() 75 | downloader1.set_proxy(proxy_name=gproxy_mode, proxy_address=proxy) 76 | bib_dict = downloader1.get_info_by_title(paper_id) 77 | 78 | downloader2 = DBLPInfo() 79 | downloader2.set_proxy(proxy_address=proxy) 80 | bib_dict1 = downloader2.get_info_by_title(paper_id) 81 | 82 | logger.info(f'The Google scholar bib: {bib_dict}; The DLBP bib: {bib_dict1}.') 83 | 84 | if bib_dict is not None and bib_dict1 is not None: 85 | bib_dict['journal'] = bib_dict1['journal'] 86 | elif bib_dict is None and bib_dict1 is not None: 87 | bib_dict = bib_dict1 88 | elif bib_dict is None and bib_dict1 is None: 89 | logger.info('Title not found on DLBP and Google scholar.') 90 | else: 91 | pass 92 | 93 | try: 94 | return bib_dict 95 | except: 96 | pass 97 | 98 | 99 | def get_paper_pdf_from_paperid(paper_id, path, proxy=None, direct_url=None): 100 | pdf_downloader = pdfDownload() 101 | if proxy: 102 | pdf_downloader.set_proxy(proxy=proxy) 103 | 104 | if direct_url: 105 | content = pdf_downloader.get_pdf_from_direct_url(direct_url) 106 | if not content: 107 | content = pdf_downloader.get_pdf_from_sci_hub(paper_id) 108 | else: 109 | content = pdf_downloader.get_pdf_from_sci_hub(paper_id) 110 | try: 111 | system = platform.system() 112 | if system == 'Windows': 113 | path = path.replace("/", "\\") 114 | pdf_dir = path.rsplit("\\", 1)[0] 115 | else: 116 | pdf_dir = path.rsplit("/", 1)[0] 117 | if not os.path.exists(pdf_dir): 118 | os.makedirs(pdf_dir) 119 | pdf_downloader._save(content['pdf'], path) 120 | except: 121 | pass 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /build/lib/easy_literature/easyliter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import argparse 3 | import os 4 | 5 | from .utils import patternRecognizer, note_modified, get_pdf_paths, get_pdf_paths_from_notes, get_update_content, get_pdf_paths_from_notes_dict 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger('easyliter') 9 | logger.setLevel(logging.INFO) 10 | 11 | 12 | 13 | 14 | def set_args(): 15 | parser = argparse.ArgumentParser(description='EasyLiterature') 16 | parser.add_argument('-i', '--input', required=True, type=str, default=None, 17 | help="The path to the note file or note file folder.") 18 | parser.add_argument('-o', '--output', type=str, default=None, 19 | help='Folder path to save paper pdfs and images. NOTE: MUST BE FOLDER.') 20 | parser.add_argument('-p', '--proxy', type=str, default=None, 21 | help='The proxy address. e.g. 127.0.0.1:1080. If this argument is specified, the google scholar will automatically use a free proxy (not necessarily using the specified proxy address). To use other proxies for google scholar, specify the -gp option. If you want to set up the proxies mannually, change the behaviour in GoogleScholar.set_proxy(). 
See more at https://scholarly.readthedocs.io/en/stable/ProxyGenerator.html.') 22 | parser.add_argument('-gp', '--gproxy_mode', type=str, default='free', 23 | help='The proxy type used for scholarly. e.g., free, single, Scraper. (Note: 1. will automatically choose a free proxy address to use, which is free, but may not be fast. 2. will use the proxy address you specify. 3. is not free to use and need to buy the api key.).') 24 | parser.add_argument('-d', '--delete', action='store_true', 25 | help='Delete unreferenced attachments in notes. Use with caution, ' 26 | 'when used, -i must be a folder path including all notes.') 27 | parser.add_argument('-m', '--migration', type=str, default=None, 28 | help="The pdf folder path you want to reconnect to.") 29 | args = parser.parse_args() 30 | 31 | return args 32 | 33 | def check_args(): 34 | args = set_args() 35 | input_path = args.input 36 | output_path = args.output 37 | delete_bool = args.delete 38 | migration_path = args.migration 39 | proxy = args.proxy 40 | gproxy_mode = args.gproxy_mode 41 | 42 | return input_path, output_path, delete_bool, proxy, migration_path, gproxy_mode 43 | 44 | 45 | def get_bib_and_pdf(note_file, output_path, proxy, paper_recognizer, gproxy_mode): 46 | 47 | pdfs_path = output_path 48 | if not os.path.exists(pdfs_path): 49 | os.makedirs(pdfs_path) 50 | 51 | with open(note_file, 'r') as f: 52 | content = f.read() 53 | 54 | m = paper_recognizer.findall(content) 55 | logger.info("Number of files to download - {}".format(len(m))) 56 | 57 | if not m: 58 | logger.info("The file {} is not found, or there is no valid entry in the file.".format(note_file)) 59 | else: 60 | replace_dict = get_update_content(m, note_file, pdfs_path, proxy=proxy, gproxy_mode=gproxy_mode) 61 | 62 | return replace_dict 63 | 64 | 65 | def file_update(input_path, output_path, proxy, paper_recognizer, gproxy_mode): 66 | 67 | replace_dict = get_bib_and_pdf(input_path, output_path, 68 | proxy, paper_recognizer, gproxy_mode) 69 | 70 | if replace_dict: 71 | note_modified(paper_recognizer, input_path, **replace_dict) 72 | 73 | 74 | def main(): 75 | input_path, output_path, delete_bool, proxy, migration_path, gproxy_mode = check_args() 76 | 77 | if output_path: 78 | paper_recognizer = patternRecognizer(r'- \{.{3,}\}') 79 | 80 | if os.path.isfile(input_path): 81 | logger.info("Updating the file {}".format(input_path)) 82 | file_update(input_path, output_path, proxy, paper_recognizer, gproxy_mode) 83 | 84 | elif os.path.isdir(input_path): 85 | note_paths = [] 86 | for root, _, files in os.walk(input_path): 87 | for file in files: 88 | if file.lower().endswith('md') or file.lower().endswith('markdown'): 89 | note_paths.append(os.path.join(root, file)) 90 | for note_path in note_paths: 91 | logger.info("Updating the file {}".format(note_path)) 92 | file_update(note_path, output_path, proxy, paper_recognizer, gproxy_mode) 93 | else: 94 | logger.info("input path {} does not exist".format(input_path)) 95 | 96 | 97 | # Delete unreferenced attachments 98 | if delete_bool: 99 | if os.path.isfile(input_path): 100 | logger.info("To delete the PDF entities unrelated to the notes, the input path must be the main notes folder!!! 
Please use this parameter with caution!!!") 101 | else: 102 | pdf_path_recognizer = patternRecognizer(r'\[pdf\]\(.{5,}\.pdf\)') 103 | pdf_paths_in_notes = get_pdf_paths_from_notes(input_path, pdf_path_recognizer) 104 | pdf_paths = get_pdf_paths(output_path) 105 | # TODO the path between mac and win could be different,“/” 和 “\\” 106 | pdf_paths_in_notes = [os.path.abspath(i).replace('\\', '/') for i in pdf_paths_in_notes] 107 | pdf_paths = [os.path.abspath(i).replace('\\', '/') for i in pdf_paths] 108 | 109 | removed_pdf_paths = list(set(pdf_paths) - set(pdf_paths_in_notes)) 110 | try: 111 | for pdf_p in removed_pdf_paths: 112 | os.remove(pdf_p) 113 | except: 114 | pass 115 | 116 | logger.info("Deleted {} files".format(len(removed_pdf_paths))) 117 | 118 | 119 | if migration_path: 120 | pdf_path_recognizer = patternRecognizer(r'\[pdf\]\(.{5,}\.pdf\)') 121 | 122 | pdf_paths = get_pdf_paths(migration_path) 123 | pdf_paths_in_notes = get_pdf_paths_from_notes_dict(input_path, pdf_path_recognizer) 124 | 125 | # match based on paper title 126 | matched_numb = 0 127 | pdf_paths_dict = {os.path.basename(i): i for i in pdf_paths} 128 | for md_file, pdf_paths_ in pdf_paths_in_notes.items(): 129 | 130 | pdf_paths_in_notes_dict = {os.path.basename(i): i for i in pdf_paths_} 131 | matched_pdfs = pdf_paths_dict.keys() & pdf_paths_in_notes_dict.keys() 132 | 133 | matched_numb += len(matched_pdfs) 134 | 135 | replace_paths_dict = {} 136 | for matched in matched_pdfs: 137 | replaced_str = os.path.relpath(pdf_paths_dict[matched], md_file).split('/',1)[-1] 138 | replaced_str = "[pdf]({})".format(replaced_str) 139 | ori_str = "[pdf]({})".format(pdf_paths_in_notes_dict[matched]) 140 | replace_paths_dict[ori_str] = replaced_str 141 | 142 | if replace_paths_dict: 143 | note_modified(pdf_path_recognizer, md_file, **replace_paths_dict) 144 | 145 | logger.info("Found - {} - pdf files".format(matched_numb)) 146 | 147 | 148 | if not output_path and not migration_path: 149 | logger.info("lacking the arguments -o or -m, use -h to see the help") 150 | 151 | 152 | if __name__ == "__main__": 153 | main() -------------------------------------------------------------------------------- /build/lib/easy_literature/medbiorxiv.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import requests 3 | from bs4 import BeautifulSoup 4 | 5 | from .crossref import crossrefInfo 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger('biorxiv') 9 | logger.setLevel(logging.DEBUG) 10 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 11 | 12 | class BMxivInfo(object): 13 | def __init__(self): 14 | self.sess = requests.Session() 15 | self.sess.headers = HEADERS 16 | self.base_url = "https://api.biorxiv.org/details/" 17 | self.servers = ["biorxiv", "medrxiv"] 18 | 19 | 20 | def set_proxy(self, proxy=False): 21 | """set proxy for session 22 | 23 | Args: 24 | proxy (str): The proxy adress. e.g 127.0.1:1123 25 | Returns: 26 | None 27 | """ 28 | if proxy: 29 | self.sess.proxies = { 30 | "http": proxy, 31 | "https": proxy, } 32 | 33 | 34 | def extract_json_info(self, item): 35 | """Extract bib json information from requests.get().json() 36 | 37 | Args: 38 | item (json object): obtained by requests.get().json() 39 | 40 | Returns: 41 | A dict containing the paper information. 
42 | """ 43 | paper_url = f"https://www.biorxiv.org/content/{item['doi']}" 44 | title = item["title"] 45 | journal = item["server"] 46 | published = item["date"].split('-') 47 | if len(published) > 1: 48 | year = published[0] 49 | else: 50 | year = ' ' 51 | 52 | authors = item['authors'].split("; ") 53 | if len(authors) > 0: 54 | authors = " and ".join([author for author in authors]) 55 | else: 56 | authors = authors 57 | 58 | bib_dict = { 59 | "title": title, 60 | "author": authors, 61 | "journal": journal, 62 | "year": year, 63 | "url": paper_url, 64 | "pdf_link": f"{paper_url}.full.pdf", 65 | "cited_count": None 66 | } 67 | 68 | return bib_dict 69 | 70 | 71 | def get_info_by_bmrxivid(self, bmrxivid): 72 | """Get the meta information by the given paper biorxiv_id or medrxiv_id. 73 | 74 | Args: 75 | doi (str): The biorxiv or medrxiv Id 76 | 77 | Returns: 78 | A dict containing the paper information. 79 | { 80 | "title": xxx, 81 | "author": xxx, 82 | "journal": xxx, 83 | etc 84 | } 85 | OR 86 | None 87 | """ 88 | urls = [self.base_url + server + "/" + bmrxivid for server in self.servers] 89 | for url in urls: 90 | try: 91 | r = self.sess.get(url) 92 | 93 | bib = r.json()['collection'][-1] 94 | 95 | if "published" in bib.keys() and bib['published'] != "NA": 96 | doi = bib["published"] 97 | print(doi) 98 | crossref_info = crossrefInfo() 99 | if len(self.sess.proxies) > 0: 100 | crossref_info.set_proxy(self.sess.proxies['http'].split('//')[-1]) 101 | return crossref_info.get_info_by_doi(doi) 102 | 103 | return self.extract_json_info(bib) 104 | 105 | except: 106 | logger.error("DOI: {} is error.".format(bmrxivid)) 107 | 108 | 109 | def get_info_by_title(self, title): 110 | """Get the meta information by the given paper title. 111 | 112 | Args: 113 | doi (str): The paper title 114 | 115 | Returns: 116 | A dict containing the paper information. 
117 | { 118 | "title": xxx, 119 | "author": xxx, 120 | "journal": xxx, 121 | etc 122 | } 123 | OR 124 | None 125 | OR 126 | A list [{}, {}, {}] 127 | """ 128 | base_url = "https://www.biorxiv.org/search/{}%20jcode%3Amedrxiv%7C%7Cbiorxiv%20numresults%3A25%20\sort%3Arelevance-rank%20\format_result%3Astandard" 129 | query = title.replace(' ', '%252B') 130 | 131 | url = base_url.format(query) 132 | try: 133 | result = self.sess.get(url) 134 | soup = BeautifulSoup(result.content, "lxml") 135 | soup_items = soup.find_all("div",class_="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix") 136 | 137 | soup_dict = dict() 138 | for sp in soup_items: 139 | key = sp.find("a", class_="highwire-cite-linked-title").span.text 140 | value = sp.find("span", class_="highwire-cite-metadata-doi highwire-cite-metadata").text.split("org/")[-1].split("v")[0].replace(" ", "") 141 | soup_dict[key] = value 142 | 143 | for item_title, item_doi in soup_dict.items(): 144 | try: 145 | item_title = item_title.decode("utf-8") 146 | except: 147 | pass 148 | 149 | if item_title.lower() == title.lower(): 150 | return self.get_info_by_bmrxivid(item_doi) 151 | 152 | return [self.get_info_by_bmrxivid(it) for it in soup_dict.values()] 153 | except: 154 | logger.error("Title: {} is error.".format(title)) 155 | 156 | 157 | if __name__ == "__main__": 158 | 159 | arxivId = "10.1101/2022.07.28.22277637" 160 | # title = "Oxygen restriction induces a viable but non-culturable population in bacteria" 161 | # title = "A molecular atlas of the human postmenopausal fallopian tube and ovary from single-cell RNA and ATAC sequencing" 162 | # title = "Radiographic Assessment of Lung Edema (RALE) Scores are Highly Reproducible and Prognostic of Clinical Outcomes for Inpatients with COVID-19" 163 | # title = "Untargeted metabolomics of COVID-19 patient serum reveals potential prognostic markers of both severity and outcome" 164 | 165 | arxiv_info = BMxivInfo() 166 | arxiv_info.set_proxy(proxy="127.0.1:1123") 167 | 168 | bib_arxiv = arxiv_info.get_info_by_bmrxivid(arxivId) 169 | # bib_title = arxiv_info.get_info_by_title(title) 170 | 171 | print(bib_arxiv) 172 | print("\n") 173 | # print(bib_title) -------------------------------------------------------------------------------- /build/lib/easy_literature/pdfs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import requests 3 | from urllib.parse import urlunsplit, urlsplit 4 | from bs4 import BeautifulSoup 5 | 6 | logging.basicConfig() 7 | logger = logging.getLogger('PDFs') 8 | logger.setLevel(logging.DEBUG) 9 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 10 | 11 | 12 | class pdfDownload(object): 13 | def __init__(self): 14 | self.sess = requests.Session() 15 | self.sess.headers = HEADERS 16 | 17 | def set_proxy(self, proxy=None): 18 | """set proxy for session 19 | 20 | Args: 21 | proxy (str): The proxy adress. e.g 127.0.1:1123 22 | Returns: 23 | None 24 | """ 25 | if proxy: 26 | self.sess.proxies = { 27 | "http": proxy, 28 | "https": proxy, } 29 | 30 | 31 | def _get_available_scihub_urls(self): 32 | ''' 33 | Finds available scihub urls via https://lovescihub.wordpress.com/ or 34 | https://sci-hub.now.sh/ 35 | ''' 36 | urls = [] 37 | res = self.sess.get('https://lovescihub.wordpress.com/') 38 | s = BeautifulSoup(res.content, 'html.parser') 39 | for a in s.find('div', class_="entry-content").find_all('a', href=True): 40 | if 'sci-hub.' 
in a['href']: 41 | urls.append(a['href']) 42 | return urls 43 | 44 | 45 | def fetch(self, url, auth=None): 46 | '''Fetch pdf 47 | 48 | Args: 49 | url (str): 50 | 51 | Returns: 52 | A dict OR None 53 | ''' 54 | try: 55 | r = self.sess.get(url, auth=auth) 56 | 57 | if r.headers["Content-Type"] != "application/pdf": 58 | logger.info("Failed to fetch pdf with url: {}".format(url)) 59 | else: 60 | return { 61 | 'pdf': r.content, 62 | 'url': url 63 | } 64 | except: 65 | logger.error("Failed to open url: {}".format(url)) 66 | 67 | 68 | def get_pdf_from_direct_url(self, url, auth=None): 69 | return self.fetch(url, auth=auth) 70 | 71 | 72 | def get_pdf_from_sci_hub(self, identifier, auth=None): 73 | '''Fetch pdf from sci-hub based on doi or url 74 | 75 | Args: 76 | identifier (str): DOI or url 77 | auth (tuple): ("user", "passwd") 78 | 79 | Returns: 80 | A dict OR None 81 | ''' 82 | for base_url in self._get_available_scihub_urls(): 83 | r = self.sess.get(base_url + '/' + identifier, auth=auth) 84 | soup = BeautifulSoup(r.content, 'html.parser') 85 | 86 | pdf_div_names = ['iframe', 'embed'] 87 | for pdf_div_name in pdf_div_names: 88 | pdf_div = soup.find(pdf_div_name) 89 | if pdf_div != None: 90 | break 91 | try: 92 | url_parts = urlsplit(pdf_div.get('src')) 93 | if url_parts[1]: 94 | if url_parts[0]: 95 | pdf_url = urlunsplit((url_parts[0], url_parts[1], url_parts[2], '', '')) 96 | else: 97 | pdf_url = urlunsplit(('https', url_parts[1], url_parts[2], '', '')) 98 | else: 99 | pdf_url = urlunsplit(('https', urlsplit(base_url)[1], url_parts[2], '', '')) 100 | 101 | return self.fetch(pdf_url, auth) 102 | except: 103 | pass 104 | 105 | logger.info("Failed to fetch pdf with all sci-hub urls") 106 | 107 | def _save(self, content, path): 108 | with open(path, "wb") as f: 109 | f.write(content) 110 | 111 | 112 | if __name__ == "__main__": 113 | doi = "10.1145/3308558.3313562" 114 | 115 | pdf_download = pdfDownload() 116 | pdf_download.set_proxy("127.0.1:1123") 117 | 118 | pdf_dict = pdf_download.get_pdf_from_sci_hub(doi) 119 | if pdf_dict: 120 | print(pdf_dict['url']) 121 | pdf_download.download(pdf_dict['pdf'] ,"/home/admin/tmp.pdf") 122 | 123 | # pdf_dict2 = pdf_download.get_pdf_from_direct_url("https://arxiv.org/pdf/2208.05419.pdf") 124 | # if pdf_dict2: 125 | # print(pdf_dict2['url']) 126 | # pdf_download.download(pdf_dict2['pdf'] ,"/home/admin/tmp2.pdf") 127 | 128 | -------------------------------------------------------------------------------- /build/lib/easy_literature/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import re 4 | from tqdm import tqdm 5 | from .downloads import get_paper_info_from_paperid, get_paper_pdf_from_paperid, classify 6 | 7 | 8 | logging.basicConfig() 9 | logger = logging.getLogger('utils') 10 | logger.setLevel(logging.INFO) 11 | 12 | 13 | class patternRecognizer(object): 14 | def __init__(self, regular_rule): 15 | self.pattern = re.compile(regular_rule) 16 | 17 | def match(self, string): 18 | return self.pattern.match(string) 19 | 20 | def findall(self, string): 21 | return self.pattern.findall(string) 22 | 23 | def multiple_replace(self, content, **replace_dict): 24 | def replace_(value): 25 | match = value.group() 26 | if match in replace_dict.keys(): 27 | return replace_dict[match] 28 | else: 29 | return match+" **Not Correct, Check it. 
Maybe mannual update & download is needed.**" 30 | 31 | replace_content = self.pattern.sub(replace_, content) 32 | 33 | return replace_content 34 | 35 | 36 | def note_modified(pattern_recog, md_file, **replace_dict): 37 | with open(md_file, 'r') as f: 38 | content = f.read() 39 | 40 | replaced_content = pattern_recog.multiple_replace(content, **replace_dict) 41 | 42 | with open(md_file, 'w') as f: 43 | f.write(''.join(replaced_content)) 44 | 45 | 46 | def get_pdf_paths(pdf_root): 47 | pdf_paths = [] 48 | for root, _, files in os.walk(pdf_root): 49 | for file in files: 50 | if file.lower().endswith('.pdf'): 51 | pdf_paths.append(os.path.join(root, file)) 52 | 53 | return pdf_paths 54 | 55 | 56 | def get_pdf_paths_from_notes(md_root, reg): 57 | 58 | md_files = [] 59 | for root, _, files in os.walk(md_root): 60 | for file in files: 61 | if file.lower().endswith('md') or file.lower().endswith('markdown'): 62 | md_files.append(os.path.join(root, file)) 63 | 64 | pdf_paths_from_notes = [] 65 | for md_file in md_files: 66 | with open(md_file, 'r') as f: 67 | content = f.read() 68 | m = reg.findall(content) 69 | m = [i.split("(")[-1].split(')')[0] for i in m] 70 | pdf_paths_from_notes.extend(m) 71 | 72 | return pdf_paths_from_notes 73 | 74 | 75 | def get_pdf_paths_from_notes_dict(md_root, reg): 76 | pdf_paths_from_notes_dict = {} 77 | if os.path.isdir(md_root): 78 | md_files = [] 79 | for root, _, files in os.walk(md_root): 80 | for file in files: 81 | if file.lower().endswith('md') or file.lower().endswith('markdown'): 82 | md_files.append(os.path.join(root, file)) 83 | 84 | for md_file in md_files: 85 | with open(md_file, 'r') as f: 86 | content = f.read() 87 | m = reg.findall(content) 88 | m = [i.split("(")[-1].split(')')[0] for i in m] 89 | pdf_paths_from_notes_dict[md_file] = m 90 | else: 91 | with open(md_root, 'r') as f: 92 | content = f.read() 93 | m = reg.findall(content) 94 | m = [i.split("(")[-1].split(')')[0] for i in m] 95 | pdf_paths_from_notes_dict[md_root] = m 96 | 97 | return pdf_paths_from_notes_dict 98 | 99 | 100 | def classify_identifier(identifier): 101 | """Not need to download PDF file 102 | """ 103 | if identifier.endswith("}}"): 104 | return True 105 | else: 106 | return False 107 | 108 | 109 | def get_update_content(m, note_file, pdfs_path, proxy, gproxy_mode): 110 | 111 | replace_dict = dict() 112 | for literature in tqdm(m): 113 | pdf = classify_identifier(literature) 114 | 115 | literature_id = literature.split('{')[-1].split('}')[0] 116 | bib = get_paper_info_from_paperid(literature_id, proxy=proxy, gproxy_mode=gproxy_mode) 117 | 118 | if bib: 119 | try: 120 | pdf_name = bib['title'] 121 | # remove blank symbol, like \n, \t, \r 122 | pdf_name = re.sub(r'[\n\t\r]', '', pdf_name) 123 | # remove multiple blank spaces 124 | pdf_name = re.sub(r' +', ' ', pdf_name) 125 | pdf_name = re.sub(r'[.]', '', pdf_name) 126 | 127 | pdf_name = '_'.join(pdf_name.split(' ')) + '.pdf' 128 | 129 | # remove the special characters in the pdf name: / \ : * ? 
" < > | 130 | pdf_name = re.sub(r'[\\/:*?"<>|]', '', pdf_name) 131 | pdf_path = os.path.join(pdfs_path, pdf_name) 132 | 133 | logger.info(f"The pdf path to be saved: {pdf_path}") 134 | if pdf: 135 | id_type = classify(literature_id) 136 | if id_type == "title": 137 | for pattern_str in [r'10\.(?!1101)[0-9]{4}/', r'10\.1101/', r'[0-9]{2}[0-1][0-9]\.[0-9]{3,}', r'.*/[0-9]{2}[0-1][0-9]{4}']: 138 | res = re.search(pattern_str, bib['url']) # search for the arxiv id in the url 139 | if res: 140 | literature_id = res.group(0) 141 | if bib['pdf_link'] is None: 142 | bib['pdf_link'] = f'https://arxiv.org/pdf/{literature_id}.pdf' 143 | logger.info(f"The paper's arxiv url: {bib['url']}; The converted arxiv id: {literature_id}; The pdf link: {bib['pdf_link']}.") 144 | if not os.path.exists(pdf_path): 145 | logger.info(f"PDF link: {bib['pdf_link']}") 146 | get_paper_pdf_from_paperid(literature_id, pdf_path, direct_url=bib['pdf_link'], proxy=proxy) 147 | if not os.path.exists(pdf_path): 148 | get_paper_pdf_from_paperid(literature_id, pdf_path, proxy=proxy) 149 | else: 150 | if not os.path.exists(pdf_path): 151 | logger.info(f"PDF link: {bib['pdf_link']}") 152 | get_paper_pdf_from_paperid(literature_id, pdf_path, direct_url=bib['pdf_link'], proxy=proxy) 153 | if not os.path.exists(pdf_path): 154 | get_paper_pdf_from_paperid(literature_id, pdf_path, proxy=proxy) 155 | if os.path.exists(pdf_path): 156 | replaced_literature = "- **{}**. {} et.al. **{}**, **{}**, **Number of Citations: **{}, ([pdf]({}))([link]({})).".format( 157 | bib['title'], bib["author"].split(" and ")[0], bib['journal'], 158 | bib['year'], bib['cited_count'], os.path.relpath(pdf_path, note_file).split('/',1)[-1], 159 | bib['url']) 160 | else: 161 | logger.info("Can not find a downloading source for literature id {}. You may need to manually download this paper, a template has been generated in the markdown file. Put the pdf file in the folder you specified just now and add its name in the '(pdf)' of your markdown entry.".format(literature_id)) 162 | replaced_literature = "- **{}**. {} et.al. **{}**, **{}**, **Number of Citations: **{}, ([pdf]({}))([link]({})).".format( 163 | bib['title'], bib["author"].split(" and ")[0], bib['journal'], 164 | bib['year'], bib['cited_count'], f'{pdfs_path}/your_pdf_name.pdf', bib['url'] 165 | ) 166 | replace_dict[literature] = replaced_literature 167 | except: 168 | 169 | logger.info("Can not find a downloading source for literature id {}. You may need to manually download this paper, a template has been generated in the markdown file. Put the pdf file in the folder you specified just now and add its name in the '(pdf)' of your markdown entry.".format(literature_id)) 170 | replaced_literature = "- **{}**. {} et.al. **{}**, **{}**, **Number of Citations: **{}, ([pdf]({}))([link]({})).".format( 171 | bib['title'], bib["author"].split(" and ")[0], bib['journal'], 172 | bib['year'], bib['cited_count'], f'{pdfs_path}/your_pdf_name.pdf', bib['url'] 173 | ) 174 | replace_dict[literature] = replaced_literature 175 | else: 176 | logger.info("Can not find the literature {}. You may need to manually download this paper, a template has been generated in the markdown file. Put the pdf file in the folder you specified just now and add its name in the '(pdf)' of your markdown entry.".format(literature_id)) 177 | replaced_literature = "- **{}**. 
([pdf]({})).".format( 178 | literature_id, f'{pdfs_path}/your_pdf_name.pdf' 179 | ) 180 | replace_dict[literature] = replaced_literature 181 | return replace_dict -------------------------------------------------------------------------------- /easy_literature/DBLP.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from urllib.request import ProxyHandler 3 | from . import dblp_source as dblp 4 | import pandas as pd 5 | 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger('DBLP') 9 | logger.setLevel(logging.DEBUG) 10 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 11 | 12 | class DBLPInfo(object): 13 | 14 | def set_proxy(self, proxy_address = None): 15 | """set proxy handler 16 | 17 | Aargs: 18 | proxy (str): proxy (str): The proxy adress. e.g 127.0.1:1123 19 | 20 | Returns: 21 | A proxy handler object. 22 | """ 23 | pass 24 | 25 | 26 | def extract_json_info(self, item): 27 | """Extract bib json information from requests.get().json() 28 | 29 | Args: 30 | item (json object): obtained by requests.get().json() 31 | 32 | Returns: 33 | A dict containing the paper information. 34 | """ 35 | trial_num = 0 36 | while trial_num<10: 37 | trial_num+=1 38 | try: 39 | results = dblp.search([item]) 40 | break 41 | except: 42 | if trial_num == 10: 43 | results = pd.DataFrame({'A' : []}) 44 | else: 45 | pass 46 | 47 | 48 | 49 | if not results.empty: 50 | if 'CoRR' in [str(venue) for venue in results['Where']]: 51 | journal = 'CoRR' 52 | for venue in results['Where']: 53 | if str(venue) != 'CoRR': 54 | journal = str(venue) 55 | break 56 | 57 | str(results['Where']) 58 | bib_dict = { 59 | "title": str(results['Title'][0]), 60 | "author": ' and '.join([str(Entry) for Entry in results['Authors'][0]]), 61 | "journal": journal, 62 | "year": str(results['Year'][0]), 63 | "url": str(results['Link'][0]), 64 | "pdf_link": None, 65 | "cited_count": None 66 | } 67 | else: 68 | bib_dict = None 69 | return bib_dict 70 | 71 | 72 | def get_info_by_title(self, title): 73 | """Get the meta information by the given paper title. 74 | 75 | Args: 76 | doi (str): The paper title 77 | 78 | Returns: 79 | A dict containing the paper information. 
80 | { 81 | "title": xxx, 82 | "author": xxx, 83 | "journal": xxx, 84 | etc 85 | } 86 | OR 87 | None 88 | OR 89 | A list [{}, {}, {}] 90 | """ 91 | return self.extract_json_info(title) 92 | 93 | 94 | if __name__ == "__main__": 95 | # arxivId = "2208.05623" 96 | # title = "Heterogeneous Graph Attention Network" 97 | 98 | # gscholar_info = GscholarInfo() 99 | # gscholar_info.set_proxy(proxy_name='single') 100 | 101 | # bib_arxiv = gscholar_info.get_info_by_title(title) 102 | # # bib_title = arxiv_info.get_info_by_title(title) 103 | 104 | # print(bib_arxiv) 105 | # print("\n") 106 | # # print(bib_title) 107 | results = dblp.search(["Finetunedlanguage models are zero-shot learners"]) 108 | 109 | print(results) -------------------------------------------------------------------------------- /easy_literature/GoogleScholar.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from scholarly import scholarly, ProxyGenerator 3 | 4 | 5 | logging.basicConfig() 6 | logger = logging.getLogger('GoogleScholar') 7 | logger.setLevel(logging.DEBUG) 8 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 9 | 10 | class GscholarInfo(object): 11 | 12 | def set_proxy(self, proxy_name = "free", proxy_address = None): 13 | """set proxy handler 14 | 15 | Aargs: 16 | proxy (str): proxy (str): The proxy adress. e.g 127.0.1:1123 17 | 18 | Returns: 19 | A proxy handler object. 20 | """ 21 | # TODO find a better proxy strategy 22 | if proxy_address: 23 | sucess = False 24 | pg = ProxyGenerator() 25 | if proxy_name == "free": 26 | sucess = pg.FreeProxies() 27 | elif proxy_name == "single": 28 | sucess = pg.SingleProxy(http = proxy_address, https = proxy_address) 29 | elif proxy_name == "Scraper": 30 | sucess = pg.ScraperAPI('a44bd5be9f56b1be9d6e40116ea4b440') 31 | logger.info(f'Scholarly using {proxy_name} proxy.') 32 | logger.info(f'Proxy setup sucess: {sucess}.') 33 | scholarly.use_proxy(pg) 34 | 35 | 36 | def extract_json_info(self, item): 37 | """Extract bib json information from requests.get().json() 38 | 39 | Args: 40 | item (json object): obtained by requests.get().json() 41 | 42 | Returns: 43 | A dict containing the paper information. 44 | """ 45 | bib_dict = None 46 | trial_num = 0 47 | 48 | while trial_num<9: 49 | try: 50 | trial_num+=1 51 | pubs_iter = scholarly.search_pubs(item) 52 | dictinfo = next(pubs_iter) 53 | # logger.info(dictinfo) 54 | bib_dict = { 55 | "title": dictinfo['bib']['title'].replace('\n', ''), 56 | "author": ' and '.join(dictinfo['bib']['author']), 57 | "journal": dictinfo['bib']['venue'], 58 | "year": dictinfo['bib']['pub_year'], 59 | "url": dictinfo['pub_url'], 60 | "pdf_link": dictinfo['eprint_url'], 61 | "cited_count": dictinfo['num_citations'] 62 | } 63 | break 64 | except: 65 | pass 66 | 67 | return bib_dict 68 | 69 | 70 | 71 | def get_info_by_title(self, title): 72 | """Get the meta information by the given paper title. 73 | 74 | Args: 75 | doi (str): The paper title 76 | 77 | Returns: 78 | A dict containing the paper information. 
79 | { 80 | "title": xxx, 81 | "author": xxx, 82 | "journal": xxx, 83 | etc 84 | } 85 | OR 86 | None 87 | OR 88 | A list [{}, {}, {}] 89 | """ 90 | return self.extract_json_info(title) 91 | 92 | 93 | if __name__ == "__main__": 94 | arxivId = "2208.05623" 95 | title = "Heterogeneous Graph Attention Network" 96 | 97 | gscholar_info = GscholarInfo() 98 | gscholar_info.set_proxy(proxy_name='free') 99 | 100 | bib_arxiv = gscholar_info.get_info_by_title(title) 101 | # bib_title = arxiv_info.get_info_by_title(title) 102 | 103 | print(bib_arxiv) 104 | print("\n") 105 | # print(bib_title) -------------------------------------------------------------------------------- /easy_literature/Scholarly.py: -------------------------------------------------------------------------------- 1 | import json 2 | from scholarly import scholarly 3 | from scholarly import ProxyGenerator 4 | 5 | # Set up a ProxyGenerator object to use free proxies 6 | # This needs to be done only once per session 7 | pg = ProxyGenerator() 8 | 9 | sucess = pg.FreeProxies() 10 | # print(f'Proxy setup sucess: {sucess}.') 11 | scholarly.use_proxy(pg) 12 | 13 | # will paginate to the next page by default 14 | pubs_iter = scholarly.search_pubs("1810.04805") 15 | 16 | 17 | print(json.dumps(next(pubs_iter), indent=2)) 18 | -------------------------------------------------------------------------------- /easy_literature/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JinjieNi/EasyLiterature/84ee3e8731430756c3b464d5906c8a1c4378e862/easy_literature/__init__.py -------------------------------------------------------------------------------- /easy_literature/arxiv.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from urllib.request import ProxyHandler 3 | import feedparser 4 | try: 5 | from urllib import quote 6 | except ImportError: 7 | from urllib.parse import quote 8 | from unidecode import unidecode 9 | 10 | from .crossref import crossrefInfo 11 | 12 | 13 | logging.basicConfig() 14 | logger = logging.getLogger('arxiv') 15 | logger.setLevel(logging.DEBUG) 16 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 17 | 18 | class arxivInfo(object): 19 | def __init__(self): 20 | self.base_url = "http://export.arxiv.org/api/query" 21 | 22 | def set_proxy_handler(self, proxy): 23 | """set proxy handler 24 | 25 | Aargs: 26 | proxy (str): proxy (str): The proxy adress. e.g 127.0.1:1123 27 | 28 | Returns: 29 | A proxy handler object. 30 | """ 31 | proxy_handler = ProxyHandler({"http": f"http://{proxy}", 32 | "https": f"https://{proxy}"}) 33 | return proxy_handler 34 | 35 | 36 | def extract_json_info(self, item): 37 | """Extract bib json information from requests.get().json() 38 | 39 | Args: 40 | item (json object): obtained by requests.get().json() 41 | 42 | Returns: 43 | A dict containing the paper information. 
44 | """ 45 | paper_url = item.link 46 | title = item.title 47 | journal = "arxiv" 48 | published = item.published.split("-") 49 | if len(published) > 1: 50 | year = published[0] 51 | else: 52 | year = ' ' 53 | 54 | authors = item.authors 55 | if len(authors) > 0: 56 | first_author = authors[0]["name"].split(" ") 57 | authors = " and ".join([author["name"] for author in authors]) 58 | else: 59 | first_author = authors 60 | authors = authors 61 | 62 | bib_dict = { 63 | "title": title, 64 | "author": authors, 65 | "journal": journal, 66 | "year": year, 67 | "url": paper_url, 68 | "pdf_link": item.link.replace("abs", "pdf")+".pdf", 69 | "cited_count": None 70 | } 71 | 72 | return bib_dict 73 | 74 | 75 | def get_info_by_arxivid(self, arxivId, handler=False): 76 | """Get the meta information by the given paper arxiv_id. 77 | 78 | Args: 79 | doi (str): The arxiv Id 80 | handler (handler object): use proxy 81 | 82 | Returns: 83 | A dict containing the paper information. 84 | { 85 | "title": xxx, 86 | "author": xxx, 87 | "journal": xxx, 88 | etc 89 | } 90 | OR 91 | None 92 | """ 93 | 94 | params = "?search_query=id:"+quote(unidecode(arxivId)) 95 | 96 | try: 97 | if handler: 98 | result = feedparser.parse(self.base_url + params, handlers=[handler]) 99 | else: 100 | result = feedparser.parse(self.base_url + params) 101 | items = result.entries 102 | 103 | item = items[0] 104 | if "arxiv_doi" in item: 105 | doi = item["arxiv_doi"] 106 | 107 | crossref_info = crossrefInfo() 108 | if handler: 109 | crossref_info.set_proxy(proxy=handler.proxies["http"].split('//')[-1]) 110 | return crossref_info.get_info_by_doi(doi) 111 | else: 112 | return self.extract_json_info(item) 113 | except: 114 | logger.error("DOI: {} is error.".format(arxivId)) 115 | 116 | 117 | def get_info_by_title(self, title, field='ti'): 118 | """Get the meta information by the given paper title. 119 | 120 | Args: 121 | doi (str): The paper title 122 | 123 | Returns: 124 | A dict containing the paper information. 125 | { 126 | "title": xxx, 127 | "author": xxx, 128 | "journal": xxx, 129 | etc 130 | } 131 | OR 132 | None 133 | OR 134 | A list [{}, {}, {}] 135 | """ 136 | params = "?search_query="+field+":"+quote(unidecode(title)) 137 | url = self.base_url + params 138 | try: 139 | result = feedparser.parse(url) 140 | items = result.entries 141 | print(len(items)) 142 | 143 | for i, item in enumerate(items): 144 | 145 | title_item = item.title 146 | try: 147 | title_item = title_item.decode("utf-8") 148 | except: 149 | pass 150 | 151 | item.title = title_item 152 | 153 | if title_item.lower() == title.lower(): 154 | return self.extract_json_info(item) 155 | 156 | items[i] = item 157 | 158 | return [self.extract_json_info(it) for it in items] 159 | except: 160 | logger.error("Title: {} is error.".format(title)) 161 | 162 | 163 | if __name__ == "__main__": 164 | arxivId = "2208.05623" 165 | title = "Heterogeneous Graph Attention Network" 166 | 167 | arxiv_info = arxivInfo() 168 | arxiv_info.set_proxy_handler(proxy="127.0.1:1123") 169 | 170 | bib_arxiv = arxiv_info.get_info_by_arxivid(arxivId) 171 | 172 | print(bib_arxiv) 173 | print("\n") -------------------------------------------------------------------------------- /easy_literature/crossref.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import requests 3 | # 4 | # 1. get info by doi 5 | # 2. 
get info by title 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger('crossref') 9 | logger.setLevel(logging.DEBUG) 10 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 11 | 12 | class crossrefInfo(object): 13 | def __init__(self): 14 | self.sess = requests.Session() 15 | self.sess.headers = HEADERS 16 | self.base_url = "http://api.crossref.org/" 17 | 18 | def set_proxy(self, proxy=None): 19 | """set proxy for session 20 | 21 | Args: 22 | proxy (str): The proxy adress. e.g 127.0.1:1123 23 | Returns: 24 | None 25 | """ 26 | if proxy: 27 | self.sess.proxies = { 28 | "http": proxy, 29 | "https": proxy, } 30 | 31 | 32 | def extract_json_info(self, bib): 33 | """Extract bib json information from requests.get().json() 34 | 35 | Args: 36 | bib (json object): obtained by requests.get().json() 37 | 38 | Returns: 39 | A dict containing the paper information. 40 | """ 41 | pub_date = [str(i) for i in bib['published']["date-parts"][0]] 42 | pub_date = '-'.join(pub_date) 43 | 44 | if 'author' in bib.keys(): 45 | authors = ' and '.join([i["family"]+" "+i['given'] for i in bib['author'] if "family" and "given" in i.keys()]) 46 | else: 47 | authors = "No author" 48 | 49 | if 'short-container-title' in bib.keys(): 50 | try: 51 | journal = bib['short-container-title'][0] 52 | except: 53 | journal = "No journal" 54 | else: 55 | try: 56 | journal = bib['container-title'][0] 57 | except: 58 | journal = "No journal" 59 | 60 | bib_dict = { 61 | "title": bib['title'][0], 62 | "author": authors, 63 | "journal": journal, 64 | "year": pub_date, 65 | "url": bib["URL"], 66 | "pdf_link": bib["link"][0]["URL"], 67 | "cited_count": bib["is-referenced-by-count"] 68 | } 69 | 70 | return bib_dict 71 | 72 | 73 | def get_info_by_doi(self, doi): 74 | """Get the meta information by the given paper DOI number. 75 | 76 | Args: 77 | doi (str): The paper DOI number 78 | 79 | Returns: 80 | A dict containing the paper information. 81 | { 82 | "title": xxx, 83 | "author": xxx, 84 | "journal": xxx, 85 | etc 86 | } 87 | OR 88 | None 89 | """ 90 | url = "{}works/{}" 91 | url = url.format(self.base_url, doi) 92 | 93 | try: 94 | r = self.sess.get(url) 95 | 96 | bib = r.json()['message'] 97 | return self.extract_json_info(bib) 98 | 99 | except: 100 | logger.error("DOI: {} is error.".format(doi)) 101 | 102 | 103 | def get_info_by_title(self, title): 104 | """Get the meta information by the given paper title. 105 | 106 | Args: 107 | doi (str): The paper title 108 | 109 | Returns: 110 | A dict containing the paper information. 
111 | { 112 | "title": xxx, 113 | "author": xxx, 114 | "journal": xxx, 115 | etc 116 | } 117 | OR 118 | None 119 | OR 120 | A list [{}, {}, {}] 121 | """ 122 | url = self.base_url + "works" 123 | params = {"query.bibliographic": title, "rows": 20} 124 | try: 125 | r = self.sess.get(url, params=params) 126 | items = r.json()["message"]["items"] 127 | 128 | for i, item in enumerate(items): 129 | 130 | title_item = item['title'][0] 131 | try: 132 | title_item = title_item.decode("utf-8") 133 | except: 134 | pass 135 | 136 | item["title"][0] = title_item 137 | 138 | if title_item.lower() == title.lower(): 139 | return self.extract_json_info(item) 140 | 141 | items[i] = item 142 | 143 | return [self.extract_json_info(it) for it in items] 144 | except: 145 | logger.error("Title: {} is error.".format(title)) 146 | 147 | 148 | if __name__ == "__main__": 149 | # doi = "10.1016/j.wneu.2012.11.074" 150 | # doi = "10.1093/cercor/bhac266" 151 | doi = "10.1038/s41467-022-29269-6" 152 | # title = "Heterogeneous Graph Attention Network" 153 | # title = "Learning to Copy Coherent Knowledge for Response Generation" 154 | 155 | crossref_info = crossrefInfo() 156 | crossref_info.set_proxy(proxy="127.0.1:1123") 157 | 158 | bib_doi = crossref_info.get_info_by_doi(doi) 159 | # bib_title = crossref_info.get_info_by_title(title) 160 | 161 | print(bib_doi) 162 | print("\n") 163 | # print(bib_title) 164 | -------------------------------------------------------------------------------- /easy_literature/dblp_source.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import pandas as pd 3 | import requests 4 | 5 | #options 6 | STRINGS_FOR_TEST = ["Collaborative Writing"] 7 | DBLP_BASE_URL = 'http://dblp.uni-trier.de/' 8 | PUB_SEARCH_URL = DBLP_BASE_URL + "search/publ/" 9 | 10 | 11 | def query_db(pub_string=STRINGS_FOR_TEST): 12 | ''' 13 | returns the BeautifulSoup object of a query to DBLP 14 | 15 | :param pub_string: A list of strings of keywords 16 | :return: BeautifulSoup: A BeautifulSoup Object 17 | ''' 18 | resp = requests.get(PUB_SEARCH_URL, params={'q':pub_string}) 19 | return BeautifulSoup(resp.content) 20 | 21 | def get_pub_data(pub): 22 | ''' 23 | Extracts the information about a publication from a BeautifulSoup object 24 | 25 | :param pub: A BeautifulSoup Object with Publication Information 26 | :return: dict: All Information of this Publication 27 | ''' 28 | ptype = 'nothing' 29 | link = 'nothing' 30 | authors = [] 31 | title = 'nothing' 32 | where = 'nothing' 33 | 34 | if 'year' in pub.get('class'): 35 | # year is not always scrapable, except for this case. 
Might be done more elegantly 36 | return int(pub.contents[0]) 37 | else: 38 | ptype = pub.attrs.get('class')[1] 39 | for content_item in pub.contents: 40 | class_of_content_item = content_item.attrs.get('class', [0]) 41 | if 'data' in class_of_content_item: 42 | for author in content_item.findAll('span', attrs={"itemprop": "author"}): 43 | authors.append(author.text) 44 | title = content_item.find('span', attrs={"class": "title"}).text 45 | for where_data in content_item.findAll('span', attrs={"itemprop": "isPartOf"}): 46 | found_where = where_data.find('span', attrs={"itemprop": "name"}) 47 | if found_where: 48 | where = found_where.text 49 | elif 'publ' in class_of_content_item: 50 | link = content_item.contents[0].find('a').attrs.get('href', "nothing") 51 | 52 | return {'Type': ptype, 53 | 'Link': link, 54 | 'Authors': authors, 55 | 'Title': title, 56 | 'Where': where} 57 | 58 | def search(search_string=STRINGS_FOR_TEST): 59 | ''' 60 | returns the information found in a search query to dblp as a pandas dataframe. 61 | Shows the following information: 62 | - Authors 63 | - Link to Publication 64 | - Title 65 | - Type (Article, Proceedings etc.) 66 | - Where it was published 67 | - Year of publication 68 | :param search_string: A List of Strings of Keywords, that should be searched for 69 | :return: pd.DataFrame: A Dataframe with all data 70 | ''' 71 | soup = query_db(search_string) 72 | pub_list_raw = soup.find("ul", attrs={"class": "publ-list"}) 73 | 74 | pub_list_data = [] 75 | curr_year = 0 76 | for child in pub_list_raw.children: 77 | pub_data = get_pub_data(child) 78 | if type(pub_data) == int: 79 | curr_year = pub_data 80 | else: 81 | pub_data['Year'] = curr_year 82 | pub_list_data.append(pub_data) 83 | 84 | return pd.DataFrame(pub_list_data) 85 | -------------------------------------------------------------------------------- /easy_literature/downloads.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import os 4 | import platform 5 | 6 | from .arxiv import arxivInfo 7 | from .crossref import crossrefInfo 8 | from .medbiorxiv import BMxivInfo 9 | from .GoogleScholar import GscholarInfo 10 | from .DBLP import DBLPInfo 11 | from .pdfs import pdfDownload 12 | 13 | # log config 14 | logging.basicConfig() 15 | logger = logging.getLogger('Downloads') 16 | logger.setLevel(logging.INFO) 17 | 18 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 19 | 20 | 21 | 22 | def check_string(re_exp, str): 23 | res = re.match(re_exp, str) 24 | if res: 25 | return True 26 | else: 27 | return False 28 | 29 | def classify(identifier): 30 | """ 31 | Classify the type of paper_id: 32 | arxivId - arxivId 33 | doi - digital object identifier 34 | medbiorxivId - medrxiv or biorxiv id 35 | title - title 36 | """ 37 | if check_string(r'10\.(?!1101)[0-9]{4}/\.*', identifier): 38 | return 'doi' 39 | elif check_string(r'10\.1101/\.*', identifier): 40 | return "medbiorxivId" 41 | elif check_string(r'[0-9]{2}[0-1][0-9]\.[0-9]{3,}.*', identifier) or check_string(r'.*/[0-9]{2}[0-1][0-9]{4}', identifier): 42 | return 'arxivId' 43 | elif check_string(r'[a-zA-Z\d\.-/\s]*', identifier): 44 | return 'title' 45 | else: 46 | return "unrecognized" 47 | 48 | def get_paper_info_from_paperid(paper_id, proxy=None, gproxy_mode='free'): 49 | id_type = classify(paper_id) 50 | 51 | if id_type == "doi": 52 | logger.info('ID type: doi.') 53 | downloader = crossrefInfo() 54 | if proxy: 55 | 
downloader.set_proxy(proxy=proxy) 56 | bib_dict = downloader.get_info_by_doi(paper_id) 57 | 58 | elif id_type == "arxivId": 59 | logger.info('ID type: arixiv.') 60 | downloader = arxivInfo() 61 | if proxy: 62 | downloader.set_proxy_handler(proxy=proxy) 63 | bib_dict = downloader.get_info_by_arxivid(paper_id) 64 | 65 | elif id_type == "medbiorxivId": 66 | logger.info('ID type: medbiorxivId.') 67 | downloader = BMxivInfo() 68 | if proxy: 69 | downloader.set_proxy(proxy=proxy) 70 | bib_dict = downloader.get_info_by_bmrxivid(paper_id) 71 | 72 | elif id_type == "title": 73 | logger.info('ID type: title.') 74 | downloader1 = GscholarInfo() 75 | downloader1.set_proxy(proxy_name=gproxy_mode, proxy_address=proxy) 76 | bib_dict = downloader1.get_info_by_title(paper_id) 77 | 78 | downloader2 = DBLPInfo() 79 | downloader2.set_proxy(proxy_address=proxy) 80 | bib_dict1 = downloader2.get_info_by_title(paper_id) 81 | 82 | logger.info(f'The Google scholar bib: {bib_dict}; The DLBP bib: {bib_dict1}.') 83 | 84 | if bib_dict is not None and bib_dict1 is not None: 85 | bib_dict['journal'] = bib_dict1['journal'] 86 | elif bib_dict is None and bib_dict1 is not None: 87 | bib_dict = bib_dict1 88 | elif bib_dict is None and bib_dict1 is None: 89 | logger.info('Title not found on DLBP and Google scholar.') 90 | else: 91 | pass 92 | 93 | try: 94 | return bib_dict 95 | except: 96 | pass 97 | 98 | 99 | def get_paper_pdf_from_paperid(paper_id, path, proxy=None, direct_url=None): 100 | pdf_downloader = pdfDownload() 101 | if proxy: 102 | pdf_downloader.set_proxy(proxy=proxy) 103 | 104 | if direct_url: 105 | content = pdf_downloader.get_pdf_from_direct_url(direct_url) 106 | if not content: 107 | content = pdf_downloader.get_pdf_from_sci_hub(paper_id) 108 | else: 109 | content = pdf_downloader.get_pdf_from_sci_hub(paper_id) 110 | try: 111 | system = platform.system() 112 | if system == 'Windows': 113 | path = path.replace("/", "\\") 114 | pdf_dir = path.rsplit("\\", 1)[0] 115 | else: 116 | pdf_dir = path.rsplit("/", 1)[0] 117 | if not os.path.exists(pdf_dir): 118 | os.makedirs(pdf_dir) 119 | pdf_downloader._save(content['pdf'], path) 120 | except: 121 | pass 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /easy_literature/easyliter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import argparse 3 | import os 4 | 5 | from .utils import patternRecognizer, note_modified, get_pdf_paths, get_pdf_paths_from_notes, get_update_content, get_pdf_paths_from_notes_dict 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger('easyliter') 9 | logger.setLevel(logging.INFO) 10 | 11 | 12 | 13 | 14 | def set_args(): 15 | parser = argparse.ArgumentParser(description='EasyLiterature') 16 | parser.add_argument('-i', '--input', required=True, type=str, default=None, 17 | help="The path to the note file or note file folder.") 18 | parser.add_argument('-o', '--output', type=str, default=None, 19 | help='Folder path to save paper pdfs and images. NOTE: MUST BE FOLDER.') 20 | parser.add_argument('-p', '--proxy', type=str, default=None, 21 | help='The proxy address. e.g. 127.0.0.1:1080. If this argument is specified, the google scholar will automatically use a free proxy (not necessarily using the specified proxy address). To use other proxies for google scholar, specify the -gp option. If you want to set up the proxies mannually, change the behaviour in GoogleScholar.set_proxy(). 
See more at https://scholarly.readthedocs.io/en/stable/ProxyGenerator.html.') 22 | parser.add_argument('-gp', '--gproxy_mode', type=str, default='free', 23 | help='The proxy type used for scholarly. e.g., free, single, Scraper. (Note: 1. will automatically choose a free proxy address to use, which is free, but may not be fast. 2. will use the proxy address you specify. 3. is not free to use and need to buy the api key.).') 24 | parser.add_argument('-d', '--delete', action='store_true', 25 | help='Delete unreferenced attachments in notes. Use with caution, ' 26 | 'when used, -i must be a folder path including all notes.') 27 | parser.add_argument('-m', '--migration', type=str, default=None, 28 | help="The pdf folder path you want to reconnect to.") 29 | args = parser.parse_args() 30 | 31 | return args 32 | 33 | def check_args(): 34 | args = set_args() 35 | input_path = args.input 36 | output_path = args.output 37 | delete_bool = args.delete 38 | migration_path = args.migration 39 | proxy = args.proxy 40 | gproxy_mode = args.gproxy_mode 41 | 42 | return input_path, output_path, delete_bool, proxy, migration_path, gproxy_mode 43 | 44 | 45 | def get_bib_and_pdf(note_file, output_path, proxy, paper_recognizer, gproxy_mode): 46 | 47 | pdfs_path = output_path 48 | if not os.path.exists(pdfs_path): 49 | os.makedirs(pdfs_path) 50 | 51 | with open(note_file, 'r') as f: 52 | content = f.read() 53 | 54 | m = paper_recognizer.findall(content) 55 | logger.info("Number of files to download - {}".format(len(m))) 56 | 57 | if not m: 58 | logger.info("The file {} is not found, or there is no valid entry in the file.".format(note_file)) 59 | else: 60 | replace_dict = get_update_content(m, note_file, pdfs_path, proxy=proxy, gproxy_mode=gproxy_mode) 61 | 62 | return replace_dict 63 | 64 | 65 | def file_update(input_path, output_path, proxy, paper_recognizer, gproxy_mode): 66 | 67 | replace_dict = get_bib_and_pdf(input_path, output_path, 68 | proxy, paper_recognizer, gproxy_mode) 69 | 70 | if replace_dict: 71 | note_modified(paper_recognizer, input_path, **replace_dict) 72 | 73 | 74 | def main(): 75 | input_path, output_path, delete_bool, proxy, migration_path, gproxy_mode = check_args() 76 | 77 | if output_path: 78 | paper_recognizer = patternRecognizer(r'- \{.{3,}\}') 79 | 80 | if os.path.isfile(input_path): 81 | logger.info("Updating the file {}".format(input_path)) 82 | file_update(input_path, output_path, proxy, paper_recognizer, gproxy_mode) 83 | 84 | elif os.path.isdir(input_path): 85 | note_paths = [] 86 | for root, _, files in os.walk(input_path): 87 | for file in files: 88 | if file.lower().endswith('md') or file.lower().endswith('markdown'): 89 | note_paths.append(os.path.join(root, file)) 90 | for note_path in note_paths: 91 | logger.info("Updating the file {}".format(note_path)) 92 | file_update(note_path, output_path, proxy, paper_recognizer, gproxy_mode) 93 | else: 94 | logger.info("input path {} does not exist".format(input_path)) 95 | 96 | 97 | # Delete unreferenced attachments 98 | if delete_bool: 99 | if os.path.isfile(input_path): 100 | logger.info("To delete the PDF entities unrelated to the notes, the input path must be the main notes folder!!! 
Please use this parameter with caution!!!") 101 | else: 102 | pdf_path_recognizer = patternRecognizer(r'\[pdf\]\(.{5,}\.pdf\)') 103 | pdf_paths_in_notes = get_pdf_paths_from_notes(input_path, pdf_path_recognizer) 104 | pdf_paths = get_pdf_paths(output_path) 105 | # TODO the path between mac and win could be different,“/” 和 “\\” 106 | pdf_paths_in_notes = [os.path.abspath(i).replace('\\', '/') for i in pdf_paths_in_notes] 107 | pdf_paths = [os.path.abspath(i).replace('\\', '/') for i in pdf_paths] 108 | 109 | removed_pdf_paths = list(set(pdf_paths) - set(pdf_paths_in_notes)) 110 | try: 111 | for pdf_p in removed_pdf_paths: 112 | os.remove(pdf_p) 113 | except: 114 | pass 115 | 116 | logger.info("Deleted {} files".format(len(removed_pdf_paths))) 117 | 118 | 119 | if migration_path: 120 | pdf_path_recognizer = patternRecognizer(r'\[pdf\]\(.{5,}\.pdf\)') 121 | 122 | pdf_paths = get_pdf_paths(migration_path) 123 | pdf_paths_in_notes = get_pdf_paths_from_notes_dict(input_path, pdf_path_recognizer) 124 | 125 | # match based on paper title 126 | matched_numb = 0 127 | pdf_paths_dict = {os.path.basename(i): i for i in pdf_paths} 128 | for md_file, pdf_paths_ in pdf_paths_in_notes.items(): 129 | 130 | pdf_paths_in_notes_dict = {os.path.basename(i): i for i in pdf_paths_} 131 | matched_pdfs = pdf_paths_dict.keys() & pdf_paths_in_notes_dict.keys() 132 | 133 | matched_numb += len(matched_pdfs) 134 | 135 | replace_paths_dict = {} 136 | for matched in matched_pdfs: 137 | replaced_str = os.path.relpath(pdf_paths_dict[matched], md_file).split('/',1)[-1] 138 | replaced_str = "[pdf]({})".format(replaced_str) 139 | ori_str = "[pdf]({})".format(pdf_paths_in_notes_dict[matched]) 140 | replace_paths_dict[ori_str] = replaced_str 141 | 142 | if replace_paths_dict: 143 | note_modified(pdf_path_recognizer, md_file, **replace_paths_dict) 144 | 145 | logger.info("Found - {} - pdf files".format(matched_numb)) 146 | 147 | 148 | if not output_path and not migration_path: 149 | logger.info("lacking the arguments -o or -m, use -h to see the help") 150 | 151 | 152 | if __name__ == "__main__": 153 | main() -------------------------------------------------------------------------------- /easy_literature/medbiorxiv.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import requests 3 | from bs4 import BeautifulSoup 4 | 5 | from .crossref import crossrefInfo 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger('biorxiv') 9 | logger.setLevel(logging.DEBUG) 10 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 11 | 12 | class BMxivInfo(object): 13 | def __init__(self): 14 | self.sess = requests.Session() 15 | self.sess.headers = HEADERS 16 | self.base_url = "https://api.biorxiv.org/details/" 17 | self.servers = ["biorxiv", "medrxiv"] 18 | 19 | 20 | def set_proxy(self, proxy=False): 21 | """set proxy for session 22 | 23 | Args: 24 | proxy (str): The proxy adress. e.g 127.0.1:1123 25 | Returns: 26 | None 27 | """ 28 | if proxy: 29 | self.sess.proxies = { 30 | "http": proxy, 31 | "https": proxy, } 32 | 33 | 34 | def extract_json_info(self, item): 35 | """Extract bib json information from requests.get().json() 36 | 37 | Args: 38 | item (json object): obtained by requests.get().json() 39 | 40 | Returns: 41 | A dict containing the paper information. 
42 | """ 43 | paper_url = f"https://www.biorxiv.org/content/{item['doi']}" 44 | title = item["title"] 45 | journal = item["server"] 46 | published = item["date"].split('-') 47 | if len(published) > 1: 48 | year = published[0] 49 | else: 50 | year = ' ' 51 | 52 | authors = item['authors'].split("; ") 53 | if len(authors) > 0: 54 | authors = " and ".join([author for author in authors]) 55 | else: 56 | authors = authors 57 | 58 | bib_dict = { 59 | "title": title, 60 | "author": authors, 61 | "journal": journal, 62 | "year": year, 63 | "url": paper_url, 64 | "pdf_link": f"{paper_url}.full.pdf", 65 | "cited_count": None 66 | } 67 | 68 | return bib_dict 69 | 70 | 71 | def get_info_by_bmrxivid(self, bmrxivid): 72 | """Get the meta information by the given paper biorxiv_id or medrxiv_id. 73 | 74 | Args: 75 | doi (str): The biorxiv or medrxiv Id 76 | 77 | Returns: 78 | A dict containing the paper information. 79 | { 80 | "title": xxx, 81 | "author": xxx, 82 | "journal": xxx, 83 | etc 84 | } 85 | OR 86 | None 87 | """ 88 | urls = [self.base_url + server + "/" + bmrxivid for server in self.servers] 89 | for url in urls: 90 | try: 91 | r = self.sess.get(url) 92 | 93 | bib = r.json()['collection'][-1] 94 | 95 | if "published" in bib.keys() and bib['published'] != "NA": 96 | doi = bib["published"] 97 | print(doi) 98 | crossref_info = crossrefInfo() 99 | if len(self.sess.proxies) > 0: 100 | crossref_info.set_proxy(self.sess.proxies['http'].split('//')[-1]) 101 | return crossref_info.get_info_by_doi(doi) 102 | 103 | return self.extract_json_info(bib) 104 | 105 | except: 106 | logger.error("DOI: {} is error.".format(bmrxivid)) 107 | 108 | 109 | def get_info_by_title(self, title): 110 | """Get the meta information by the given paper title. 111 | 112 | Args: 113 | doi (str): The paper title 114 | 115 | Returns: 116 | A dict containing the paper information. 
117 | { 118 | "title": xxx, 119 | "author": xxx, 120 | "journal": xxx, 121 | etc 122 | } 123 | OR 124 | None 125 | OR 126 | A list [{}, {}, {}] 127 | """ 128 | base_url = "https://www.biorxiv.org/search/{}%20jcode%3Amedrxiv%7C%7Cbiorxiv%20numresults%3A25%20\sort%3Arelevance-rank%20\format_result%3Astandard" 129 | query = title.replace(' ', '%252B') 130 | 131 | url = base_url.format(query) 132 | try: 133 | result = self.sess.get(url) 134 | soup = BeautifulSoup(result.content, "lxml") 135 | soup_items = soup.find_all("div",class_="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix") 136 | 137 | soup_dict = dict() 138 | for sp in soup_items: 139 | key = sp.find("a", class_="highwire-cite-linked-title").span.text 140 | value = sp.find("span", class_="highwire-cite-metadata-doi highwire-cite-metadata").text.split("org/")[-1].split("v")[0].replace(" ", "") 141 | soup_dict[key] = value 142 | 143 | for item_title, item_doi in soup_dict.items(): 144 | try: 145 | item_title = item_title.decode("utf-8") 146 | except: 147 | pass 148 | 149 | if item_title.lower() == title.lower(): 150 | return self.get_info_by_bmrxivid(item_doi) 151 | 152 | return [self.get_info_by_bmrxivid(it) for it in soup_dict.values()] 153 | except: 154 | logger.error("Title: {} is error.".format(title)) 155 | 156 | 157 | if __name__ == "__main__": 158 | 159 | arxivId = "10.1101/2022.07.28.22277637" 160 | # title = "Oxygen restriction induces a viable but non-culturable population in bacteria" 161 | # title = "A molecular atlas of the human postmenopausal fallopian tube and ovary from single-cell RNA and ATAC sequencing" 162 | # title = "Radiographic Assessment of Lung Edema (RALE) Scores are Highly Reproducible and Prognostic of Clinical Outcomes for Inpatients with COVID-19" 163 | # title = "Untargeted metabolomics of COVID-19 patient serum reveals potential prognostic markers of both severity and outcome" 164 | 165 | arxiv_info = BMxivInfo() 166 | arxiv_info.set_proxy(proxy="127.0.1:1123") 167 | 168 | bib_arxiv = arxiv_info.get_info_by_bmrxivid(arxivId) 169 | # bib_title = arxiv_info.get_info_by_title(title) 170 | 171 | print(bib_arxiv) 172 | print("\n") 173 | # print(bib_title) -------------------------------------------------------------------------------- /easy_literature/pdfs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import requests 3 | from urllib.parse import urlunsplit, urlsplit 4 | from bs4 import BeautifulSoup 5 | 6 | logging.basicConfig() 7 | logger = logging.getLogger('PDFs') 8 | logger.setLevel(logging.DEBUG) 9 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'} 10 | 11 | 12 | class pdfDownload(object): 13 | def __init__(self): 14 | self.sess = requests.Session() 15 | self.sess.headers = HEADERS 16 | 17 | def set_proxy(self, proxy=None): 18 | """set proxy for session 19 | 20 | Args: 21 | proxy (str): The proxy adress. e.g 127.0.1:1123 22 | Returns: 23 | None 24 | """ 25 | if proxy: 26 | self.sess.proxies = { 27 | "http": proxy, 28 | "https": proxy, } 29 | 30 | 31 | def _get_available_scihub_urls(self): 32 | ''' 33 | Finds available scihub urls via https://lovescihub.wordpress.com/ or 34 | https://sci-hub.now.sh/ 35 | ''' 36 | urls = [] 37 | res = self.sess.get('https://lovescihub.wordpress.com/') 38 | s = BeautifulSoup(res.content, 'html.parser') 39 | for a in s.find('div', class_="entry-content").find_all('a', href=True): 40 | if 'sci-hub.' 
in a['href']: 41 | urls.append(a['href']) 42 | return urls 43 | 44 | 45 | def fetch(self, url, auth=None): 46 | '''Fetch pdf 47 | 48 | Args: 49 | url (str): 50 | 51 | Returns: 52 | A dict OR None 53 | ''' 54 | try: 55 | r = self.sess.get(url, auth=auth) 56 | 57 | if r.headers["Content-Type"] != "application/pdf": 58 | logger.info("Failed to fetch pdf with url: {}".format(url)) 59 | else: 60 | return { 61 | 'pdf': r.content, 62 | 'url': url 63 | } 64 | except: 65 | logger.error("Failed to open url: {}".format(url)) 66 | 67 | 68 | def get_pdf_from_direct_url(self, url, auth=None): 69 | return self.fetch(url, auth=auth) 70 | 71 | 72 | def get_pdf_from_sci_hub(self, identifier, auth=None): 73 | '''Fetch pdf from sci-hub based on doi or url 74 | 75 | Args: 76 | identifier (str): DOI or url 77 | auth (tuple): ("user", "passwd") 78 | 79 | Returns: 80 | A dict OR None 81 | ''' 82 | for base_url in self._get_available_scihub_urls(): 83 | r = self.sess.get(base_url + '/' + identifier, auth=auth) 84 | soup = BeautifulSoup(r.content, 'html.parser') 85 | 86 | pdf_div_names = ['iframe', 'embed'] 87 | for pdf_div_name in pdf_div_names: 88 | pdf_div = soup.find(pdf_div_name) 89 | if pdf_div != None: 90 | break 91 | try: 92 | url_parts = urlsplit(pdf_div.get('src')) 93 | if url_parts[1]: 94 | if url_parts[0]: 95 | pdf_url = urlunsplit((url_parts[0], url_parts[1], url_parts[2], '', '')) 96 | else: 97 | pdf_url = urlunsplit(('https', url_parts[1], url_parts[2], '', '')) 98 | else: 99 | pdf_url = urlunsplit(('https', urlsplit(base_url)[1], url_parts[2], '', '')) 100 | 101 | return self.fetch(pdf_url, auth) 102 | except: 103 | pass 104 | 105 | logger.info("Failed to fetch pdf with all sci-hub urls") 106 | 107 | def _save(self, content, path): 108 | with open(path, "wb") as f: 109 | f.write(content) 110 | 111 | 112 | if __name__ == "__main__": 113 | doi = "10.1145/3308558.3313562" 114 | 115 | pdf_download = pdfDownload() 116 | pdf_download.set_proxy("127.0.1:1123") 117 | 118 | pdf_dict = pdf_download.get_pdf_from_sci_hub(doi) 119 | if pdf_dict: 120 | print(pdf_dict['url']) 121 | pdf_download.download(pdf_dict['pdf'] ,"/home/admin/tmp.pdf") 122 | 123 | # pdf_dict2 = pdf_download.get_pdf_from_direct_url("https://arxiv.org/pdf/2208.05419.pdf") 124 | # if pdf_dict2: 125 | # print(pdf_dict2['url']) 126 | # pdf_download.download(pdf_dict2['pdf'] ,"/home/admin/tmp2.pdf") 127 | 128 | -------------------------------------------------------------------------------- /easy_literature/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import re 4 | from tqdm import tqdm 5 | from .downloads import get_paper_info_from_paperid, get_paper_pdf_from_paperid, classify 6 | 7 | 8 | logging.basicConfig() 9 | logger = logging.getLogger('utils') 10 | logger.setLevel(logging.INFO) 11 | 12 | 13 | class patternRecognizer(object): 14 | def __init__(self, regular_rule): 15 | self.pattern = re.compile(regular_rule) 16 | 17 | def match(self, string): 18 | return self.pattern.match(string) 19 | 20 | def findall(self, string): 21 | return self.pattern.findall(string) 22 | 23 | def multiple_replace(self, content, **replace_dict): 24 | def replace_(value): 25 | match = value.group() 26 | if match in replace_dict.keys(): 27 | return replace_dict[match] 28 | else: 29 | return match+" **Not Correct, Check it. 
Maybe mannual update & download is needed.**" 30 | 31 | replace_content = self.pattern.sub(replace_, content) 32 | 33 | return replace_content 34 | 35 | 36 | def note_modified(pattern_recog, md_file, **replace_dict): 37 | with open(md_file, 'r') as f: 38 | content = f.read() 39 | 40 | replaced_content = pattern_recog.multiple_replace(content, **replace_dict) 41 | 42 | with open(md_file, 'w') as f: 43 | f.write(''.join(replaced_content)) 44 | 45 | 46 | def get_pdf_paths(pdf_root): 47 | pdf_paths = [] 48 | for root, _, files in os.walk(pdf_root): 49 | for file in files: 50 | if file.lower().endswith('.pdf'): 51 | pdf_paths.append(os.path.join(root, file)) 52 | 53 | return pdf_paths 54 | 55 | 56 | def get_pdf_paths_from_notes(md_root, reg): 57 | 58 | md_files = [] 59 | for root, _, files in os.walk(md_root): 60 | for file in files: 61 | if file.lower().endswith('md') or file.lower().endswith('markdown'): 62 | md_files.append(os.path.join(root, file)) 63 | 64 | pdf_paths_from_notes = [] 65 | for md_file in md_files: 66 | with open(md_file, 'r') as f: 67 | content = f.read() 68 | m = reg.findall(content) 69 | m = [i.split("(")[-1].split(')')[0] for i in m] 70 | pdf_paths_from_notes.extend(m) 71 | 72 | return pdf_paths_from_notes 73 | 74 | 75 | def get_pdf_paths_from_notes_dict(md_root, reg): 76 | pdf_paths_from_notes_dict = {} 77 | if os.path.isdir(md_root): 78 | md_files = [] 79 | for root, _, files in os.walk(md_root): 80 | for file in files: 81 | if file.lower().endswith('md') or file.lower().endswith('markdown'): 82 | md_files.append(os.path.join(root, file)) 83 | 84 | for md_file in md_files: 85 | with open(md_file, 'r') as f: 86 | content = f.read() 87 | m = reg.findall(content) 88 | m = [i.split("(")[-1].split(')')[0] for i in m] 89 | pdf_paths_from_notes_dict[md_file] = m 90 | else: 91 | with open(md_root, 'r') as f: 92 | content = f.read() 93 | m = reg.findall(content) 94 | m = [i.split("(")[-1].split(')')[0] for i in m] 95 | pdf_paths_from_notes_dict[md_root] = m 96 | 97 | return pdf_paths_from_notes_dict 98 | 99 | 100 | def classify_identifier(identifier): 101 | """Not need to download PDF file 102 | """ 103 | if identifier.endswith("}}"): 104 | return True 105 | else: 106 | return False 107 | 108 | 109 | def get_update_content(m, note_file, pdfs_path, proxy, gproxy_mode): 110 | 111 | replace_dict = dict() 112 | for literature in tqdm(m): 113 | pdf = classify_identifier(literature) 114 | 115 | literature_id = literature.split('{')[-1].split('}')[0] 116 | bib = get_paper_info_from_paperid(literature_id, proxy=proxy, gproxy_mode=gproxy_mode) 117 | 118 | if bib: 119 | try: 120 | pdf_name = bib['title'] 121 | # remove blank symbol, like \n, \t, \r 122 | pdf_name = re.sub(r'[\n\t\r]', '', pdf_name) 123 | # remove multiple blank spaces 124 | pdf_name = re.sub(r' +', ' ', pdf_name) 125 | pdf_name = re.sub(r'[.]', '', pdf_name) 126 | 127 | pdf_name = '_'.join(pdf_name.split(' ')) + '.pdf' 128 | 129 | # remove the special characters in the pdf name: / \ : * ? 
" < > | 130 | pdf_name = re.sub(r'[\\/:*?"<>|]', '', pdf_name) 131 | pdf_path = os.path.join(pdfs_path, pdf_name) 132 | 133 | logger.info(f"The pdf path to be saved: {pdf_path}") 134 | if pdf: 135 | id_type = classify(literature_id) 136 | if id_type == "title": 137 | for pattern_str in [r'10\.(?!1101)[0-9]{4}/', r'10\.1101/', r'[0-9]{2}[0-1][0-9]\.[0-9]{3,}', r'.*/[0-9]{2}[0-1][0-9]{4}']: 138 | res = re.search(pattern_str, bib['url']) # search for the arxiv id in the url 139 | if res: 140 | literature_id = res.group(0) 141 | if bib['pdf_link'] is None: 142 | bib['pdf_link'] = f'https://arxiv.org/pdf/{literature_id}.pdf' 143 | logger.info(f"The paper's arxiv url: {bib['url']}; The converted arxiv id: {literature_id}; The pdf link: {bib['pdf_link']}.") 144 | if not os.path.exists(pdf_path): 145 | logger.info(f"PDF link: {bib['pdf_link']}") 146 | get_paper_pdf_from_paperid(literature_id, pdf_path, direct_url=bib['pdf_link'], proxy=proxy) 147 | if not os.path.exists(pdf_path): 148 | get_paper_pdf_from_paperid(literature_id, pdf_path, proxy=proxy) 149 | else: 150 | if not os.path.exists(pdf_path): 151 | logger.info(f"PDF link: {bib['pdf_link']}") 152 | get_paper_pdf_from_paperid(literature_id, pdf_path, direct_url=bib['pdf_link'], proxy=proxy) 153 | if not os.path.exists(pdf_path): 154 | get_paper_pdf_from_paperid(literature_id, pdf_path, proxy=proxy) 155 | if os.path.exists(pdf_path): 156 | replaced_literature = "- **{}**. {} et.al. **{}**, **{}**, **Number of Citations: **{}, ([pdf]({}))([link]({})).".format( 157 | bib['title'], bib["author"].split(" and ")[0], bib['journal'], 158 | bib['year'], bib['cited_count'], os.path.relpath(pdf_path, note_file).split('/',1)[-1], 159 | bib['url']) 160 | else: 161 | logger.info("Can not find a downloading source for literature id {}. You may need to manually download this paper, a template has been generated in the markdown file. Put the pdf file in the folder you specified just now and add its name in the '(pdf)' of your markdown entry.".format(literature_id)) 162 | replaced_literature = "- **{}**. {} et.al. **{}**, **{}**, **Number of Citations: **{}, ([pdf]({}))([link]({})).".format( 163 | bib['title'], bib["author"].split(" and ")[0], bib['journal'], 164 | bib['year'], bib['cited_count'], f'{pdfs_path}/your_pdf_name.pdf', bib['url'] 165 | ) 166 | replace_dict[literature] = replaced_literature 167 | except: 168 | 169 | logger.info("Can not find a downloading source for literature id {}. You may need to manually download this paper, a template has been generated in the markdown file. Put the pdf file in the folder you specified just now and add its name in the '(pdf)' of your markdown entry.".format(literature_id)) 170 | replaced_literature = "- **{}**. {} et.al. **{}**, **{}**, **Number of Citations: **{}, ([pdf]({}))([link]({})).".format( 171 | bib['title'], bib["author"].split(" and ")[0], bib['journal'], 172 | bib['year'], bib['cited_count'], f'{pdfs_path}/your_pdf_name.pdf', bib['url'] 173 | ) 174 | replace_dict[literature] = replaced_literature 175 | else: 176 | logger.info("Can not find the literature {}. You may need to manually download this paper, a template has been generated in the markdown file. Put the pdf file in the folder you specified just now and add its name in the '(pdf)' of your markdown entry.".format(literature_id)) 177 | replaced_literature = "- **{}**. 
([pdf]({})).".format( 178 | literature_id, f'{pdfs_path}/your_pdf_name.pdf' 179 | ) 180 | replace_dict[literature] = replaced_literature 181 | return replace_dict -------------------------------------------------------------------------------- /easyliter.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: easyliter 3 | Version: 1.0.5 4 | Summary: EasyLiterature is a opensourced, Python-based command line tool for automatic literature management. Simply list the paper titles (or ids) you want to read in a markdown file and it will automatically collect and refine its information in the markdown file, download the pdf to your local machine, and link the pdf to your paper in the markdown file. You can forever keep your notes within the pdfs and mds on your local machine or cloud driver. 5 | Home-page: https://github.com/Psycoy/EasyLiterature 6 | Author: Oliver 7 | Author-email: olivernova1998@gmail.com 8 | License: AGPLv3 9 | Keywords: title,bibtex,arxiv,doi,science,scientific-journals 10 | Classifier: License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+) 11 | Classifier: Intended Audience :: Science/Research 12 | Classifier: Programming Language :: Python :: 3 13 | Classifier: Topic :: Text Processing :: Markup 14 | Description-Content-Type: text/markdown 15 | License-File: LICENSE 16 | 17 | # EasyLiterature 18 | **EasyLiterature** is a Python-based command line tool for automatic literature management. Welcome star or contribute! 19 | 20 | Simply list the paper titles (or ids) you want to read in a markdown file and it will automatically `collect and refine its information in the markdown file`, `download the pdf to your local machine`, and `link the pdf to your paper in the markdown file`. You can forever keep your notes within the pdfs and mds on your local machine or cloud driver. 21 | 22 |
23 | 24 | **A demo of the entries in your markdown note:** 25 | 26 | *(demo image: figures/demo.png)* 27 | 28 |
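In case the demo image above does not render, the block below is an illustrative sketch (not real tool output) of the entry format that EasyLiterature writes back into your note. The template mirrors the format string used by `get_update_content()` in `easy_literature/utils.py`; every field value here is a made-up placeholder.

```python
# Illustrative only: the markdown line EasyLiterature writes for a resolved paper.
# The template mirrors the format string in easy_literature/utils.py; all values
# below are placeholders, not real output.
entry_template = (
    "- **{title}**. {first_author} et.al. **{journal}**, **{year}**, "
    "**Number of Citations: **{cited_count}, ([pdf]({pdf_path}))([link]({url}))."
)

print(entry_template.format(
    title="BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding",
    first_author="Jacob Devlin",   # only the first author is kept
    journal="NAACL",
    year="2019",
    cited_count="N",               # citation count as reported by Google Scholar (placeholder)
    pdf_path="PDFs/BERT_Pre-training_of_Deep_Bidirectional_Transformers.pdf",
    url="https://arxiv.org/abs/1810.04805",
))
```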
29 | 30 | Inspired by [Mu Li](https://www.bilibili.com/video/BV1nA41157y4), adapted from [autoLiterature](https://github.com/wilmerwang/autoLiterature). 31 | Compared to autoLiterature, **EasyLiterature** is much easier to use and supports a wider range of features, such as `title-based paper matching`, `paper search and download on Google Scholar and DBLP` (the two main sites for scholars), `citation statistics`, `manual information update assistant`, etc. **EasyLiterature covers almost all papers thanks to the support of Google Scholar and DBLP!** 32 | 33 | ___ 34 | 35 | **中文版介绍:** 36 | 37 | **EasyLiterature** 是一个基于python的命令行文献管理工具,永久开源,欢迎star或contribute。 38 | 39 | 之前沐神(李沐)做过一期视频讲如何阅读文献和整理,我觉得讲得非常好,[链接](https://www.bilibili.com/video/BV1nA41157y4)。EasyLiterature基本基于沐神所述的这一流程实现,并丰富了其他功能。 40 | 41 | 简单来说,在 Markdown 文件中简单列出想要阅读的论文标题(或ID),它会自动收集并在Markdown文件中完善相关信息,下载论文的PDF到本地机器,并将PDF链接到Markdown文件中的论文。通过这样的流程,我们可以实现永久保存实时编辑的论文PDF和Markdown中的笔记,无论是在本地机器还是云端,并且方便论文一站式分类和管理。 42 | 43 |
44 | 45 | **markdown文件中的论文信息条目(示意):** 46 | 47 | *(示意图:figures/demo.png)* 48 | 49 |
50 | 51 | 与之前的实现相比,EasyLiterature兼容之前实现的所有功能,并且支持更多功能,比如:1. 基于标题的论文匹配;2. Google Scholar和DBLP(全球两大主要paper数据库)的论文搜索和下载;3. 引用统计;4. 手动信息更新助手;5. 容错搜索匹配;等等。之前的实现由于数据库的限制,很多文章都找不到。**EasyLiterature得益于增加了Google Scholar和DBLP的支持,几乎覆盖了所有论文!** 52 | 53 |

54 | 55 | ## 1. A Simple Usage Example (一个简单的使用示例) 56 | 1. Have Python installed on your local machine (preferably >= 3.7). 57 | 2. Run `pip install easyliter` in your command line to install. 58 | 3. Prepare your markdown note file (e.g., `Note.md`).
**Attention:** You may need to download a markdown editor to create/edit this file. I am using [Typora](https://typora.io/), which is not totally free. You can also choose other alternatives. 59 | 4. List the formatted paper titles in your markdown note file according to Section 4 below (Recognition Rules), e.g.,
60 | \- {{BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.}}
61 | \- {{Xlnet: Generalized autoregressive pretraining for language understanding.}}
62 | **(pay attention to the space after ‘\-’)** 63 | 5. Create a folder to store the downloaded pdfs (e.g., `PDFs/`). 64 | 6. Run `easyliter -i <path to your md file> -o <path to your pdf folder>`. 65 |
(Replace `<path to your md file>` with the actual path to your markdown note file, and `<path to your pdf folder>` with the actual path to your pdf folder.) 66 |
e.g., `easyliter -i "/home/Note.md" -o "/home/PDFs"` 67 | 7. You should be able to see the updated information and the downloaded pdf files if no error is reported. 68 | 8. This is a simple and common use case. For other features, please read the sections below carefully and follow the instructions. (A minimal Python sketch of the lookup step that the tool runs internally is shown right after this list.) 69 | 70 |
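Besides the command line, the same lookup can be driven from Python. The following is a minimal sketch under the assumption that `easyliter` is installed; the helpers are the ones defined in `easy_literature/downloads.py`, and the DOI is only an example identifier (any DOI, arXiv id, or plain title accepted by the recognition rules would do).

```python
# Minimal sketch: fetch the bib information (and optionally the PDF) for one paper,
# using the same helpers the easyliter CLI calls internally (easy_literature/downloads.py).
from easy_literature.downloads import (
    get_paper_info_from_paperid,
    get_paper_pdf_from_paperid,
)

paper_id = "10.1038/s41467-022-29269-6"  # example DOI; an arXiv id or a paper title also works

bib = get_paper_info_from_paperid(paper_id)  # returns a dict (title, author, journal, ...) or None
if bib:
    print(bib["title"], "|", bib["journal"], bib["year"])
    # Tries the direct pdf link first (when available), then falls back to the sci-hub mirrors.
    get_paper_pdf_from_paperid(paper_id, "PDFs/example.pdf", direct_url=bib.get("pdf_link"))
```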
71 | 72 | **中文版示例** 73 | 74 | 1. 在您的本地机器上安装 Python(版本 >= 3.7)。 75 | 2. 在命令行中运行 `pip install easyliter` 进行安装。 76 | 3. 准备您的 markdown 笔记文件(例如,`Note.md`)。
**注意**: 您需要下载一个 markdown 编辑器来创建/编辑此文件。我使用的是[Typora](https://typora.io/),它不是完全免费的。您也可以选择其他替代产品。 77 | 4. 根据下面第4节(识别规则)在您的 markdown 笔记文件中列出格式化的论文标题。例如:
78 | \- {{BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.}}
79 | \- {{Xlnet: Generalized autoregressive pretraining for language understanding.}}
80 | **(注意‘\-’后的空格)** 81 | 5. 创建一个文件夹来存储下载的 pdf 文件(例如,`PDFs/`)。 82 | 6. 运行 `easyliter -i <您的 md 文件路径> -o <您的 pdf 文件夹路径>`。 83 |
**注意**:将 `<您的 md 文件路径>` 替换为您 markdown 笔记文件的实际路径,将 `<您的 pdf 文件夹路径>` 替换为您 pdf 文件夹的实际路径。 84 |
例如:`easyliter -i "/home/Note.md" -o "/home/PDFs"` 85 | 7. 如果没有报错,您应该能够看到更新的信息和下载的 pdf 文件。 86 | 8. 这是一个简单、常用的使用案例。有关其他功能或使用情形,请仔细阅读以下部分并按照说明操作。 87 | 88 | ## 2. Install (安装) 89 | ### pip install 90 | ```bash 91 | pip install easyliter 92 | # or 93 | pip3 install easyliter 94 | ``` 95 | 96 | ### install from source (to get the up-to-date version) 97 | ```bash 98 | git clone https://github.com/Psycoy/EasyLiterature.git 99 | cd EasyLiterature 100 | pip install -e . 101 | ``` 102 | 103 | ## 3. Arguments(使用参数) 104 | ```bash 105 | easyliter 106 | 107 | optional arguments: 108 | 109 | -h, --help show this help message and exit 110 | 111 | -i INPUT, --input INPUT 112 | The path to the note file or note file folder. 113 | 114 | -o OUTPUT, --output OUTPUT 115 | Folder path to save paper pdfs and images. NOTE: MUST BE A FOLDER. 116 | 117 | -p PROXY, --proxy PROXY 118 | The proxy, e.g., 127.0.0.1:1080. If this argument is specified, Google Scholar will automatically use a free proxy (not necessarily the specified proxy address). To use other proxies for Google Scholar, specify the -gp option. If you want to set up the proxies manually, change the behaviour in GoogleScholar.set_proxy(). See more at https://scholarly.readthedocs.io/en/stable/ProxyGenerator.html. 119 | 120 | -gp GPROXY_MODE, --gproxy_mode GPROXY_MODE 121 | The proxy type used for scholarly, e.g., free, single, Scraper. (Note: 1. free will automatically choose a free proxy address, which costs nothing but may not be fast. 2. single will use the proxy address you specify. 3. Scraper is not free to use and requires buying an API key.) 122 | 123 | -d, --delete 124 | Delete unreferenced attachments in notes. Use with caution: when used, -i must be a folder path including all notes. 125 | 126 | -m MIGRATION, --migration MIGRATION 127 | The pdf folder path you want to reconnect to. 128 | ``` 129 | 130 | 131 | ## 4. Recognition Rules (识别规则): 132 | - If the notes file contains `- {paper_id}`, it will download the information of that literature, but not the PDF. 133 | - If the notes file contains `- {{paper_id}}`, it will download both the information of that literature and the PDF. 134 | 135 | - Note: `paper_id` supports `article title`, published articles' `doi`, and pre-published articles' `arxiv_id`, `biorxiv_id`, and `medrxiv_id`. It will try all the possible sources online. 136 | 137 | ___ 138 | 139 | - 当笔记文件中包含 `- {paper_id}`时候,会下载该文献的信息,不下载PDF。 140 | - 当笔记文件中包含 `- {{paper_id}}`时候,会下载该文献的信息,以及PDF。 141 | 142 | - 注意:`paper_id` 支持`文章标题`,已发表文章的`doi`, 预发布文章的`arxiv_id`, `biorxiv_id`, `medrxiv_id`。EasyLiterature会从多个数据库自动识别需要收集和下载的论文,几乎覆盖所有目前存在的论文。 143 | 144 | 145 | ## 5. Usage(使用) 146 | ### 5.1. Basic Usage(基本使用) 147 | Assuming `input` is the folder path of the literature notes (.md files) and `output` is the folder path where you want to save the PDFs. 148 | 149 | 假设`input`为文献笔记(md文件)的文件夹路径,`output`为要保存PDF的文件夹路径。 150 | 151 | ```bash 152 | # Update all md files in the input folder 153 | # 更新input文件夹下所有md文件 154 | easyliter -i input -o output 155 | 156 | # Only update the input/example.md file 157 | # 仅更新input/example.md文件 158 | easyliter -i input/example.md -o output 159 | 160 | # -d is an optional flag; when -i is a folder path, using -d will delete pdf files in the PDF folder that are not referenced in the literature notes 161 | # -d 是个可选项,当 -i 是文件夹路径时候,使用 -d 会删除PDF文件夹下和文献笔记内容无关的pdf文件 162 | easyliter -i input -o output -d 163 | ``` 164 | 165 | ### 5.2. Migrating Notes and PDF Files(笔记和pdf文件的迁移)
166 | When you need to move the literature notes or the PDF folder, the links to the PDFs in the literature notes might become unusable. You can use `-m` to re-link the PDF files with the literature notes. 167 | 168 | 当要移动文献笔记或者PDF文件夹的时候,文献笔记中的PDF链接可能会变的无法使用。可以使用`-m`来重新关联PDF文件和文献笔记。 169 | 170 | ```bash 171 | # Update all md files in the input folder 172 | # 更新input文件夹下所有md文件 173 | easyliter -i input -m movedPDFs/ 174 | 175 | # Only update the input/example.md file 176 | # 仅更新input/example.md文件 177 | easyliter -i input/example.md -m movedPDFs/ 178 | ``` 179 | 180 | ## 6. Note (注意事项) 181 | 182 | 1. For users from mainland China, the Google Scholar feature may need a VPN to work (the citation function is based on Google Scholar). If you don't use a VPN, some features may be unavailable, but the tool is still usable. 183 | 184 | - 对于来自中国大陆的用户,Google Scholar相关功能可能需要 VPN 才能正常工作(引用功能基于 Google scholar)。如果没有挂VPN,某些功能可能会丢失,但不完全影响使用。 185 | 186 | 2. If your Google Scholar is not working (usually caused by too-frequent requests to the Google Scholar API), try to set a proxy for it. Check out the help for the `-p` and `-gp` options using `easyliter -h`. See more at the 'Using proxies' section of https://scholarly.readthedocs.io/en/stable/quickstart.html. 187 | 188 | - 如果Google Scholar 无法使用(通常由于对Google Scholar API的访问过于频繁),尝试为其设置代理。使用 easyliter -h 查看 -p 和 -gp 选项的帮助信息来设置代理。详见 https://scholarly.readthedocs.io/en/stable/quickstart.html 的 Using proxies部分。 189 | -------------------------------------------------------------------------------- /easyliter.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | README.md 3 | setup.py 4 | easy_literature/DBLP.py 5 | easy_literature/GoogleScholar.py 6 | easy_literature/Scholarly.py 7 | easy_literature/__init__.py 8 | easy_literature/arxiv.py 9 | easy_literature/crossref.py 10 | easy_literature/dblp_source.py 11 | easy_literature/downloads.py 12 | easy_literature/easyliter.py 13 | easy_literature/medbiorxiv.py 14 | easy_literature/pdfs.py 15 | easy_literature/utils.py 16 | easyliter.egg-info/PKG-INFO 17 | easyliter.egg-info/SOURCES.txt 18 | easyliter.egg-info/dependency_links.txt 19 | easyliter.egg-info/entry_points.txt 20 | easyliter.egg-info/requires.txt 21 | easyliter.egg-info/top_level.txt -------------------------------------------------------------------------------- /easyliter.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /easyliter.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | easyliter = easy_literature.easyliter:main 3 | -------------------------------------------------------------------------------- /easyliter.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.11.1 2 | feedparser>=6.0.10 3 | urllib3>=1.26.11 4 | requests>=2.28.1 5 | tqdm>=4.64.0 6 | Unidecode>=1.3.4 7 | bibtexparser==1.4.0 8 | pandas 9 | scholarly 10 | -------------------------------------------------------------------------------- /easyliter.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | easy_literature 2 | -------------------------------------------------------------------------------- /figures/.DS_Store:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JinjieNi/EasyLiterature/84ee3e8731430756c3b464d5906c8a1c4378e862/figures/.DS_Store -------------------------------------------------------------------------------- /figures/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JinjieNi/EasyLiterature/84ee3e8731430756c3b464d5906c8a1c4378e862/figures/demo.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.11.1 2 | feedparser>=6.0.10 3 | urllib3>=1.26.11 4 | requests>=2.28.1 5 | tqdm>=4.64.0 6 | Unidecode>=1.3.4 7 | pandas 8 | scholarly 9 | bibtexparser==1.4.0 10 | socksio -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('README.md', 'r', encoding='UTF-8') as f: 4 | README_MD = f.read() 5 | 6 | setup( 7 | name="easyliter", 8 | version="1.0.5", 9 | description="EasyLiterature is an open-sourced, Python-based command line tool for automatic literature management. Simply list the paper titles (or ids) you want to read in a markdown file and it will automatically collect and refine their information in the markdown file, download the pdf to your local machine, and link the pdf to your paper in the markdown file. You can forever keep your notes within the pdfs and mds on your local machine or cloud drive.", 10 | long_description=README_MD, 11 | long_description_content_type='text/markdown', 12 | url="https://github.com/Psycoy/EasyLiterature", 13 | classifiers=[ 14 | "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", 15 | "Intended Audience :: Science/Research", 16 | "Programming Language :: Python :: 3", 17 | "Topic :: Text Processing :: Markup", 18 | ], 19 | install_requires=["beautifulsoup4>=4.11.1", "feedparser>=6.0.10", 20 | "urllib3>=1.26.11","requests>=2.28.1", 21 | "tqdm>=4.64.0", "Unidecode>=1.3.4", "bibtexparser==1.4.0", "pandas", "scholarly"], 22 | entry_points={ 23 | "console_scripts": [ 24 | "easyliter = easy_literature.easyliter:main", 25 | ] 26 | }, 27 | packages=find_packages(), 28 | license="AGPLv3", 29 | author="Oliver", 30 | author_email="jinjieni@outlook.com", 31 | keywords=["title", "bibtex", "arxiv", "doi", "science", "scientific-journals"], 32 | ) 33 | --------------------------------------------------------------------------------