├── .DS_Store
├── .gitignore
├── LICENSE
├── README.md
├── build
│   ├── .DS_Store
│   └── lib
│       └── easy_literature
│           ├── DBLP.py
│           ├── GoogleScholar.py
│           ├── Scholarly.py
│           ├── __init__.py
│           ├── arxiv.py
│           ├── crossref.py
│           ├── dblp_source.py
│           ├── dlbp.py
│           ├── downloads.py
│           ├── easyliter.py
│           ├── medbiorxiv.py
│           ├── pdfs.py
│           └── utils.py
├── easy_literature
│   ├── DBLP.py
│   ├── GoogleScholar.py
│   ├── Scholarly.py
│   ├── __init__.py
│   ├── arxiv.py
│   ├── crossref.py
│   ├── dblp_source.py
│   ├── downloads.py
│   ├── easyliter.py
│   ├── medbiorxiv.py
│   ├── pdfs.py
│   └── utils.py
├── easyliter.egg-info
│   ├── PKG-INFO
│   ├── SOURCES.txt
│   ├── dependency_links.txt
│   ├── entry_points.txt
│   ├── requires.txt
│   └── top_level.txt
├── figures
│   ├── .DS_Store
│   └── demo.png
├── requirements.txt
└── setup.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinjieNi/EasyLiterature/84ee3e8731430756c3b464d5906c8a1c4378e862/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Jinjie Ni
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # EasyLiterature
2 | **EasyLiterature** is a Python-based command line tool for automatic literature management. Stars and contributions are welcome!
3 |
4 | Simply list the paper titles (or IDs) you want to read in a markdown file and it will automatically `collect and refine their information in the markdown file`, `download the pdf to your local machine`, and `link the pdf to your paper in the markdown file`. You can keep your notes within the pdfs and markdown files on your local machine or cloud drive forever.
5 |
6 |
7 |
8 | **A demo of the entries in your markdown note:**
9 |
10 | ![demo](figures/demo.png)
11 |
12 |
13 |
14 | Inspired by [Mu Li](https://www.bilibili.com/video/BV1nA41157y4), adapted from [autoLiterature](https://github.com/wilmerwang/autoLiterature).
15 | Compared to autoLiterature, **EasyLiterature** is much easier to use and supports a wider range of features, such as `title-based paper match`, `paper search and download on Google Scholar and DBLP` (the two main sites for scholars), `citation statistics`, `manual information update assistant`, etc. **EasyLiterature covers almost all papers thanks to the support of Google Scholar and DBLP!**
16 |
17 | ___
18 |
19 | **中文版介绍:**
20 |
21 | **EasyLiterature** 是一个基于python的命令行文件管理工具,永久开源,欢迎star或contribute。
22 |
23 | 之前沐神(李沐)做过一期视频讲如何阅读文献和整理,我觉得讲得非常好,[链接](https://www.bilibili.com/video/BV1nA41157y4)。EasyLiterature基本基于沐神所述的这一流程实现,并丰富了其他功能。
24 |
25 | 简单来说,在 Markdown 文件中简单列出想要阅读的论文标题(或ID),它会自动收集并在Markdown文件中完善相关信息,下载论文的PDF到本地机器,并将PDF链接到Markdown文件中的论文。通过这样的流程,我们可以实现永久保存实时编辑的论文PDF和Markdown中的笔记,无论是在本地机器还是云端,并且方便论文一站式分类和管理。
26 |
27 |
28 |
29 | **markdown文件中的论文信息条目(示意):**
30 |
31 | ![demo](figures/demo.png)
32 |
33 |
34 |
35 | 与之前的实现相比,EasyLiterature兼容之前实现的所有功能,并且支持更多功能,比如:1. 基于标题的论文匹配;2. Google Scholar和DBLP(全球两大主要paper数据库)的论文搜索和下载;3. 引用统计;4. 手动信息更新助手;5. 容错搜索匹配;等等。之前的实现由于数据库的限制,很多文章都找不到。**EasyLiterature得益于增加了Google Scholar和DBLP的支持,几乎覆盖了所有论文!**
36 |
37 |
38 |
39 | ## 1. A Simple Usage Example (一个简单的使用示例)
40 | 1. Have Python installed on your local machine (preferably version >= 3.7).
41 | 2. Run `pip install easyliter` in your command line to install.
42 | 3. Prepare your markdown note file (e.g., `Note.md`).
**Attention:** You may need to download a markdown editor to create/edit this file. I am using [Typora](https://typora.io/), which is not totally free. You can also choose other alternatives.
43 | 4. List the formatted paper titles in your markdown note file according to Section 4 below (Recognition Rules), e.g.,
44 | \- {{BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.}}
45 | \- {{Xlnet: Generalized autoregressive pretraining for language understanding.}}
46 | **(pay attention to the space after ‘\-’)**
47 | 5. Create a folder to store the downloaded pdfs (e.g., `PDFs/`).
48 | 6. Run `easyliter -i <path to your md file> -o <path to your pdf folder>`.
49 |
(Replace `<path to your md file>` with the actual path to your markdown note file and `<path to your pdf folder>` with the actual path to your pdf folder.)
50 |
e.g., `easyliter -i "/home/Note.md" -o "/home/PDFs"`
51 | 7. You should be able to see the updated information and the downloaded pdf files if no error is reported.
52 | 8. This is a simple and common use case. For other features, please read the sections below carefully and follow the instructions.
53 |
54 |
55 |
56 | **中文版示例**
57 |
58 | 1. 在您的本地机器上安装 Python(版本 >= 3.7)。
59 | 2. 在命令行中运行 `pip install easyliter` 进行安装。
60 | 3. 准备您的 markdown 笔记文件(例如,`Note.md`)。
**注意**: 您需要下载一个 markdown 编辑器来创建/编辑此文件。我使用的是[Typora](https://typora.io/),它不是完全免费的。您也可以选择其他替代产品。
61 | 4. 根据下面第4节(识别规则)在您的 markdown 笔记文件中列出格式化的论文标题。例如:
62 | \- {{BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.}}
63 | \- {{Xlnet: Generalized autoregressive pretraining for language understanding.}}
64 | **(注意‘\-’后的空格)**
65 | 5. 创建一个文件夹来存储下载的 pdf 文件(例如,`PDFs/`)。
66 | 6. 运行 `easyliter -i <您的 md 文件路径> -o <您的 pdf 文件夹路径>`。
67 |
**注意**:将 `<您的 md 文件路径>` 替换为您 markdown 笔记文件的实际路径,将 `<您的 pdf 文件夹路径>` 替换为您 pdf 文件夹的实际路径。
68 |
例如:`easyliter -i "/home/Note.md" -o "/home/PDFs"`
69 | 7. 如果没有报错,您应该能够看到更新的信息和下载的 pdf 文件。
70 | 8. 这是一个简单、常用的使用案例。有关其他功能或使用情形,请仔细阅读以下部分并按照说明操作。
71 |
72 | ## 2. Install (安装)
73 | ### pip install
74 | ```bash
75 | pip install easyliter
76 | # or
77 | pip3 install easyliter
78 | ```
79 |
80 | ### install from source (to get the up-to-date version)
81 | ```bash
82 | git clone https://github.com/Psycoy/EasyLiterature.git
83 | cd EasyLiterature
84 | pip install -e .
85 | ```
86 |
87 | ## 3. Arguments(使用参数)
88 | ```bash
89 | easyliter
90 |
91 | optional arguments:
92 |
93 | -h, --help show this help message and exit
94 |
95 | -i INPUT, --input INPUT
96 | The path to the note file or note file folder.
97 |
98 | -o OUTPUT, --output OUTPUT
99 | Folder path to save paper pdfs and images. NOTE: MUST BE FOLDER.
100 |
101 | -p PROXY, --proxy PROXY
102 | The proxy address, e.g. 127.0.0.1:1080. If this argument is specified, Google Scholar will automatically use a free proxy (not necessarily the specified proxy address). To use other proxies for Google Scholar, specify the -gp option. If you want to set up the proxies manually, change the behaviour in GoogleScholar.set_proxy(). See more at https://scholarly.readthedocs.io/en/stable/ProxyGenerator.html.
103 |
104 | -gp GPROXY_MODE, --gproxy_mode GPROXY_MODE
105 | The proxy type used for scholarly, e.g., free, single, Scraper. (Note: 'free' automatically chooses a free proxy address, which costs nothing but may not be fast; 'single' uses the proxy address you specify; 'Scraper' is not free and requires a ScraperAPI key.)
106 |
107 | -d, --delete
108 | Delete unreferenced attachments in notes. Use with caution; when used, -i must be a folder path including all notes.
109 |
110 | -m MIGRATION, --migration MIGRATION
111 | The pdf folder path you want to reconnect to.
112 | ```
113 |
114 |
115 | ## 4. Recognition Rules (识别规则):
116 | - If the notes file contains `- {paper_id}`, it will download the information of that literature, but not the PDF.
117 | - If the notes file contains `- {{paper_id}}`, it will download both the information of that literature and the PDF.
118 |
119 | - Note: `paper_id` supports `article title`, published articles' `doi`, and preprints' `arxiv_id`, `biorxiv_id`, and `medrxiv_id`. It will try all the possible sources online (see the example entries below).
120 |
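For illustration, a note file that follows these rules might contain entries like the following (the DOI and arXiv ID here are just examples; single braces fetch only the information, double braces also download the PDF):

```
- {10.1145/3308558.3313562}
- {{1810.04805}}
- {{BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.}}
```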
121 | ___
122 |
123 | - 当笔记文件中包含 `- {paper_id}`时候,会下载该文献的信息,不下载PDF。
124 | - 当笔记文件中包含 `- {{paper_id}}`时候,会下载该文献的信息,以及PDF。
125 |
126 | - 注意:`paper_id` 支持`文章标题`,已发表文章的`doi`, 预发布文章的`arxiv_id`, `biorxiv_id`, `medrxiv_id`。EasyLiterature会从多个数据库自动识别需要收集和下载的论文,几乎覆盖所有目前存在的论文。
127 |
128 |
129 | ## 5. Usage(使用)
130 | ### 5.1. Basic Usage(基本使用)
131 | Assuming `input` is the folder path of the literature notes (.md files) and `output` is the folder path where you want to save the PDFs.
132 |
133 | 假设`input`为文献笔记(md文件)的文件夹路径,`output`为要保存PDF的文件夹路径。
134 |
135 | ```bash
136 | # Update all md files in the input folder
137 | # 更新input文件夹下所有md文件
138 | easyliter -i input -o output
139 |
140 | # Only update the input/example.md file
141 | # 仅更新input/example.md文件
142 | easyliter -i input/example.md -o output
143 |
144 | # -d is an optional flag; when -i is a folder path, using -d will delete pdf files in the PDF folder that are not referenced in the literature notes
145 | # -d 是个可选项,当 -i 是文件夹路径时候,使用 -d 会删除PDF文件夹下和文献笔记内容无关的pdf文件
146 | easyliter -i input -o output -d
147 | ```
148 |
149 | ### 5.2. Migrating Notes and PDF Files(笔记和pdf文件的迁移)
150 | When you need to move the literature notes or the PDF folder, the links to the PDFs in the literature notes might become unusable. You can use `-m` to re-link the PDF files with the literature notes.
151 |
152 | 当要移动文献笔记或者PDF文件夹的时候,文献笔记中的PDF链接可能会变的无法使用。可以使用`-m`来重新关联PDF文件和文献笔记。
153 |
154 | ```bash
155 | # Update all md files in the input folder
156 | # 更新input文件夹下所有md文件
157 | easyliter -i input -m movedPDFs/
158 |
159 | # Only update the input/example.md file
160 | # 仅更新input/example.md文件
161 | easyliter -i input/example.md -m movedPDFs/
162 | ```
163 |
164 | ## 6. Note (注意事项)
165 |
166 | 1. For users from mainland China, the Google Scholar features may need a VPN to work (the citation function is based on Google Scholar). If you don't have a VPN, some features may be unavailable.
167 |
168 | - 对于来自中国大陆的用户,Google Scholar相关功能可能需要 VPN 才能正常工作(引用功能基于 Google scholar)。如果没有挂VPN,某些功能可能会丢失,但不完全影响使用。
169 |
170 | 2. If Google Scholar is not working (usually caused by too-frequent requests to the Google Scholar API), try to set a proxy for it. Check out the help for the `-p` and `-gp` options using `easyliter -h`. See more at the 'Using proxies' section of https://scholarly.readthedocs.io/en/stable/quickstart.html (a minimal configuration sketch is shown at the end of this section).
171 |
172 | - 如果Google Scholar 无法使用(通常由于对Google Scholar API的访问过于频繁),尝试为其设置代理。使用 easyliter -h 查看 -p 和 -gp 选项的帮助信息来设置代理。详见 https://scholarly.readthedocs.io/en/stable/quickstart.html 的 Using proxies部分。
173 |
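If you want to configure the Google Scholar proxy yourself, the snippet below is a minimal sketch of what `GoogleScholar.set_proxy()` does internally with the `scholarly` package (the single-proxy address is a placeholder; adapt the logic inside that method if you need a different behaviour):

```python
from scholarly import scholarly, ProxyGenerator

pg = ProxyGenerator()
# Option 1: rotate through free public proxies (no cost, but possibly slow or unstable).
success = pg.FreeProxies()
# Option 2: route through a single proxy you control (placeholder address):
# success = pg.SingleProxy(http="127.0.0.1:1080", https="127.0.0.1:1080")
scholarly.use_proxy(pg)
print(f"Proxy setup success: {success}")
```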
--------------------------------------------------------------------------------
/build/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinjieNi/EasyLiterature/84ee3e8731430756c3b464d5906c8a1c4378e862/build/.DS_Store
--------------------------------------------------------------------------------
/build/lib/easy_literature/DBLP.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from urllib.request import ProxyHandler
3 | from . import dblp_source as dblp
4 | import pandas as pd
5 |
6 |
7 | logging.basicConfig()
8 | logger = logging.getLogger('DBLP')
9 | logger.setLevel(logging.DEBUG)
10 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'}
11 |
12 | class DBLPInfo(object):
13 |
14 | def set_proxy(self, proxy_address = None):
15 | """set proxy handler
16 |
17 |         Args:
18 |             proxy_address (str): The proxy address, e.g. 127.0.0.1:1123
19 | 
20 |         Returns:
21 |             None. Currently a placeholder; DBLP requests are made without a proxy.
22 | """
23 | pass
24 |
25 |
26 | def extract_json_info(self, item):
27 |         """Search DBLP by the given paper title and extract bib information.
28 | 
29 |         Args:
30 |             item (str): The paper title to search for on DBLP.
31 | 
32 |         Returns:
33 |             A dict containing the paper information, or None if nothing is found.
34 | """
35 | trial_num = 0
36 |         while trial_num < 10:  # retry the DBLP query up to 10 times
37 |             trial_num += 1
38 |             try:
39 |                 results = dblp.search([item])
40 |                 break
41 |             except Exception:
42 | if trial_num == 10:
43 | results = pd.DataFrame({'A' : []})
44 | else:
45 | pass
46 |
47 |
48 |
49 |         if not results.empty:
50 |             # Prefer a non-CoRR venue when one exists; otherwise fall back to the first venue.
51 |             journal = str(results['Where'][0])
52 |             if 'CoRR' in [str(venue) for venue in results['Where']]:
53 |                 for venue in results['Where']:
54 |                     if str(venue) != 'CoRR':
55 |                         journal = str(venue)
56 |                         break
57 | 
58 | bib_dict = {
59 | "title": str(results['Title'][0]),
60 | "author": ' and '.join([str(Entry) for Entry in results['Authors'][0]]),
61 | "journal": journal,
62 | "year": str(results['Year'][0]),
63 | "url": str(results['Link'][0]),
64 | "pdf_link": None,
65 | "cited_count": None
66 | }
67 | else:
68 | bib_dict = None
69 | return bib_dict
70 |
71 |
72 | def get_info_by_title(self, title):
73 | """Get the meta information by the given paper title.
74 |
75 | Args:
76 |             title (str): The paper title
77 |
78 | Returns:
79 | A dict containing the paper information.
80 | {
81 | "title": xxx,
82 | "author": xxx,
83 | "journal": xxx,
84 | etc
85 | }
86 | OR
87 | None
88 | OR
89 | A list [{}, {}, {}]
90 | """
91 | return self.extract_json_info(title)
92 |
93 |
94 | if __name__ == "__main__":
95 | # arxivId = "2208.05623"
96 | # title = "Heterogeneous Graph Attention Network"
97 |
98 | # gscholar_info = GscholarInfo()
99 | # gscholar_info.set_proxy(proxy_name='single')
100 |
101 | # bib_arxiv = gscholar_info.get_info_by_title(title)
102 | # # bib_title = arxiv_info.get_info_by_title(title)
103 |
104 | # print(bib_arxiv)
105 | # print("\n")
106 | # # print(bib_title)
107 | results = dblp.search(["Finetunedlanguage models are zero-shot learners"])
108 |
109 | print(results)
--------------------------------------------------------------------------------
/build/lib/easy_literature/GoogleScholar.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from scholarly import scholarly, ProxyGenerator
3 |
4 |
5 | logging.basicConfig()
6 | logger = logging.getLogger('GoogleScholar')
7 | logger.setLevel(logging.DEBUG)
8 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'}
9 |
10 | class GscholarInfo(object):
11 |
12 | def set_proxy(self, proxy_name = "free", proxy_address = None):
13 | """set proxy handler
14 |
15 |         Args:
16 |             proxy_name (str): The proxy mode: 'free', 'single', or 'Scraper'.
17 |             proxy_address (str): The proxy address, e.g. 127.0.0.1:1080,
18 |                 only used when proxy_name is 'single'.
19 | 
20 | """
21 | # TODO find a better proxy strategy
22 | if proxy_address:
23 |             success = False
24 |             pg = ProxyGenerator()
25 |             if proxy_name == "free":
26 |                 success = pg.FreeProxies()
27 |             elif proxy_name == "single":
28 |                 success = pg.SingleProxy(http=proxy_address, https=proxy_address)
29 |             elif proxy_name == "Scraper":
30 |                 success = pg.ScraperAPI('a44bd5be9f56b1be9d6e40116ea4b440')
31 |             logger.info(f'Scholarly using {proxy_name} proxy.')
32 |             logger.info(f'Proxy setup success: {success}.')
33 | scholarly.use_proxy(pg)
34 |
35 |
36 | def extract_json_info(self, item):
37 | """Extract bib json information from requests.get().json()
38 |
39 | Args:
40 | item (json object): obtained by requests.get().json()
41 |
42 | Returns:
43 | A dict containing the paper information.
44 | """
45 | bib_dict = None
46 | trial_num = 0
47 |
48 | while trial_num<9:
49 | try:
50 | trial_num+=1
51 | pubs_iter = scholarly.search_pubs(item)
52 | dictinfo = next(pubs_iter)
53 | # logger.info(dictinfo)
54 | bib_dict = {
55 | "title": dictinfo['bib']['title'].replace('\n', ''),
56 | "author": ' and '.join(dictinfo['bib']['author']),
57 | "journal": dictinfo['bib']['venue'],
58 | "year": dictinfo['bib']['pub_year'],
59 | "url": dictinfo['pub_url'],
60 | "pdf_link": dictinfo['eprint_url'],
61 | "cited_count": dictinfo['num_citations']
62 | }
63 | break
64 | except:
65 | pass
66 |
67 | return bib_dict
68 |
69 |
70 |
71 | def get_info_by_title(self, title):
72 | """Get the meta information by the given paper title.
73 |
74 | Args:
75 |             title (str): The paper title
76 |
77 | Returns:
78 | A dict containing the paper information.
79 | {
80 | "title": xxx,
81 | "author": xxx,
82 | "journal": xxx,
83 | etc
84 | }
85 | OR
86 | None
87 | OR
88 | A list [{}, {}, {}]
89 | """
90 | return self.extract_json_info(title)
91 |
92 |
93 | if __name__ == "__main__":
94 | arxivId = "2208.05623"
95 | title = "Heterogeneous Graph Attention Network"
96 |
97 | gscholar_info = GscholarInfo()
98 | gscholar_info.set_proxy(proxy_name='free')
99 |
100 | bib_arxiv = gscholar_info.get_info_by_title(title)
101 | # bib_title = arxiv_info.get_info_by_title(title)
102 |
103 | print(bib_arxiv)
104 | print("\n")
105 | # print(bib_title)
--------------------------------------------------------------------------------
/build/lib/easy_literature/Scholarly.py:
--------------------------------------------------------------------------------
1 | import json
2 | from scholarly import scholarly
3 | from scholarly import ProxyGenerator
4 |
5 | # Set up a ProxyGenerator object to use free proxies
6 | # This needs to be done only once per session
7 | pg = ProxyGenerator()
8 |
9 | success = pg.FreeProxies()
10 | # print(f'Proxy setup success: {success}.')
11 | scholarly.use_proxy(pg)
12 |
13 | # will paginate to the next page by default
14 | pubs_iter = scholarly.search_pubs("1810.04805")
15 |
16 |
17 | print(json.dumps(next(pubs_iter), indent=2))
18 |
--------------------------------------------------------------------------------
/build/lib/easy_literature/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinjieNi/EasyLiterature/84ee3e8731430756c3b464d5906c8a1c4378e862/build/lib/easy_literature/__init__.py
--------------------------------------------------------------------------------
/build/lib/easy_literature/arxiv.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from urllib.request import ProxyHandler
3 | import feedparser
4 | try:
5 | from urllib import quote
6 | except ImportError:
7 | from urllib.parse import quote
8 | from unidecode import unidecode
9 |
10 | from .crossref import crossrefInfo
11 |
12 |
13 | logging.basicConfig()
14 | logger = logging.getLogger('arxiv')
15 | logger.setLevel(logging.DEBUG)
16 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'}
17 |
18 | class arxivInfo(object):
19 | def __init__(self):
20 | self.base_url = "http://export.arxiv.org/api/query"
21 |
22 | def set_proxy_handler(self, proxy):
23 | """set proxy handler
24 |
25 |         Args:
26 |             proxy (str): The proxy address, e.g. 127.0.0.1:1123
27 |
28 | Returns:
29 | A proxy handler object.
30 | """
31 | proxy_handler = ProxyHandler({"http": f"http://{proxy}",
32 | "https": f"https://{proxy}"})
33 | return proxy_handler
34 |
35 |
36 | def extract_json_info(self, item):
37 | """Extract bib json information from requests.get().json()
38 |
39 | Args:
40 | item (json object): obtained by requests.get().json()
41 |
42 | Returns:
43 | A dict containing the paper information.
44 | """
45 | paper_url = item.link
46 | title = item.title
47 | journal = "arxiv"
48 | published = item.published.split("-")
49 | if len(published) > 1:
50 | year = published[0]
51 | else:
52 | year = ' '
53 |
54 | authors = item.authors
55 | if len(authors) > 0:
56 | first_author = authors[0]["name"].split(" ")
57 | authors = " and ".join([author["name"] for author in authors])
58 | else:
59 | first_author = authors
60 | authors = authors
61 |
62 | bib_dict = {
63 | "title": title,
64 | "author": authors,
65 | "journal": journal,
66 | "year": year,
67 | "url": paper_url,
68 | "pdf_link": item.link.replace("abs", "pdf")+".pdf",
69 | "cited_count": None
70 | }
71 |
72 | return bib_dict
73 |
74 |
75 | def get_info_by_arxivid(self, arxivId, handler=False):
76 | """Get the meta information by the given paper arxiv_id.
77 |
78 | Args:
79 |             arxivId (str): The arXiv Id
80 | handler (handler object): use proxy
81 |
82 | Returns:
83 | A dict containing the paper information.
84 | {
85 | "title": xxx,
86 | "author": xxx,
87 | "journal": xxx,
88 | etc
89 | }
90 | OR
91 | None
92 | """
93 |
94 | params = "?search_query=id:"+quote(unidecode(arxivId))
95 |
96 | try:
97 | if handler:
98 | result = feedparser.parse(self.base_url + params, handlers=[handler])
99 | else:
100 | result = feedparser.parse(self.base_url + params)
101 | items = result.entries
102 |
103 | item = items[0]
104 | if "arxiv_doi" in item:
105 | doi = item["arxiv_doi"]
106 |
107 | crossref_info = crossrefInfo()
108 | if handler:
109 | crossref_info.set_proxy(proxy=handler.proxies["http"].split('//')[-1])
110 | return crossref_info.get_info_by_doi(doi)
111 | else:
112 | return self.extract_json_info(item)
113 | except:
114 |             logger.error("Failed to fetch information for arXiv ID: {}.".format(arxivId))
115 |
116 |
117 | def get_info_by_title(self, title, field='ti'):
118 | """Get the meta information by the given paper title.
119 |
120 | Args:
121 |             title (str): The paper title
122 |
123 | Returns:
124 | A dict containing the paper information.
125 | {
126 | "title": xxx,
127 | "author": xxx,
128 | "journal": xxx,
129 | etc
130 | }
131 | OR
132 | None
133 | OR
134 | A list [{}, {}, {}]
135 | """
136 | params = "?search_query="+field+":"+quote(unidecode(title))
137 | url = self.base_url + params
138 | try:
139 | result = feedparser.parse(url)
140 | items = result.entries
141 | print(len(items))
142 |
143 | for i, item in enumerate(items):
144 |
145 | title_item = item.title
146 | try:
147 | title_item = title_item.decode("utf-8")
148 | except:
149 | pass
150 |
151 | item.title = title_item
152 |
153 | if title_item.lower() == title.lower():
154 | return self.extract_json_info(item)
155 |
156 | items[i] = item
157 |
158 | return [self.extract_json_info(it) for it in items]
159 | except:
160 |             logger.error("Failed to fetch information for title: {}.".format(title))
161 |
162 |
163 | if __name__ == "__main__":
164 | arxivId = "2208.05623"
165 | title = "Heterogeneous Graph Attention Network"
166 |
167 | arxiv_info = arxivInfo()
168 | arxiv_info.set_proxy_handler(proxy="127.0.1:1123")
169 |
170 | bib_arxiv = arxiv_info.get_info_by_arxivid(arxivId)
171 |
172 | print(bib_arxiv)
173 | print("\n")
--------------------------------------------------------------------------------
/build/lib/easy_literature/crossref.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import requests
3 | #
4 | # 1. get info by doi
5 | # 2. get info by title
6 |
7 | logging.basicConfig()
8 | logger = logging.getLogger('crossref')
9 | logger.setLevel(logging.DEBUG)
10 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'}
11 |
12 | class crossrefInfo(object):
13 | def __init__(self):
14 | self.sess = requests.Session()
15 | self.sess.headers = HEADERS
16 | self.base_url = "http://api.crossref.org/"
17 |
18 | def set_proxy(self, proxy=None):
19 | """set proxy for session
20 |
21 | Args:
22 |             proxy (str): The proxy address, e.g. 127.0.0.1:1123
23 | Returns:
24 | None
25 | """
26 | if proxy:
27 | self.sess.proxies = {
28 | "http": proxy,
29 | "https": proxy, }
30 |
31 |
32 | def extract_json_info(self, bib):
33 | """Extract bib json information from requests.get().json()
34 |
35 | Args:
36 | bib (json object): obtained by requests.get().json()
37 |
38 | Returns:
39 | A dict containing the paper information.
40 | """
41 | pub_date = [str(i) for i in bib['published']["date-parts"][0]]
42 | pub_date = '-'.join(pub_date)
43 |
44 | if 'author' in bib.keys():
45 |             authors = ' and '.join([i["family"] + " " + i['given'] for i in bib['author'] if "family" in i and "given" in i])  # keep only authors that have both name parts
46 | else:
47 | authors = "No author"
48 |
49 | if 'short-container-title' in bib.keys():
50 | try:
51 | journal = bib['short-container-title'][0]
52 | except:
53 | journal = "No journal"
54 | else:
55 | try:
56 | journal = bib['container-title'][0]
57 | except:
58 | journal = "No journal"
59 |
60 | bib_dict = {
61 | "title": bib['title'][0],
62 | "author": authors,
63 | "journal": journal,
64 | "year": pub_date,
65 | "url": bib["URL"],
66 | "pdf_link": bib["link"][0]["URL"],
67 | "cited_count": bib["is-referenced-by-count"]
68 | }
69 |
70 | return bib_dict
71 |
72 |
73 | def get_info_by_doi(self, doi):
74 | """Get the meta information by the given paper DOI number.
75 |
76 | Args:
77 | doi (str): The paper DOI number
78 |
79 | Returns:
80 | A dict containing the paper information.
81 | {
82 | "title": xxx,
83 | "author": xxx,
84 | "journal": xxx,
85 | etc
86 | }
87 | OR
88 | None
89 | """
90 | url = "{}works/{}"
91 | url = url.format(self.base_url, doi)
92 |
93 | try:
94 | r = self.sess.get(url)
95 |
96 | bib = r.json()['message']
97 | return self.extract_json_info(bib)
98 |
99 | except:
100 |             logger.error("Failed to fetch information for DOI: {}.".format(doi))
101 |
102 |
103 | def get_info_by_title(self, title):
104 | """Get the meta information by the given paper title.
105 |
106 | Args:
107 |             title (str): The paper title
108 |
109 | Returns:
110 | A dict containing the paper information.
111 | {
112 | "title": xxx,
113 | "author": xxx,
114 | "journal": xxx,
115 | etc
116 | }
117 | OR
118 | None
119 | OR
120 | A list [{}, {}, {}]
121 | """
122 | url = self.base_url + "works"
123 | params = {"query.bibliographic": title, "rows": 20}
124 | try:
125 | r = self.sess.get(url, params=params)
126 | items = r.json()["message"]["items"]
127 |
128 | for i, item in enumerate(items):
129 |
130 | title_item = item['title'][0]
131 | try:
132 | title_item = title_item.decode("utf-8")
133 | except:
134 | pass
135 |
136 | item["title"][0] = title_item
137 |
138 | if title_item.lower() == title.lower():
139 | return self.extract_json_info(item)
140 |
141 | items[i] = item
142 |
143 | return [self.extract_json_info(it) for it in items]
144 | except:
145 |             logger.error("Failed to fetch information for title: {}.".format(title))
146 |
147 |
148 | if __name__ == "__main__":
149 | # doi = "10.1016/j.wneu.2012.11.074"
150 | # doi = "10.1093/cercor/bhac266"
151 | doi = "10.1038/s41467-022-29269-6"
152 | # title = "Heterogeneous Graph Attention Network"
153 | # title = "Learning to Copy Coherent Knowledge for Response Generation"
154 |
155 | crossref_info = crossrefInfo()
156 | crossref_info.set_proxy(proxy="127.0.1:1123")
157 |
158 | bib_doi = crossref_info.get_info_by_doi(doi)
159 | # bib_title = crossref_info.get_info_by_title(title)
160 |
161 | print(bib_doi)
162 | print("\n")
163 | # print(bib_title)
164 |
--------------------------------------------------------------------------------
/build/lib/easy_literature/dblp_source.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import pandas as pd
3 | import requests
4 |
5 | #options
6 | STRINGS_FOR_TEST = ["Collaborative Writing"]
7 | DBLP_BASE_URL = 'http://dblp.uni-trier.de/'
8 | PUB_SEARCH_URL = DBLP_BASE_URL + "search/publ/"
9 |
10 |
11 | def query_db(pub_string=STRINGS_FOR_TEST):
12 | '''
13 | returns the BeautifulSoup object of a query to DBLP
14 |
15 | :param pub_string: A list of strings of keywords
16 | :return: BeautifulSoup: A BeautifulSoup Object
17 | '''
18 | resp = requests.get(PUB_SEARCH_URL, params={'q':pub_string})
19 |     return BeautifulSoup(resp.content, 'html.parser')
20 |
21 | def get_pub_data(pub):
22 | '''
23 | Extracts the information about a publication from a BeautifulSoup object
24 |
25 | :param pub: A BeautifulSoup Object with Publication Information
26 | :return: dict: All Information of this Publication
27 | '''
28 | ptype = 'nothing'
29 | link = 'nothing'
30 | authors = []
31 | title = 'nothing'
32 | where = 'nothing'
33 |
34 | if 'year' in pub.get('class'):
35 | # year is not always scrapable, except for this case. Might be done more elegantly
36 | return int(pub.contents[0])
37 | else:
38 | ptype = pub.attrs.get('class')[1]
39 | for content_item in pub.contents:
40 | class_of_content_item = content_item.attrs.get('class', [0])
41 | if 'data' in class_of_content_item:
42 | for author in content_item.findAll('span', attrs={"itemprop": "author"}):
43 | authors.append(author.text)
44 | title = content_item.find('span', attrs={"class": "title"}).text
45 | for where_data in content_item.findAll('span', attrs={"itemprop": "isPartOf"}):
46 | found_where = where_data.find('span', attrs={"itemprop": "name"})
47 | if found_where:
48 | where = found_where.text
49 | elif 'publ' in class_of_content_item:
50 | link = content_item.contents[0].find('a').attrs.get('href', "nothing")
51 |
52 | return {'Type': ptype,
53 | 'Link': link,
54 | 'Authors': authors,
55 | 'Title': title,
56 | 'Where': where}
57 |
58 | def search(search_string=STRINGS_FOR_TEST):
59 | '''
60 | returns the information found in a search query to dblp as a pandas dataframe.
61 | Shows the following information:
62 | - Authors
63 | - Link to Publication
64 | - Title
65 | - Type (Article, Proceedings etc.)
66 | - Where it was published
67 | - Year of publication
68 | :param search_string: A List of Strings of Keywords, that should be searched for
69 | :return: pd.DataFrame: A Dataframe with all data
70 | '''
71 | soup = query_db(search_string)
72 | pub_list_raw = soup.find("ul", attrs={"class": "publ-list"})
73 |
74 | pub_list_data = []
75 | curr_year = 0
76 | for child in pub_list_raw.children:
77 | pub_data = get_pub_data(child)
78 | if type(pub_data) == int:
79 | curr_year = pub_data
80 | else:
81 | pub_data['Year'] = curr_year
82 | pub_list_data.append(pub_data)
83 |
84 | return pd.DataFrame(pub_list_data)
85 |
--------------------------------------------------------------------------------
/build/lib/easy_literature/dlbp.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import pandas as pd
3 | import requests
4 |
5 | #options
6 | STRINGS_FOR_TEST = ["Collaborative Writing"]
7 | DBLP_BASE_URL = 'http://dblp.uni-trier.de/'
8 | PUB_SEARCH_URL = DBLP_BASE_URL + "search/publ/"
9 |
10 |
11 | def query_db(pub_string=STRINGS_FOR_TEST):
12 | '''
13 | returns the BeautifulSoup object of a query to DBLP
14 |
15 | :param pub_string: A list of strings of keywords
16 | :return: BeautifulSoup: A BeautifulSoup Object
17 | '''
18 | resp = requests.get(PUB_SEARCH_URL, params={'q':pub_string})
19 |     return BeautifulSoup(resp.content, 'html.parser')
20 |
21 | def get_pub_data(pub):
22 | '''
23 | Extracts the information about a publication from a BeautifulSoup object
24 |
25 | :param pub: A BeautifulSoup Object with Publication Information
26 | :return: dict: All Information of this Publication
27 | '''
28 | ptype = 'nothing'
29 | link = 'nothing'
30 | authors = []
31 | title = 'nothing'
32 | where = 'nothing'
33 |
34 | if 'year' in pub.get('class'):
35 | # year is not always scrapable, except for this case. Might be done more elegantly
36 | return int(pub.contents[0])
37 | else:
38 | ptype = pub.attrs.get('class')[1]
39 | for content_item in pub.contents:
40 | class_of_content_item = content_item.attrs.get('class', [0])
41 | if 'data' in class_of_content_item:
42 | for author in content_item.findAll('span', attrs={"itemprop": "author"}):
43 | authors.append(author.text)
44 | title = content_item.find('span', attrs={"class": "title"}).text
45 | for where_data in content_item.findAll('span', attrs={"itemprop": "isPartOf"}):
46 | found_where = where_data.find('span', attrs={"itemprop": "name"})
47 | if found_where:
48 | where = found_where.text
49 | elif 'publ' in class_of_content_item:
50 | link = content_item.contents[0].find('a').attrs.get('href', "nothing")
51 |
52 | return {'Type': ptype,
53 | 'Link': link,
54 | 'Authors': authors,
55 | 'Title': title,
56 | 'Where': where}
57 |
58 | def search(search_string=STRINGS_FOR_TEST):
59 | '''
60 | returns the information found in a search query to dblp as a pandas dataframe.
61 | Shows the following information:
62 | - Authors
63 | - Link to Publication
64 | - Title
65 | - Type (Article, Proceedings etc.)
66 | - Where it was published
67 | - Year of publication
68 | :param search_string: A List of Strings of Keywords, that should be searched for
69 | :return: pd.DataFrame: A Dataframe with all data
70 | '''
71 | soup = query_db(search_string)
72 | pub_list_raw = soup.find("ul", attrs={"class": "publ-list"})
73 |
74 | pub_list_data = []
75 | curr_year = 0
76 | for child in pub_list_raw.children:
77 | pub_data = get_pub_data(child)
78 | if type(pub_data) == int:
79 | curr_year = pub_data
80 | else:
81 | pub_data['Year'] = curr_year
82 | pub_list_data.append(pub_data)
83 |
84 | return pd.DataFrame(pub_list_data)
85 |
--------------------------------------------------------------------------------
/build/lib/easy_literature/downloads.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | import os
4 | import platform
5 |
6 | from .arxiv import arxivInfo
7 | from .crossref import crossrefInfo
8 | from .medbiorxiv import BMxivInfo
9 | from .GoogleScholar import GscholarInfo
10 | from .DBLP import DBLPInfo
11 | from .pdfs import pdfDownload
12 |
13 | # log config
14 | logging.basicConfig()
15 | logger = logging.getLogger('Downloads')
16 | logger.setLevel(logging.INFO)
17 |
18 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'}
19 |
20 |
21 |
22 | def check_string(re_exp, str):
23 | res = re.match(re_exp, str)
24 | if res:
25 | return True
26 | else:
27 | return False
28 |
29 | def classify(identifier):
30 | """
31 | Classify the type of paper_id:
32 | arxivId - arxivId
33 | doi - digital object identifier
34 | medbiorxivId - medrxiv or biorxiv id
35 | title - title
36 | """
37 |     if check_string(r'10\.(?!1101)[0-9]{4}/\.*', identifier):  # a DOI, excluding the 10.1101 prefix used by bioRxiv/medRxiv
38 |         return 'doi'
39 |     elif check_string(r'10\.1101/\.*', identifier):  # a bioRxiv/medRxiv DOI
40 |         return "medbiorxivId"
41 |     elif check_string(r'[0-9]{2}[0-1][0-9]\.[0-9]{3,}.*', identifier) or check_string(r'.*/[0-9]{2}[0-1][0-9]{4}', identifier):  # a new- or old-style arXiv id
42 |         return 'arxivId'
43 |     elif check_string(r'[a-zA-Z\d\.-/\s]*', identifier):  # otherwise treat the identifier as a title
44 |         return 'title'
45 | else:
46 | return "unrecognized"
47 |
48 | def get_paper_info_from_paperid(paper_id, proxy=None, gproxy_mode='free'):
49 | id_type = classify(paper_id)
50 |
51 | if id_type == "doi":
52 | logger.info('ID type: doi.')
53 | downloader = crossrefInfo()
54 | if proxy:
55 | downloader.set_proxy(proxy=proxy)
56 | bib_dict = downloader.get_info_by_doi(paper_id)
57 |
58 | elif id_type == "arxivId":
59 |         logger.info('ID type: arxiv.')
60 | downloader = arxivInfo()
61 | if proxy:
62 | downloader.set_proxy_handler(proxy=proxy)
63 | bib_dict = downloader.get_info_by_arxivid(paper_id)
64 |
65 | elif id_type == "medbiorxivId":
66 | logger.info('ID type: medbiorxivId.')
67 | downloader = BMxivInfo()
68 | if proxy:
69 | downloader.set_proxy(proxy=proxy)
70 | bib_dict = downloader.get_info_by_bmrxivid(paper_id)
71 |
72 | elif id_type == "title":
73 | logger.info('ID type: title.')
74 | downloader1 = GscholarInfo()
75 | downloader1.set_proxy(proxy_name=gproxy_mode, proxy_address=proxy)
76 | bib_dict = downloader1.get_info_by_title(paper_id)
77 |
78 | downloader2 = DBLPInfo()
79 | downloader2.set_proxy(proxy_address=proxy)
80 | bib_dict1 = downloader2.get_info_by_title(paper_id)
81 |
82 |         logger.info(f'The Google Scholar bib: {bib_dict}; the DBLP bib: {bib_dict1}.')
83 |
84 | if bib_dict is not None and bib_dict1 is not None:
85 | bib_dict['journal'] = bib_dict1['journal']
86 | elif bib_dict is None and bib_dict1 is not None:
87 | bib_dict = bib_dict1
88 | elif bib_dict is None and bib_dict1 is None:
89 |             logger.info('Title not found on DBLP or Google Scholar.')
90 | else:
91 | pass
92 |
93 | try:
94 | return bib_dict
95 | except:
96 | pass
97 |
98 |
99 | def get_paper_pdf_from_paperid(paper_id, path, proxy=None, direct_url=None):
100 | pdf_downloader = pdfDownload()
101 | if proxy:
102 | pdf_downloader.set_proxy(proxy=proxy)
103 |
104 | if direct_url:
105 | content = pdf_downloader.get_pdf_from_direct_url(direct_url)
106 | if not content:
107 | content = pdf_downloader.get_pdf_from_sci_hub(paper_id)
108 | else:
109 | content = pdf_downloader.get_pdf_from_sci_hub(paper_id)
110 | try:
111 | system = platform.system()
112 | if system == 'Windows':
113 | path = path.replace("/", "\\")
114 | pdf_dir = path.rsplit("\\", 1)[0]
115 | else:
116 | pdf_dir = path.rsplit("/", 1)[0]
117 | if not os.path.exists(pdf_dir):
118 | os.makedirs(pdf_dir)
119 | pdf_downloader._save(content['pdf'], path)
120 | except:
121 | pass
122 |
123 |
124 |
125 |
--------------------------------------------------------------------------------
/build/lib/easy_literature/easyliter.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import argparse
3 | import os
4 |
5 | from .utils import patternRecognizer, note_modified, get_pdf_paths, get_pdf_paths_from_notes, get_update_content, get_pdf_paths_from_notes_dict
6 |
7 | logging.basicConfig()
8 | logger = logging.getLogger('easyliter')
9 | logger.setLevel(logging.INFO)
10 |
11 |
12 |
13 |
14 | def set_args():
15 | parser = argparse.ArgumentParser(description='EasyLiterature')
16 | parser.add_argument('-i', '--input', required=True, type=str, default=None,
17 | help="The path to the note file or note file folder.")
18 | parser.add_argument('-o', '--output', type=str, default=None,
19 | help='Folder path to save paper pdfs and images. NOTE: MUST BE FOLDER.')
20 | parser.add_argument('-p', '--proxy', type=str, default=None,
21 |                         help='The proxy address, e.g. 127.0.0.1:1080. If this argument is specified, Google Scholar will automatically use a free proxy (not necessarily the specified proxy address). To use other proxies for Google Scholar, specify the -gp option. If you want to set up the proxies manually, change the behaviour in GoogleScholar.set_proxy(). See more at https://scholarly.readthedocs.io/en/stable/ProxyGenerator.html.')
22 | parser.add_argument('-gp', '--gproxy_mode', type=str, default='free',
23 |                         help="The proxy type used for scholarly, e.g., free, single, Scraper. (Note: 'free' automatically chooses a free proxy address, which costs nothing but may not be fast; 'single' uses the proxy address you specify; 'Scraper' is not free and requires a ScraperAPI key.)")
24 | parser.add_argument('-d', '--delete', action='store_true',
25 |                         help='Delete unreferenced attachments in notes. Use with caution; '
26 | 'when used, -i must be a folder path including all notes.')
27 | parser.add_argument('-m', '--migration', type=str, default=None,
28 | help="The pdf folder path you want to reconnect to.")
29 | args = parser.parse_args()
30 |
31 | return args
32 |
33 | def check_args():
34 | args = set_args()
35 | input_path = args.input
36 | output_path = args.output
37 | delete_bool = args.delete
38 | migration_path = args.migration
39 | proxy = args.proxy
40 | gproxy_mode = args.gproxy_mode
41 |
42 | return input_path, output_path, delete_bool, proxy, migration_path, gproxy_mode
43 |
44 |
45 | def get_bib_and_pdf(note_file, output_path, proxy, paper_recognizer, gproxy_mode):
46 |
47 | pdfs_path = output_path
48 | if not os.path.exists(pdfs_path):
49 | os.makedirs(pdfs_path)
50 |
51 | with open(note_file, 'r') as f:
52 | content = f.read()
53 |
54 | m = paper_recognizer.findall(content)
55 | logger.info("Number of files to download - {}".format(len(m)))
56 |
57 | if not m:
58 |         logger.info("No valid paper entry found in the file {}.".format(note_file))
59 | else:
60 | replace_dict = get_update_content(m, note_file, pdfs_path, proxy=proxy, gproxy_mode=gproxy_mode)
61 |
62 | return replace_dict
63 |
64 |
65 | def file_update(input_path, output_path, proxy, paper_recognizer, gproxy_mode):
66 |
67 | replace_dict = get_bib_and_pdf(input_path, output_path,
68 | proxy, paper_recognizer, gproxy_mode)
69 |
70 | if replace_dict:
71 | note_modified(paper_recognizer, input_path, **replace_dict)
72 |
73 |
74 | def main():
75 | input_path, output_path, delete_bool, proxy, migration_path, gproxy_mode = check_args()
76 |
77 | if output_path:
78 | paper_recognizer = patternRecognizer(r'- \{.{3,}\}')
79 |
80 | if os.path.isfile(input_path):
81 | logger.info("Updating the file {}".format(input_path))
82 | file_update(input_path, output_path, proxy, paper_recognizer, gproxy_mode)
83 |
84 | elif os.path.isdir(input_path):
85 | note_paths = []
86 | for root, _, files in os.walk(input_path):
87 | for file in files:
88 | if file.lower().endswith('md') or file.lower().endswith('markdown'):
89 | note_paths.append(os.path.join(root, file))
90 | for note_path in note_paths:
91 | logger.info("Updating the file {}".format(note_path))
92 | file_update(note_path, output_path, proxy, paper_recognizer, gproxy_mode)
93 | else:
94 | logger.info("input path {} does not exist".format(input_path))
95 |
96 |
97 | # Delete unreferenced attachments
98 | if delete_bool:
99 | if os.path.isfile(input_path):
100 |             logger.info("To delete PDF files unrelated to the notes, the input path must be the main notes folder! Please use this option with caution!")
101 | else:
102 | pdf_path_recognizer = patternRecognizer(r'\[pdf\]\(.{5,}\.pdf\)')
103 | pdf_paths_in_notes = get_pdf_paths_from_notes(input_path, pdf_path_recognizer)
104 | pdf_paths = get_pdf_paths(output_path)
105 |                 # TODO: path separators differ between macOS/Linux and Windows ("/" vs "\\")
106 | pdf_paths_in_notes = [os.path.abspath(i).replace('\\', '/') for i in pdf_paths_in_notes]
107 | pdf_paths = [os.path.abspath(i).replace('\\', '/') for i in pdf_paths]
108 |
109 | removed_pdf_paths = list(set(pdf_paths) - set(pdf_paths_in_notes))
110 | try:
111 | for pdf_p in removed_pdf_paths:
112 | os.remove(pdf_p)
113 | except:
114 | pass
115 |
116 | logger.info("Deleted {} files".format(len(removed_pdf_paths)))
117 |
118 |
119 | if migration_path:
120 | pdf_path_recognizer = patternRecognizer(r'\[pdf\]\(.{5,}\.pdf\)')
121 |
122 | pdf_paths = get_pdf_paths(migration_path)
123 | pdf_paths_in_notes = get_pdf_paths_from_notes_dict(input_path, pdf_path_recognizer)
124 |
125 | # match based on paper title
126 | matched_numb = 0
127 | pdf_paths_dict = {os.path.basename(i): i for i in pdf_paths}
128 | for md_file, pdf_paths_ in pdf_paths_in_notes.items():
129 |
130 | pdf_paths_in_notes_dict = {os.path.basename(i): i for i in pdf_paths_}
131 | matched_pdfs = pdf_paths_dict.keys() & pdf_paths_in_notes_dict.keys()
132 |
133 | matched_numb += len(matched_pdfs)
134 |
135 | replace_paths_dict = {}
136 | for matched in matched_pdfs:
137 | replaced_str = os.path.relpath(pdf_paths_dict[matched], md_file).split('/',1)[-1]
138 | replaced_str = "[pdf]({})".format(replaced_str)
139 | ori_str = "[pdf]({})".format(pdf_paths_in_notes_dict[matched])
140 | replace_paths_dict[ori_str] = replaced_str
141 |
142 | if replace_paths_dict:
143 | note_modified(pdf_path_recognizer, md_file, **replace_paths_dict)
144 |
145 | logger.info("Found - {} - pdf files".format(matched_numb))
146 |
147 |
148 | if not output_path and not migration_path:
149 |         logger.info("Missing the -o or -m argument; use -h to see the help.")
150 |
151 |
152 | if __name__ == "__main__":
153 | main()
--------------------------------------------------------------------------------
/build/lib/easy_literature/medbiorxiv.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import requests
3 | from bs4 import BeautifulSoup
4 |
5 | from .crossref import crossrefInfo
6 |
7 | logging.basicConfig()
8 | logger = logging.getLogger('biorxiv')
9 | logger.setLevel(logging.DEBUG)
10 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'}
11 |
12 | class BMxivInfo(object):
13 | def __init__(self):
14 | self.sess = requests.Session()
15 | self.sess.headers = HEADERS
16 | self.base_url = "https://api.biorxiv.org/details/"
17 | self.servers = ["biorxiv", "medrxiv"]
18 |
19 |
20 | def set_proxy(self, proxy=False):
21 | """set proxy for session
22 |
23 | Args:
24 |             proxy (str): The proxy address, e.g. 127.0.0.1:1123
25 | Returns:
26 | None
27 | """
28 | if proxy:
29 | self.sess.proxies = {
30 | "http": proxy,
31 | "https": proxy, }
32 |
33 |
34 | def extract_json_info(self, item):
35 | """Extract bib json information from requests.get().json()
36 |
37 | Args:
38 | item (json object): obtained by requests.get().json()
39 |
40 | Returns:
41 | A dict containing the paper information.
42 | """
43 | paper_url = f"https://www.biorxiv.org/content/{item['doi']}"
44 | title = item["title"]
45 | journal = item["server"]
46 | published = item["date"].split('-')
47 | if len(published) > 1:
48 | year = published[0]
49 | else:
50 | year = ' '
51 |
52 | authors = item['authors'].split("; ")
53 | if len(authors) > 0:
54 | authors = " and ".join([author for author in authors])
55 | else:
56 | authors = authors
57 |
58 | bib_dict = {
59 | "title": title,
60 | "author": authors,
61 | "journal": journal,
62 | "year": year,
63 | "url": paper_url,
64 | "pdf_link": f"{paper_url}.full.pdf",
65 | "cited_count": None
66 | }
67 |
68 | return bib_dict
69 |
70 |
71 | def get_info_by_bmrxivid(self, bmrxivid):
72 | """Get the meta information by the given paper biorxiv_id or medrxiv_id.
73 |
74 | Args:
75 |             bmrxivid (str): The bioRxiv or medRxiv Id
76 |
77 | Returns:
78 | A dict containing the paper information.
79 | {
80 | "title": xxx,
81 | "author": xxx,
82 | "journal": xxx,
83 | etc
84 | }
85 | OR
86 | None
87 | """
88 | urls = [self.base_url + server + "/" + bmrxivid for server in self.servers]
89 | for url in urls:
90 | try:
91 | r = self.sess.get(url)
92 |
93 | bib = r.json()['collection'][-1]
94 |
95 | if "published" in bib.keys() and bib['published'] != "NA":
96 | doi = bib["published"]
97 | print(doi)
98 | crossref_info = crossrefInfo()
99 | if len(self.sess.proxies) > 0:
100 | crossref_info.set_proxy(self.sess.proxies['http'].split('//')[-1])
101 | return crossref_info.get_info_by_doi(doi)
102 |
103 | return self.extract_json_info(bib)
104 |
105 | except:
106 |             logger.error("Failed to fetch information for ID: {}.".format(bmrxivid))
107 |
108 |
109 | def get_info_by_title(self, title):
110 | """Get the meta information by the given paper title.
111 |
112 | Args:
113 |             title (str): The paper title
114 |
115 | Returns:
116 | A dict containing the paper information.
117 | {
118 | "title": xxx,
119 | "author": xxx,
120 | "journal": xxx,
121 | etc
122 | }
123 | OR
124 | None
125 | OR
126 | A list [{}, {}, {}]
127 | """
128 |         base_url = "https://www.biorxiv.org/search/{}%20jcode%3Amedrxiv%7C%7Cbiorxiv%20numresults%3A25%20sort%3Arelevance-rank%20format_result%3Astandard"
129 | query = title.replace(' ', '%252B')
130 |
131 | url = base_url.format(query)
132 | try:
133 | result = self.sess.get(url)
134 | soup = BeautifulSoup(result.content, "lxml")
135 | soup_items = soup.find_all("div",class_="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix")
136 |
137 | soup_dict = dict()
138 | for sp in soup_items:
139 | key = sp.find("a", class_="highwire-cite-linked-title").span.text
140 | value = sp.find("span", class_="highwire-cite-metadata-doi highwire-cite-metadata").text.split("org/")[-1].split("v")[0].replace(" ", "")
141 | soup_dict[key] = value
142 |
143 | for item_title, item_doi in soup_dict.items():
144 | try:
145 | item_title = item_title.decode("utf-8")
146 | except:
147 | pass
148 |
149 | if item_title.lower() == title.lower():
150 | return self.get_info_by_bmrxivid(item_doi)
151 |
152 | return [self.get_info_by_bmrxivid(it) for it in soup_dict.values()]
153 | except:
154 | logger.error("Title: {} is error.".format(title))
155 |
156 |
157 | if __name__ == "__main__":
158 |
159 | arxivId = "10.1101/2022.07.28.22277637"
160 | # title = "Oxygen restriction induces a viable but non-culturable population in bacteria"
161 | # title = "A molecular atlas of the human postmenopausal fallopian tube and ovary from single-cell RNA and ATAC sequencing"
162 | # title = "Radiographic Assessment of Lung Edema (RALE) Scores are Highly Reproducible and Prognostic of Clinical Outcomes for Inpatients with COVID-19"
163 | # title = "Untargeted metabolomics of COVID-19 patient serum reveals potential prognostic markers of both severity and outcome"
164 |
165 | arxiv_info = BMxivInfo()
166 | arxiv_info.set_proxy(proxy="127.0.1:1123")
167 |
168 | bib_arxiv = arxiv_info.get_info_by_bmrxivid(arxivId)
169 | # bib_title = arxiv_info.get_info_by_title(title)
170 |
171 | print(bib_arxiv)
172 | print("\n")
173 | # print(bib_title)
--------------------------------------------------------------------------------
/build/lib/easy_literature/pdfs.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import requests
3 | from urllib.parse import urlunsplit, urlsplit
4 | from bs4 import BeautifulSoup
5 |
6 | logging.basicConfig()
7 | logger = logging.getLogger('PDFs')
8 | logger.setLevel(logging.DEBUG)
9 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'}
10 |
11 |
12 | class pdfDownload(object):
13 | def __init__(self):
14 | self.sess = requests.Session()
15 | self.sess.headers = HEADERS
16 |
17 | def set_proxy(self, proxy=None):
18 | """set proxy for session
19 |
20 | Args:
21 |             proxy (str): The proxy address, e.g. 127.0.0.1:1123
22 | Returns:
23 | None
24 | """
25 | if proxy:
26 | self.sess.proxies = {
27 | "http": proxy,
28 | "https": proxy, }
29 |
30 |
31 | def _get_available_scihub_urls(self):
32 | '''
33 | Finds available scihub urls via https://lovescihub.wordpress.com/ or
34 | https://sci-hub.now.sh/
35 | '''
36 | urls = []
37 | res = self.sess.get('https://lovescihub.wordpress.com/')
38 | s = BeautifulSoup(res.content, 'html.parser')
39 | for a in s.find('div', class_="entry-content").find_all('a', href=True):
40 | if 'sci-hub.' in a['href']:
41 | urls.append(a['href'])
42 | return urls
43 |
44 |
45 | def fetch(self, url, auth=None):
46 | '''Fetch pdf
47 |
48 | Args:
49 | url (str):
50 |
51 | Returns:
52 | A dict OR None
53 | '''
54 | try:
55 | r = self.sess.get(url, auth=auth)
56 |
57 | if r.headers["Content-Type"] != "application/pdf":
58 | logger.info("Failed to fetch pdf with url: {}".format(url))
59 | else:
60 | return {
61 | 'pdf': r.content,
62 | 'url': url
63 | }
64 | except:
65 | logger.error("Failed to open url: {}".format(url))
66 |
67 |
68 | def get_pdf_from_direct_url(self, url, auth=None):
69 | return self.fetch(url, auth=auth)
70 |
71 |
72 | def get_pdf_from_sci_hub(self, identifier, auth=None):
73 | '''Fetch pdf from sci-hub based on doi or url
74 |
75 | Args:
76 | identifier (str): DOI or url
77 | auth (tuple): ("user", "passwd")
78 |
79 | Returns:
80 | A dict OR None
81 | '''
82 | for base_url in self._get_available_scihub_urls():
83 | r = self.sess.get(base_url + '/' + identifier, auth=auth)
84 | soup = BeautifulSoup(r.content, 'html.parser')
85 |
86 | pdf_div_names = ['iframe', 'embed']
87 | for pdf_div_name in pdf_div_names:
88 | pdf_div = soup.find(pdf_div_name)
89 |                 if pdf_div is not None:
90 | break
91 | try:
92 | url_parts = urlsplit(pdf_div.get('src'))
93 | if url_parts[1]:
94 | if url_parts[0]:
95 | pdf_url = urlunsplit((url_parts[0], url_parts[1], url_parts[2], '', ''))
96 | else:
97 | pdf_url = urlunsplit(('https', url_parts[1], url_parts[2], '', ''))
98 | else:
99 | pdf_url = urlunsplit(('https', urlsplit(base_url)[1], url_parts[2], '', ''))
100 |
101 | return self.fetch(pdf_url, auth)
102 | except:
103 | pass
104 |
105 | logger.info("Failed to fetch pdf with all sci-hub urls")
106 |
107 | def _save(self, content, path):
108 | with open(path, "wb") as f:
109 | f.write(content)
110 |
111 |
112 | if __name__ == "__main__":
113 | doi = "10.1145/3308558.3313562"
114 |
115 | pdf_download = pdfDownload()
116 | pdf_download.set_proxy("127.0.1:1123")
117 |
118 | pdf_dict = pdf_download.get_pdf_from_sci_hub(doi)
119 | if pdf_dict:
120 | print(pdf_dict['url'])
121 |         pdf_download._save(pdf_dict['pdf'], "/home/admin/tmp.pdf")
122 |
123 | # pdf_dict2 = pdf_download.get_pdf_from_direct_url("https://arxiv.org/pdf/2208.05419.pdf")
124 | # if pdf_dict2:
125 | # print(pdf_dict2['url'])
126 | # pdf_download.download(pdf_dict2['pdf'] ,"/home/admin/tmp2.pdf")
127 |
128 |
--------------------------------------------------------------------------------
/build/lib/easy_literature/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import re
4 | from tqdm import tqdm
5 | from .downloads import get_paper_info_from_paperid, get_paper_pdf_from_paperid, classify
6 |
7 |
8 | logging.basicConfig()
9 | logger = logging.getLogger('utils')
10 | logger.setLevel(logging.INFO)
11 |
12 |
13 | class patternRecognizer(object):
14 | def __init__(self, regular_rule):
15 | self.pattern = re.compile(regular_rule)
16 |
17 | def match(self, string):
18 | return self.pattern.match(string)
19 |
20 | def findall(self, string):
21 | return self.pattern.findall(string)
22 |
23 | def multiple_replace(self, content, **replace_dict):
24 | def replace_(value):
25 | match = value.group()
26 | if match in replace_dict.keys():
27 | return replace_dict[match]
28 | else:
29 | return match+" **Not Correct, Check it. Maybe mannual update & download is needed.**"
30 |
31 | replace_content = self.pattern.sub(replace_, content)
32 |
33 | return replace_content
34 |
35 |
36 | def note_modified(pattern_recog, md_file, **replace_dict):
37 | with open(md_file, 'r') as f:
38 | content = f.read()
39 |
40 | replaced_content = pattern_recog.multiple_replace(content, **replace_dict)
41 |
42 | with open(md_file, 'w') as f:
43 | f.write(''.join(replaced_content))
44 |
45 |
46 | def get_pdf_paths(pdf_root):
47 | pdf_paths = []
48 | for root, _, files in os.walk(pdf_root):
49 | for file in files:
50 | if file.lower().endswith('.pdf'):
51 | pdf_paths.append(os.path.join(root, file))
52 |
53 | return pdf_paths
54 |
55 |
56 | def get_pdf_paths_from_notes(md_root, reg):
57 |
58 | md_files = []
59 | for root, _, files in os.walk(md_root):
60 | for file in files:
61 | if file.lower().endswith('md') or file.lower().endswith('markdown'):
62 | md_files.append(os.path.join(root, file))
63 |
64 | pdf_paths_from_notes = []
65 | for md_file in md_files:
66 | with open(md_file, 'r') as f:
67 | content = f.read()
68 | m = reg.findall(content)
69 | m = [i.split("(")[-1].split(')')[0] for i in m]
70 | pdf_paths_from_notes.extend(m)
71 |
72 | return pdf_paths_from_notes
73 |
74 |
75 | def get_pdf_paths_from_notes_dict(md_root, reg):
76 | pdf_paths_from_notes_dict = {}
77 | if os.path.isdir(md_root):
78 | md_files = []
79 | for root, _, files in os.walk(md_root):
80 | for file in files:
81 | if file.lower().endswith('md') or file.lower().endswith('markdown'):
82 | md_files.append(os.path.join(root, file))
83 |
84 | for md_file in md_files:
85 | with open(md_file, 'r') as f:
86 | content = f.read()
87 | m = reg.findall(content)
88 | m = [i.split("(")[-1].split(')')[0] for i in m]
89 | pdf_paths_from_notes_dict[md_file] = m
90 | else:
91 | with open(md_root, 'r') as f:
92 | content = f.read()
93 | m = reg.findall(content)
94 | m = [i.split("(")[-1].split(')')[0] for i in m]
95 | pdf_paths_from_notes_dict[md_root] = m
96 |
97 | return pdf_paths_from_notes_dict
98 |
99 |
100 | def classify_identifier(identifier):
101 | """Not need to download PDF file
102 | """
103 | if identifier.endswith("}}"):
104 | return True
105 | else:
106 | return False
107 |
108 |
109 | def get_update_content(m, note_file, pdfs_path, proxy, gproxy_mode):
110 |
111 | replace_dict = dict()
112 | for literature in tqdm(m):
113 | pdf = classify_identifier(literature)
114 |
115 | literature_id = literature.split('{')[-1].split('}')[0]
116 | bib = get_paper_info_from_paperid(literature_id, proxy=proxy, gproxy_mode=gproxy_mode)
117 |
118 | if bib:
119 | try:
120 | pdf_name = bib['title']
121 | # remove blank symbol, like \n, \t, \r
122 | pdf_name = re.sub(r'[\n\t\r]', '', pdf_name)
123 | # remove multiple blank spaces
124 | pdf_name = re.sub(r' +', ' ', pdf_name)
125 | pdf_name = re.sub(r'[.]', '', pdf_name)
126 |
127 | pdf_name = '_'.join(pdf_name.split(' ')) + '.pdf'
128 |
129 | # remove the special characters in the pdf name: / \ : * ? " < > |
130 | pdf_name = re.sub(r'[\\/:*?"<>|]', '', pdf_name)
131 | pdf_path = os.path.join(pdfs_path, pdf_name)
132 |
133 | logger.info(f"The pdf path to be saved: {pdf_path}")
134 | if pdf:
135 | id_type = classify(literature_id)
136 | if id_type == "title":
137 | for pattern_str in [r'10\.(?!1101)[0-9]{4}/', r'10\.1101/', r'[0-9]{2}[0-1][0-9]\.[0-9]{3,}', r'.*/[0-9]{2}[0-1][0-9]{4}']:
138 | res = re.search(pattern_str, bib['url']) # search for the arxiv id in the url
139 | if res:
140 | literature_id = res.group(0)
141 | if bib['pdf_link'] is None:
142 | bib['pdf_link'] = f'https://arxiv.org/pdf/{literature_id}.pdf'
143 | logger.info(f"The paper's arxiv url: {bib['url']}; The converted arxiv id: {literature_id}; The pdf link: {bib['pdf_link']}.")
144 | if not os.path.exists(pdf_path):
145 | logger.info(f"PDF link: {bib['pdf_link']}")
146 | get_paper_pdf_from_paperid(literature_id, pdf_path, direct_url=bib['pdf_link'], proxy=proxy)
147 | if not os.path.exists(pdf_path):
148 | get_paper_pdf_from_paperid(literature_id, pdf_path, proxy=proxy)
149 | else:
150 | if not os.path.exists(pdf_path):
151 | logger.info(f"PDF link: {bib['pdf_link']}")
152 | get_paper_pdf_from_paperid(literature_id, pdf_path, direct_url=bib['pdf_link'], proxy=proxy)
153 | if not os.path.exists(pdf_path):
154 | get_paper_pdf_from_paperid(literature_id, pdf_path, proxy=proxy)
155 | if os.path.exists(pdf_path):
156 | replaced_literature = "- **{}**. {} et.al. **{}**, **{}**, **Number of Citations: **{}, ([pdf]({}))([link]({})).".format(
157 | bib['title'], bib["author"].split(" and ")[0], bib['journal'],
158 | bib['year'], bib['cited_count'], os.path.relpath(pdf_path, note_file).split('/',1)[-1],
159 | bib['url'])
160 | else:
161 | logger.info("Can not find a downloading source for literature id {}. You may need to manually download this paper, a template has been generated in the markdown file. Put the pdf file in the folder you specified just now and add its name in the '(pdf)' of your markdown entry.".format(literature_id))
162 | replaced_literature = "- **{}**. {} et.al. **{}**, **{}**, **Number of Citations: **{}, ([pdf]({}))([link]({})).".format(
163 | bib['title'], bib["author"].split(" and ")[0], bib['journal'],
164 | bib['year'], bib['cited_count'], f'{pdfs_path}/your_pdf_name.pdf', bib['url']
165 | )
166 | replace_dict[literature] = replaced_literature
167 | except:
168 |
169 | logger.info("Can not find a downloading source for literature id {}. You may need to manually download this paper, a template has been generated in the markdown file. Put the pdf file in the folder you specified just now and add its name in the '(pdf)' of your markdown entry.".format(literature_id))
170 | replaced_literature = "- **{}**. {} et.al. **{}**, **{}**, **Number of Citations: **{}, ([pdf]({}))([link]({})).".format(
171 | bib['title'], bib["author"].split(" and ")[0], bib['journal'],
172 | bib['year'], bib['cited_count'], f'{pdfs_path}/your_pdf_name.pdf', bib['url']
173 | )
174 | replace_dict[literature] = replaced_literature
175 | else:
176 | logger.info("Can not find the literature {}. You may need to manually download this paper, a template has been generated in the markdown file. Put the pdf file in the folder you specified just now and add its name in the '(pdf)' of your markdown entry.".format(literature_id))
177 | replaced_literature = "- **{}**. ([pdf]({})).".format(
178 | literature_id, f'{pdfs_path}/your_pdf_name.pdf'
179 | )
180 | replace_dict[literature] = replaced_literature
181 | return replace_dict
--------------------------------------------------------------------------------
/easy_literature/DBLP.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from urllib.request import ProxyHandler
3 | from . import dblp_source as dblp
4 | import pandas as pd
5 |
6 |
7 | logging.basicConfig()
8 | logger = logging.getLogger('DBLP')
9 | logger.setLevel(logging.DEBUG)
10 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'}
11 |
12 | class DBLPInfo(object):
13 |
14 | def set_proxy(self, proxy_address = None):
15 | """set proxy handler
16 |
17 |         Args:
18 |             proxy_address (str): The proxy address, e.g. 127.0.0.1:1123
19 | 
20 |         Returns:
21 |             None. This method is currently a placeholder.
22 | """
23 | pass
24 |
25 |
26 | def extract_json_info(self, item):
27 |         """Search DBLP for the given title and extract the bib information.
28 | 
29 |         Args:
30 |             item (str): The paper title to search for.
31 |
32 | Returns:
33 | A dict containing the paper information.
34 | """
35 | trial_num = 0
36 | while trial_num<10:
37 | trial_num+=1
38 | try:
39 | results = dblp.search([item])
40 | break
41 | except:
42 | if trial_num == 10:
43 | results = pd.DataFrame({'A' : []})
44 | else:
45 | pass
46 |
47 |
48 |
49 |         if not results.empty:
50 |             journal = str(results['Where'][0])  # default to the first venue
51 |             if 'CoRR' in [str(venue) for venue in results['Where']]:
52 |                 journal = 'CoRR'
53 |                 for venue in results['Where']:
54 |                     if str(venue) != 'CoRR':
55 |                         journal = str(venue)
56 |                         break
57 | 
58 | bib_dict = {
59 | "title": str(results['Title'][0]),
60 | "author": ' and '.join([str(Entry) for Entry in results['Authors'][0]]),
61 | "journal": journal,
62 | "year": str(results['Year'][0]),
63 | "url": str(results['Link'][0]),
64 | "pdf_link": None,
65 | "cited_count": None
66 | }
67 | else:
68 | bib_dict = None
69 | return bib_dict
70 |
71 |
72 | def get_info_by_title(self, title):
73 | """Get the meta information by the given paper title.
74 |
75 | Args:
76 |             title (str): The paper title
77 |
78 | Returns:
79 | A dict containing the paper information.
80 | {
81 | "title": xxx,
82 | "author": xxx,
83 | "journal": xxx,
84 | etc
85 | }
86 | OR
87 | None
88 | OR
89 | A list [{}, {}, {}]
90 | """
91 | return self.extract_json_info(title)
92 |
93 |
94 | if __name__ == "__main__":
95 | # arxivId = "2208.05623"
96 | # title = "Heterogeneous Graph Attention Network"
97 |
98 | # gscholar_info = GscholarInfo()
99 | # gscholar_info.set_proxy(proxy_name='single')
100 |
101 | # bib_arxiv = gscholar_info.get_info_by_title(title)
102 | # # bib_title = arxiv_info.get_info_by_title(title)
103 |
104 | # print(bib_arxiv)
105 | # print("\n")
106 | # # print(bib_title)
107 | results = dblp.search(["Finetunedlanguage models are zero-shot learners"])
108 |
109 | print(results)
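For reference, a minimal usage sketch (not part of the repository) of the `DBLPInfo` class above, mirroring how it is called from `downloads.py`:

```python
from easy_literature.DBLP import DBLPInfo

dblp_info = DBLPInfo()
dblp_info.set_proxy(proxy_address=None)  # currently a no-op placeholder

# Returns a dict with keys title/author/journal/year/url/pdf_link/cited_count, or None.
bib = dblp_info.get_info_by_title("Heterogeneous Graph Attention Network")
if bib:
    print(bib["title"], bib["journal"], bib["year"])
```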
--------------------------------------------------------------------------------
/easy_literature/GoogleScholar.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from scholarly import scholarly, ProxyGenerator
3 |
4 |
5 | logging.basicConfig()
6 | logger = logging.getLogger('GoogleScholar')
7 | logger.setLevel(logging.DEBUG)
8 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'}
9 |
10 | class GscholarInfo(object):
11 |
12 | def set_proxy(self, proxy_name = "free", proxy_address = None):
13 | """set proxy handler
14 |
15 |         Args:
16 |             proxy_name (str): The proxy mode: "free", "single", or "Scraper".
17 |             proxy_address (str): The proxy address, e.g. 127.0.0.1:1123
18 |         Returns:
19 |             None. Configures scholarly to use the selected proxy.
20 | """
21 | # TODO find a better proxy strategy
22 | if proxy_address:
23 |             success = False
24 |             pg = ProxyGenerator()
25 |             if proxy_name == "free":
26 |                 success = pg.FreeProxies()
27 |             elif proxy_name == "single":
28 |                 success = pg.SingleProxy(http=proxy_address, https=proxy_address)
29 |             elif proxy_name == "Scraper":
30 |                 success = pg.ScraperAPI('a44bd5be9f56b1be9d6e40116ea4b440')
31 |             logger.info(f'Scholarly using {proxy_name} proxy.')
32 |             logger.info(f'Proxy setup success: {success}.')
33 | scholarly.use_proxy(pg)
34 |
35 |
36 | def extract_json_info(self, item):
37 |         """Search Google Scholar for the given title and extract the bib information.
38 | 
39 |         Args:
40 |             item (str): The paper title to search for.
41 |
42 | Returns:
43 | A dict containing the paper information.
44 | """
45 | bib_dict = None
46 | trial_num = 0
47 |
48 | while trial_num<9:
49 | try:
50 | trial_num+=1
51 | pubs_iter = scholarly.search_pubs(item)
52 | dictinfo = next(pubs_iter)
53 | # logger.info(dictinfo)
54 | bib_dict = {
55 | "title": dictinfo['bib']['title'].replace('\n', ''),
56 | "author": ' and '.join(dictinfo['bib']['author']),
57 | "journal": dictinfo['bib']['venue'],
58 | "year": dictinfo['bib']['pub_year'],
59 | "url": dictinfo['pub_url'],
60 | "pdf_link": dictinfo['eprint_url'],
61 | "cited_count": dictinfo['num_citations']
62 | }
63 | break
64 | except:
65 | pass
66 |
67 | return bib_dict
68 |
69 |
70 |
71 | def get_info_by_title(self, title):
72 | """Get the meta information by the given paper title.
73 |
74 | Args:
75 |             title (str): The paper title
76 |
77 | Returns:
78 | A dict containing the paper information.
79 | {
80 | "title": xxx,
81 | "author": xxx,
82 | "journal": xxx,
83 | etc
84 | }
85 | OR
86 | None
87 | OR
88 | A list [{}, {}, {}]
89 | """
90 | return self.extract_json_info(title)
91 |
92 |
93 | if __name__ == "__main__":
94 | arxivId = "2208.05623"
95 | title = "Heterogeneous Graph Attention Network"
96 |
97 | gscholar_info = GscholarInfo()
98 | gscholar_info.set_proxy(proxy_name='free')
99 |
100 | bib_arxiv = gscholar_info.get_info_by_title(title)
101 | # bib_title = arxiv_info.get_info_by_title(title)
102 |
103 | print(bib_arxiv)
104 | print("\n")
105 | # print(bib_title)
--------------------------------------------------------------------------------
/easy_literature/Scholarly.py:
--------------------------------------------------------------------------------
1 | import json
2 | from scholarly import scholarly
3 | from scholarly import ProxyGenerator
4 |
5 | # Set up a ProxyGenerator object to use free proxies
6 | # This needs to be done only once per session
7 | pg = ProxyGenerator()
8 |
9 | success = pg.FreeProxies()
10 | # print(f'Proxy setup success: {success}.')
11 | scholarly.use_proxy(pg)
12 |
13 | # will paginate to the next page by default
14 | pubs_iter = scholarly.search_pubs("1810.04805")
15 |
16 |
17 | print(json.dumps(next(pubs_iter), indent=2))
18 |
--------------------------------------------------------------------------------
/easy_literature/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinjieNi/EasyLiterature/84ee3e8731430756c3b464d5906c8a1c4378e862/easy_literature/__init__.py
--------------------------------------------------------------------------------
/easy_literature/arxiv.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from urllib.request import ProxyHandler
3 | import feedparser
4 | try:
5 | from urllib import quote
6 | except ImportError:
7 | from urllib.parse import quote
8 | from unidecode import unidecode
9 |
10 | from .crossref import crossrefInfo
11 |
12 |
13 | logging.basicConfig()
14 | logger = logging.getLogger('arxiv')
15 | logger.setLevel(logging.DEBUG)
16 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'}
17 |
18 | class arxivInfo(object):
19 | def __init__(self):
20 | self.base_url = "http://export.arxiv.org/api/query"
21 |
22 | def set_proxy_handler(self, proxy):
23 | """set proxy handler
24 |
25 |         Args:
26 |             proxy (str): The proxy address, e.g. 127.0.0.1:1123
27 |
28 | Returns:
29 | A proxy handler object.
30 | """
31 | proxy_handler = ProxyHandler({"http": f"http://{proxy}",
32 | "https": f"https://{proxy}"})
33 | return proxy_handler
34 |
35 |
36 | def extract_json_info(self, item):
37 | """Extract bib json information from requests.get().json()
38 |
39 | Args:
40 | item (json object): obtained by requests.get().json()
41 |
42 | Returns:
43 | A dict containing the paper information.
44 | """
45 | paper_url = item.link
46 | title = item.title
47 | journal = "arxiv"
48 | published = item.published.split("-")
49 | if len(published) > 1:
50 | year = published[0]
51 | else:
52 | year = ' '
53 |
54 | authors = item.authors
55 | if len(authors) > 0:
56 | first_author = authors[0]["name"].split(" ")
57 | authors = " and ".join([author["name"] for author in authors])
58 | else:
59 | first_author = authors
60 | authors = authors
61 |
62 | bib_dict = {
63 | "title": title,
64 | "author": authors,
65 | "journal": journal,
66 | "year": year,
67 | "url": paper_url,
68 | "pdf_link": item.link.replace("abs", "pdf")+".pdf",
69 | "cited_count": None
70 | }
71 |
72 | return bib_dict
73 |
74 |
75 | def get_info_by_arxivid(self, arxivId, handler=False):
76 | """Get the meta information by the given paper arxiv_id.
77 |
78 | Args:
79 |             arxivId (str): The arxiv id
80 | handler (handler object): use proxy
81 |
82 | Returns:
83 | A dict containing the paper information.
84 | {
85 | "title": xxx,
86 | "author": xxx,
87 | "journal": xxx,
88 | etc
89 | }
90 | OR
91 | None
92 | """
93 |
94 | params = "?search_query=id:"+quote(unidecode(arxivId))
95 |
96 | try:
97 | if handler:
98 | result = feedparser.parse(self.base_url + params, handlers=[handler])
99 | else:
100 | result = feedparser.parse(self.base_url + params)
101 | items = result.entries
102 |
103 | item = items[0]
104 | if "arxiv_doi" in item:
105 | doi = item["arxiv_doi"]
106 |
107 | crossref_info = crossrefInfo()
108 | if handler:
109 | crossref_info.set_proxy(proxy=handler.proxies["http"].split('//')[-1])
110 | return crossref_info.get_info_by_doi(doi)
111 | else:
112 | return self.extract_json_info(item)
113 | except:
114 |             logger.error("Failed to retrieve information for arxiv id: {}.".format(arxivId))
115 |
116 |
117 | def get_info_by_title(self, title, field='ti'):
118 | """Get the meta information by the given paper title.
119 |
120 | Args:
121 |             title (str): The paper title
122 |
123 | Returns:
124 | A dict containing the paper information.
125 | {
126 | "title": xxx,
127 | "author": xxx,
128 | "journal": xxx,
129 | etc
130 | }
131 | OR
132 | None
133 | OR
134 | A list [{}, {}, {}]
135 | """
136 | params = "?search_query="+field+":"+quote(unidecode(title))
137 | url = self.base_url + params
138 | try:
139 | result = feedparser.parse(url)
140 | items = result.entries
141 | print(len(items))
142 |
143 | for i, item in enumerate(items):
144 |
145 | title_item = item.title
146 | try:
147 | title_item = title_item.decode("utf-8")
148 | except:
149 | pass
150 |
151 | item.title = title_item
152 |
153 | if title_item.lower() == title.lower():
154 | return self.extract_json_info(item)
155 |
156 | items[i] = item
157 |
158 | return [self.extract_json_info(it) for it in items]
159 | except:
160 |             logger.error("Failed to retrieve information for title: {}.".format(title))
161 |
162 |
163 | if __name__ == "__main__":
164 | arxivId = "2208.05623"
165 | title = "Heterogeneous Graph Attention Network"
166 |
167 | arxiv_info = arxivInfo()
168 |     handler = arxiv_info.set_proxy_handler(proxy="127.0.1:1123")
169 | 
170 |     bib_arxiv = arxiv_info.get_info_by_arxivid(arxivId, handler=handler)
171 |
172 | print(bib_arxiv)
173 | print("\n")
--------------------------------------------------------------------------------
/easy_literature/crossref.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import requests
3 | #
4 | # 1. get info by doi
5 | # 2. get info by title
6 |
7 | logging.basicConfig()
8 | logger = logging.getLogger('crossref')
9 | logger.setLevel(logging.DEBUG)
10 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'}
11 |
12 | class crossrefInfo(object):
13 | def __init__(self):
14 | self.sess = requests.Session()
15 | self.sess.headers = HEADERS
16 | self.base_url = "http://api.crossref.org/"
17 |
18 | def set_proxy(self, proxy=None):
19 | """set proxy for session
20 |
21 | Args:
22 |             proxy (str): The proxy address, e.g. 127.0.0.1:1123
23 | Returns:
24 | None
25 | """
26 | if proxy:
27 | self.sess.proxies = {
28 | "http": proxy,
29 | "https": proxy, }
30 |
31 |
32 | def extract_json_info(self, bib):
33 | """Extract bib json information from requests.get().json()
34 |
35 | Args:
36 | bib (json object): obtained by requests.get().json()
37 |
38 | Returns:
39 | A dict containing the paper information.
40 | """
41 | pub_date = [str(i) for i in bib['published']["date-parts"][0]]
42 | pub_date = '-'.join(pub_date)
43 |
44 | if 'author' in bib.keys():
45 |             authors = ' and '.join([i["family"] + " " + i['given'] for i in bib['author'] if "family" in i and "given" in i])
46 | else:
47 | authors = "No author"
48 |
49 | if 'short-container-title' in bib.keys():
50 | try:
51 | journal = bib['short-container-title'][0]
52 | except:
53 | journal = "No journal"
54 | else:
55 | try:
56 | journal = bib['container-title'][0]
57 | except:
58 | journal = "No journal"
59 |
60 | bib_dict = {
61 | "title": bib['title'][0],
62 | "author": authors,
63 | "journal": journal,
64 | "year": pub_date,
65 | "url": bib["URL"],
66 | "pdf_link": bib["link"][0]["URL"],
67 | "cited_count": bib["is-referenced-by-count"]
68 | }
69 |
70 | return bib_dict
71 |
72 |
73 | def get_info_by_doi(self, doi):
74 | """Get the meta information by the given paper DOI number.
75 |
76 | Args:
77 | doi (str): The paper DOI number
78 |
79 | Returns:
80 | A dict containing the paper information.
81 | {
82 | "title": xxx,
83 | "author": xxx,
84 | "journal": xxx,
85 | etc
86 | }
87 | OR
88 | None
89 | """
90 | url = "{}works/{}"
91 | url = url.format(self.base_url, doi)
92 |
93 | try:
94 | r = self.sess.get(url)
95 |
96 | bib = r.json()['message']
97 | return self.extract_json_info(bib)
98 |
99 | except:
100 |             logger.error("Failed to retrieve information for DOI: {}.".format(doi))
101 |
102 |
103 | def get_info_by_title(self, title):
104 | """Get the meta information by the given paper title.
105 |
106 | Args:
107 |             title (str): The paper title
108 |
109 | Returns:
110 | A dict containing the paper information.
111 | {
112 | "title": xxx,
113 | "author": xxx,
114 | "journal": xxx,
115 | etc
116 | }
117 | OR
118 | None
119 | OR
120 | A list [{}, {}, {}]
121 | """
122 | url = self.base_url + "works"
123 | params = {"query.bibliographic": title, "rows": 20}
124 | try:
125 | r = self.sess.get(url, params=params)
126 | items = r.json()["message"]["items"]
127 |
128 | for i, item in enumerate(items):
129 |
130 | title_item = item['title'][0]
131 | try:
132 | title_item = title_item.decode("utf-8")
133 | except:
134 | pass
135 |
136 | item["title"][0] = title_item
137 |
138 | if title_item.lower() == title.lower():
139 | return self.extract_json_info(item)
140 |
141 | items[i] = item
142 |
143 | return [self.extract_json_info(it) for it in items]
144 | except:
145 |             logger.error("Failed to retrieve information for title: {}.".format(title))
146 |
147 |
148 | if __name__ == "__main__":
149 | # doi = "10.1016/j.wneu.2012.11.074"
150 | # doi = "10.1093/cercor/bhac266"
151 | doi = "10.1038/s41467-022-29269-6"
152 | # title = "Heterogeneous Graph Attention Network"
153 | # title = "Learning to Copy Coherent Knowledge for Response Generation"
154 |
155 | crossref_info = crossrefInfo()
156 | crossref_info.set_proxy(proxy="127.0.1:1123")
157 |
158 | bib_doi = crossref_info.get_info_by_doi(doi)
159 | # bib_title = crossref_info.get_info_by_title(title)
160 |
161 | print(bib_doi)
162 | print("\n")
163 | # print(bib_title)
164 |
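A small sketch (not part of the repository) of the title-based lookup above, which the `__main__` block leaves commented out:

```python
from easy_literature.crossref import crossrefInfo

crossref_info = crossrefInfo()
# Returns a single bib dict on an exact (case-insensitive) title match,
# otherwise a list of candidate bib dicts, or None on error.
result = crossref_info.get_info_by_title("Heterogeneous Graph Attention Network")
print(result)
```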
--------------------------------------------------------------------------------
/easy_literature/dblp_source.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import pandas as pd
3 | import requests
4 |
5 | #options
6 | STRINGS_FOR_TEST = ["Collaborative Writing"]
7 | DBLP_BASE_URL = 'http://dblp.uni-trier.de/'
8 | PUB_SEARCH_URL = DBLP_BASE_URL + "search/publ/"
9 |
10 |
11 | def query_db(pub_string=STRINGS_FOR_TEST):
12 | '''
13 | returns the BeautifulSoup object of a query to DBLP
14 |
15 | :param pub_string: A list of strings of keywords
16 | :return: BeautifulSoup: A BeautifulSoup Object
17 | '''
18 | resp = requests.get(PUB_SEARCH_URL, params={'q':pub_string})
19 |     return BeautifulSoup(resp.content, 'html.parser')
20 |
21 | def get_pub_data(pub):
22 | '''
23 | Extracts the information about a publication from a BeautifulSoup object
24 |
25 | :param pub: A BeautifulSoup Object with Publication Information
26 | :return: dict: All Information of this Publication
27 | '''
28 | ptype = 'nothing'
29 | link = 'nothing'
30 | authors = []
31 | title = 'nothing'
32 | where = 'nothing'
33 |
34 | if 'year' in pub.get('class'):
35 | # year is not always scrapable, except for this case. Might be done more elegantly
36 | return int(pub.contents[0])
37 | else:
38 | ptype = pub.attrs.get('class')[1]
39 | for content_item in pub.contents:
40 | class_of_content_item = content_item.attrs.get('class', [0])
41 | if 'data' in class_of_content_item:
42 | for author in content_item.findAll('span', attrs={"itemprop": "author"}):
43 | authors.append(author.text)
44 | title = content_item.find('span', attrs={"class": "title"}).text
45 | for where_data in content_item.findAll('span', attrs={"itemprop": "isPartOf"}):
46 | found_where = where_data.find('span', attrs={"itemprop": "name"})
47 | if found_where:
48 | where = found_where.text
49 | elif 'publ' in class_of_content_item:
50 | link = content_item.contents[0].find('a').attrs.get('href', "nothing")
51 |
52 | return {'Type': ptype,
53 | 'Link': link,
54 | 'Authors': authors,
55 | 'Title': title,
56 | 'Where': where}
57 |
58 | def search(search_string=STRINGS_FOR_TEST):
59 | '''
60 | returns the information found in a search query to dblp as a pandas dataframe.
61 | Shows the following information:
62 | - Authors
63 | - Link to Publication
64 | - Title
65 | - Type (Article, Proceedings etc.)
66 | - Where it was published
67 | - Year of publication
68 | :param search_string: A List of Strings of Keywords, that should be searched for
69 | :return: pd.DataFrame: A Dataframe with all data
70 | '''
71 | soup = query_db(search_string)
72 | pub_list_raw = soup.find("ul", attrs={"class": "publ-list"})
73 |
74 | pub_list_data = []
75 | curr_year = 0
76 | for child in pub_list_raw.children:
77 | pub_data = get_pub_data(child)
78 |         if isinstance(pub_data, int):
79 | curr_year = pub_data
80 | else:
81 | pub_data['Year'] = curr_year
82 | pub_list_data.append(pub_data)
83 |
84 | return pd.DataFrame(pub_list_data)
85 |
--------------------------------------------------------------------------------
/easy_literature/downloads.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | import os
4 | import platform
5 |
6 | from .arxiv import arxivInfo
7 | from .crossref import crossrefInfo
8 | from .medbiorxiv import BMxivInfo
9 | from .GoogleScholar import GscholarInfo
10 | from .DBLP import DBLPInfo
11 | from .pdfs import pdfDownload
12 |
13 | # log config
14 | logging.basicConfig()
15 | logger = logging.getLogger('Downloads')
16 | logger.setLevel(logging.INFO)
17 |
18 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'}
19 |
20 |
21 |
22 | def check_string(re_exp, s):
23 |     res = re.match(re_exp, s)
24 | if res:
25 | return True
26 | else:
27 | return False
28 |
29 | def classify(identifier):
30 | """
31 | Classify the type of paper_id:
32 | arxivId - arxivId
33 | doi - digital object identifier
34 | medbiorxivId - medrxiv or biorxiv id
35 | title - title
36 | """
37 | if check_string(r'10\.(?!1101)[0-9]{4}/\.*', identifier):
38 | return 'doi'
39 | elif check_string(r'10\.1101/\.*', identifier):
40 | return "medbiorxivId"
41 | elif check_string(r'[0-9]{2}[0-1][0-9]\.[0-9]{3,}.*', identifier) or check_string(r'.*/[0-9]{2}[0-1][0-9]{4}', identifier):
42 | return 'arxivId'
43 | elif check_string(r'[a-zA-Z\d\.-/\s]*', identifier):
44 | return 'title'
45 | else:
46 | return "unrecognized"
47 |
48 | def get_paper_info_from_paperid(paper_id, proxy=None, gproxy_mode='free'):
49 | id_type = classify(paper_id)
50 |
51 | if id_type == "doi":
52 | logger.info('ID type: doi.')
53 | downloader = crossrefInfo()
54 | if proxy:
55 | downloader.set_proxy(proxy=proxy)
56 | bib_dict = downloader.get_info_by_doi(paper_id)
57 |
58 | elif id_type == "arxivId":
59 |         logger.info('ID type: arxiv.')
60 | downloader = arxivInfo()
61 | if proxy:
62 | downloader.set_proxy_handler(proxy=proxy)
63 | bib_dict = downloader.get_info_by_arxivid(paper_id)
64 |
65 | elif id_type == "medbiorxivId":
66 | logger.info('ID type: medbiorxivId.')
67 | downloader = BMxivInfo()
68 | if proxy:
69 | downloader.set_proxy(proxy=proxy)
70 | bib_dict = downloader.get_info_by_bmrxivid(paper_id)
71 |
72 | elif id_type == "title":
73 | logger.info('ID type: title.')
74 | downloader1 = GscholarInfo()
75 | downloader1.set_proxy(proxy_name=gproxy_mode, proxy_address=proxy)
76 | bib_dict = downloader1.get_info_by_title(paper_id)
77 |
78 | downloader2 = DBLPInfo()
79 | downloader2.set_proxy(proxy_address=proxy)
80 | bib_dict1 = downloader2.get_info_by_title(paper_id)
81 |
82 |         logger.info(f'The Google Scholar bib: {bib_dict}; the DBLP bib: {bib_dict1}.')
83 |
84 | if bib_dict is not None and bib_dict1 is not None:
85 | bib_dict['journal'] = bib_dict1['journal']
86 | elif bib_dict is None and bib_dict1 is not None:
87 | bib_dict = bib_dict1
88 | elif bib_dict is None and bib_dict1 is None:
89 |             logger.info('Title not found on DBLP or Google Scholar.')
90 | else:
91 | pass
92 |
93 |     else:
94 |         logger.info(f'Unrecognized paper id: {paper_id}.')
95 |         bib_dict = None
96 |     return bib_dict
97 |
98 |
99 | def get_paper_pdf_from_paperid(paper_id, path, proxy=None, direct_url=None):
100 | pdf_downloader = pdfDownload()
101 | if proxy:
102 | pdf_downloader.set_proxy(proxy=proxy)
103 |
104 | if direct_url:
105 | content = pdf_downloader.get_pdf_from_direct_url(direct_url)
106 | if not content:
107 | content = pdf_downloader.get_pdf_from_sci_hub(paper_id)
108 | else:
109 | content = pdf_downloader.get_pdf_from_sci_hub(paper_id)
110 |     try:
111 |         if platform.system() == 'Windows':
112 |             path = path.replace("/", "\\")
113 |             pdf_dir = path.rsplit("\\", 1)[0]
114 |         else:
115 |             pdf_dir = path.rsplit("/", 1)[0]
116 |         if not os.path.exists(pdf_dir):
117 |             os.makedirs(pdf_dir)
118 |         if content:
119 |             pdf_downloader._save(content['pdf'], path)
120 |     except:
121 |         pass
122 |
123 |
124 |
125 |
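For reference, a minimal sketch (not part of the repository) of the two helpers above; the identifiers are the ones exercised by the `__main__` blocks of the other modules:

```python
from easy_literature.downloads import classify, get_paper_info_from_paperid

print(classify("10.1145/3308558.3313562"))                # 'doi'
print(classify("2208.05623"))                             # 'arxivId'
print(classify("10.1101/2022.07.28.22277637"))            # 'medbiorxivId'
print(classify("Heterogeneous Graph Attention Network"))  # 'title'

# Fetches the metadata from the matching source (Crossref here) and returns a bib dict or None.
bib = get_paper_info_from_paperid("10.1038/s41467-022-29269-6")
if bib:
    print(bib["title"], bib["journal"], bib["year"])
```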
--------------------------------------------------------------------------------
/easy_literature/easyliter.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import argparse
3 | import os
4 |
5 | from .utils import patternRecognizer, note_modified, get_pdf_paths, get_pdf_paths_from_notes, get_update_content, get_pdf_paths_from_notes_dict
6 |
7 | logging.basicConfig()
8 | logger = logging.getLogger('easyliter')
9 | logger.setLevel(logging.INFO)
10 |
11 |
12 |
13 |
14 | def set_args():
15 | parser = argparse.ArgumentParser(description='EasyLiterature')
16 | parser.add_argument('-i', '--input', required=True, type=str, default=None,
17 | help="The path to the note file or note file folder.")
18 | parser.add_argument('-o', '--output', type=str, default=None,
19 | help='Folder path to save paper pdfs and images. NOTE: MUST BE FOLDER.')
20 | parser.add_argument('-p', '--proxy', type=str, default=None,
21 |                         help='The proxy address. e.g. 127.0.0.1:1080. If this argument is specified, Google Scholar will automatically use a free proxy (not necessarily the specified proxy address). To use other proxies for Google Scholar, specify the -gp option. If you want to set up the proxies manually, change the behaviour in GoogleScholar.set_proxy(). See more at https://scholarly.readthedocs.io/en/stable/ProxyGenerator.html.')
22 | parser.add_argument('-gp', '--gproxy_mode', type=str, default='free',
23 |                         help="The proxy type used for scholarly: free, single, or Scraper. ('free' automatically chooses a free proxy address, which may not be fast; 'single' uses the proxy address you specify with -p; 'Scraper' uses ScraperAPI, which is not free and requires an API key.)")
24 | parser.add_argument('-d', '--delete', action='store_true',
25 | help='Delete unreferenced attachments in notes. Use with caution, '
26 | 'when used, -i must be a folder path including all notes.')
27 | parser.add_argument('-m', '--migration', type=str, default=None,
28 | help="The pdf folder path you want to reconnect to.")
29 | args = parser.parse_args()
30 |
31 | return args
32 |
33 | def check_args():
34 | args = set_args()
35 | input_path = args.input
36 | output_path = args.output
37 | delete_bool = args.delete
38 | migration_path = args.migration
39 | proxy = args.proxy
40 | gproxy_mode = args.gproxy_mode
41 |
42 | return input_path, output_path, delete_bool, proxy, migration_path, gproxy_mode
43 |
44 |
45 | def get_bib_and_pdf(note_file, output_path, proxy, paper_recognizer, gproxy_mode):
46 |
47 | pdfs_path = output_path
48 | if not os.path.exists(pdfs_path):
49 | os.makedirs(pdfs_path)
50 |
51 | with open(note_file, 'r') as f:
52 | content = f.read()
53 |
54 | m = paper_recognizer.findall(content)
55 | logger.info("Number of files to download - {}".format(len(m)))
56 |
57 |     if not m:
58 |         logger.info("No valid entry is found in the file {}.".format(note_file))
59 |         return None
60 | 
61 |     return get_update_content(m, note_file, pdfs_path, proxy=proxy, gproxy_mode=gproxy_mode)
62 | 
63 |
64 |
65 | def file_update(input_path, output_path, proxy, paper_recognizer, gproxy_mode):
66 |
67 | replace_dict = get_bib_and_pdf(input_path, output_path,
68 | proxy, paper_recognizer, gproxy_mode)
69 |
70 | if replace_dict:
71 | note_modified(paper_recognizer, input_path, **replace_dict)
72 |
73 |
74 | def main():
75 | input_path, output_path, delete_bool, proxy, migration_path, gproxy_mode = check_args()
76 |
77 | if output_path:
78 | paper_recognizer = patternRecognizer(r'- \{.{3,}\}')
79 |
80 | if os.path.isfile(input_path):
81 | logger.info("Updating the file {}".format(input_path))
82 | file_update(input_path, output_path, proxy, paper_recognizer, gproxy_mode)
83 |
84 | elif os.path.isdir(input_path):
85 | note_paths = []
86 | for root, _, files in os.walk(input_path):
87 | for file in files:
88 | if file.lower().endswith('md') or file.lower().endswith('markdown'):
89 | note_paths.append(os.path.join(root, file))
90 | for note_path in note_paths:
91 | logger.info("Updating the file {}".format(note_path))
92 | file_update(note_path, output_path, proxy, paper_recognizer, gproxy_mode)
93 | else:
94 | logger.info("input path {} does not exist".format(input_path))
95 |
96 |
97 | # Delete unreferenced attachments
98 | if delete_bool:
99 | if os.path.isfile(input_path):
100 |             logger.info("To delete PDF files that are not referenced by the notes, the input path (-i) must be the main notes folder. Please use this option with caution!")
101 | else:
102 | pdf_path_recognizer = patternRecognizer(r'\[pdf\]\(.{5,}\.pdf\)')
103 | pdf_paths_in_notes = get_pdf_paths_from_notes(input_path, pdf_path_recognizer)
104 | pdf_paths = get_pdf_paths(output_path)
105 |             # TODO: path separators may differ between macOS/Linux and Windows ("/" vs "\\")
106 | pdf_paths_in_notes = [os.path.abspath(i).replace('\\', '/') for i in pdf_paths_in_notes]
107 | pdf_paths = [os.path.abspath(i).replace('\\', '/') for i in pdf_paths]
108 |
109 | removed_pdf_paths = list(set(pdf_paths) - set(pdf_paths_in_notes))
110 | try:
111 | for pdf_p in removed_pdf_paths:
112 | os.remove(pdf_p)
113 | except:
114 | pass
115 |
116 | logger.info("Deleted {} files".format(len(removed_pdf_paths)))
117 |
118 |
119 | if migration_path:
120 | pdf_path_recognizer = patternRecognizer(r'\[pdf\]\(.{5,}\.pdf\)')
121 |
122 | pdf_paths = get_pdf_paths(migration_path)
123 | pdf_paths_in_notes = get_pdf_paths_from_notes_dict(input_path, pdf_path_recognizer)
124 |
125 | # match based on paper title
126 | matched_numb = 0
127 | pdf_paths_dict = {os.path.basename(i): i for i in pdf_paths}
128 | for md_file, pdf_paths_ in pdf_paths_in_notes.items():
129 |
130 | pdf_paths_in_notes_dict = {os.path.basename(i): i for i in pdf_paths_}
131 | matched_pdfs = pdf_paths_dict.keys() & pdf_paths_in_notes_dict.keys()
132 |
133 | matched_numb += len(matched_pdfs)
134 |
135 | replace_paths_dict = {}
136 | for matched in matched_pdfs:
137 | replaced_str = os.path.relpath(pdf_paths_dict[matched], md_file).split('/',1)[-1]
138 | replaced_str = "[pdf]({})".format(replaced_str)
139 | ori_str = "[pdf]({})".format(pdf_paths_in_notes_dict[matched])
140 | replace_paths_dict[ori_str] = replaced_str
141 |
142 | if replace_paths_dict:
143 | note_modified(pdf_path_recognizer, md_file, **replace_paths_dict)
144 |
145 | logger.info("Found - {} - pdf files".format(matched_numb))
146 |
147 |
148 | if not output_path and not migration_path:
149 |         logger.info("Missing the -o or -m argument; use -h to see the help.")
150 |
151 |
152 | if __name__ == "__main__":
153 | main()
--------------------------------------------------------------------------------
/easy_literature/medbiorxiv.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import requests
3 | from bs4 import BeautifulSoup
4 |
5 | from .crossref import crossrefInfo
6 |
7 | logging.basicConfig()
8 | logger = logging.getLogger('biorxiv')
9 | logger.setLevel(logging.DEBUG)
10 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'}
11 |
12 | class BMxivInfo(object):
13 | def __init__(self):
14 | self.sess = requests.Session()
15 | self.sess.headers = HEADERS
16 | self.base_url = "https://api.biorxiv.org/details/"
17 | self.servers = ["biorxiv", "medrxiv"]
18 |
19 |
20 | def set_proxy(self, proxy=False):
21 | """set proxy for session
22 |
23 | Args:
24 |             proxy (str): The proxy address, e.g. 127.0.0.1:1123
25 | Returns:
26 | None
27 | """
28 | if proxy:
29 | self.sess.proxies = {
30 | "http": proxy,
31 | "https": proxy, }
32 |
33 |
34 | def extract_json_info(self, item):
35 | """Extract bib json information from requests.get().json()
36 |
37 | Args:
38 | item (json object): obtained by requests.get().json()
39 |
40 | Returns:
41 | A dict containing the paper information.
42 | """
43 | paper_url = f"https://www.biorxiv.org/content/{item['doi']}"
44 | title = item["title"]
45 | journal = item["server"]
46 | published = item["date"].split('-')
47 | if len(published) > 1:
48 | year = published[0]
49 | else:
50 | year = ' '
51 |
52 | authors = item['authors'].split("; ")
53 | if len(authors) > 0:
54 | authors = " and ".join([author for author in authors])
55 | else:
56 | authors = authors
57 |
58 | bib_dict = {
59 | "title": title,
60 | "author": authors,
61 | "journal": journal,
62 | "year": year,
63 | "url": paper_url,
64 | "pdf_link": f"{paper_url}.full.pdf",
65 | "cited_count": None
66 | }
67 |
68 | return bib_dict
69 |
70 |
71 | def get_info_by_bmrxivid(self, bmrxivid):
72 | """Get the meta information by the given paper biorxiv_id or medrxiv_id.
73 |
74 | Args:
75 |             bmrxivid (str): The biorxiv or medrxiv id
76 |
77 | Returns:
78 | A dict containing the paper information.
79 | {
80 | "title": xxx,
81 | "author": xxx,
82 | "journal": xxx,
83 | etc
84 | }
85 | OR
86 | None
87 | """
88 | urls = [self.base_url + server + "/" + bmrxivid for server in self.servers]
89 | for url in urls:
90 | try:
91 | r = self.sess.get(url)
92 |
93 | bib = r.json()['collection'][-1]
94 |
95 | if "published" in bib.keys() and bib['published'] != "NA":
96 | doi = bib["published"]
97 | print(doi)
98 | crossref_info = crossrefInfo()
99 | if len(self.sess.proxies) > 0:
100 | crossref_info.set_proxy(self.sess.proxies['http'].split('//')[-1])
101 | return crossref_info.get_info_by_doi(doi)
102 |
103 | return self.extract_json_info(bib)
104 |
105 | except:
106 |                 logger.error("Failed to retrieve information for id: {}.".format(bmrxivid))
107 |
108 |
109 | def get_info_by_title(self, title):
110 | """Get the meta information by the given paper title.
111 |
112 | Args:
113 |             title (str): The paper title
114 |
115 | Returns:
116 | A dict containing the paper information.
117 | {
118 | "title": xxx,
119 | "author": xxx,
120 | "journal": xxx,
121 | etc
122 | }
123 | OR
124 | None
125 | OR
126 | A list [{}, {}, {}]
127 | """
128 |         base_url = "https://www.biorxiv.org/search/{}%20jcode%3Amedrxiv%7C%7Cbiorxiv%20numresults%3A25%20sort%3Arelevance-rank%20format_result%3Astandard"
129 | query = title.replace(' ', '%252B')
130 |
131 | url = base_url.format(query)
132 | try:
133 | result = self.sess.get(url)
134 | soup = BeautifulSoup(result.content, "lxml")
135 | soup_items = soup.find_all("div",class_="highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix")
136 |
137 | soup_dict = dict()
138 | for sp in soup_items:
139 | key = sp.find("a", class_="highwire-cite-linked-title").span.text
140 | value = sp.find("span", class_="highwire-cite-metadata-doi highwire-cite-metadata").text.split("org/")[-1].split("v")[0].replace(" ", "")
141 | soup_dict[key] = value
142 |
143 | for item_title, item_doi in soup_dict.items():
144 | try:
145 | item_title = item_title.decode("utf-8")
146 | except:
147 | pass
148 |
149 | if item_title.lower() == title.lower():
150 | return self.get_info_by_bmrxivid(item_doi)
151 |
152 | return [self.get_info_by_bmrxivid(it) for it in soup_dict.values()]
153 | except:
154 |             logger.error("Failed to retrieve information for title: {}.".format(title))
155 |
156 |
157 | if __name__ == "__main__":
158 |
159 | arxivId = "10.1101/2022.07.28.22277637"
160 | # title = "Oxygen restriction induces a viable but non-culturable population in bacteria"
161 | # title = "A molecular atlas of the human postmenopausal fallopian tube and ovary from single-cell RNA and ATAC sequencing"
162 | # title = "Radiographic Assessment of Lung Edema (RALE) Scores are Highly Reproducible and Prognostic of Clinical Outcomes for Inpatients with COVID-19"
163 | # title = "Untargeted metabolomics of COVID-19 patient serum reveals potential prognostic markers of both severity and outcome"
164 |
165 | arxiv_info = BMxivInfo()
166 | arxiv_info.set_proxy(proxy="127.0.1:1123")
167 |
168 | bib_arxiv = arxiv_info.get_info_by_bmrxivid(arxivId)
169 | # bib_title = arxiv_info.get_info_by_title(title)
170 |
171 | print(bib_arxiv)
172 | print("\n")
173 | # print(bib_title)
--------------------------------------------------------------------------------
/easy_literature/pdfs.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import requests
3 | from urllib.parse import urlunsplit, urlsplit
4 | from bs4 import BeautifulSoup
5 |
6 | logging.basicConfig()
7 | logger = logging.getLogger('PDFs')
8 | logger.setLevel(logging.DEBUG)
9 | HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'}
10 |
11 |
12 | class pdfDownload(object):
13 | def __init__(self):
14 | self.sess = requests.Session()
15 | self.sess.headers = HEADERS
16 |
17 | def set_proxy(self, proxy=None):
18 | """set proxy for session
19 |
20 | Args:
21 |             proxy (str): The proxy address, e.g. 127.0.0.1:1123
22 | Returns:
23 | None
24 | """
25 | if proxy:
26 | self.sess.proxies = {
27 | "http": proxy,
28 | "https": proxy, }
29 |
30 |
31 | def _get_available_scihub_urls(self):
32 | '''
33 | Finds available scihub urls via https://lovescihub.wordpress.com/ or
34 | https://sci-hub.now.sh/
35 | '''
36 | urls = []
37 | res = self.sess.get('https://lovescihub.wordpress.com/')
38 | s = BeautifulSoup(res.content, 'html.parser')
39 | for a in s.find('div', class_="entry-content").find_all('a', href=True):
40 | if 'sci-hub.' in a['href']:
41 | urls.append(a['href'])
42 | return urls
43 |
44 |
45 | def fetch(self, url, auth=None):
46 | '''Fetch pdf
47 |
48 | Args:
49 |             url (str): The url of the pdf file
50 |
51 | Returns:
52 | A dict OR None
53 | '''
54 | try:
55 | r = self.sess.get(url, auth=auth)
56 |
57 | if r.headers["Content-Type"] != "application/pdf":
58 | logger.info("Failed to fetch pdf with url: {}".format(url))
59 | else:
60 | return {
61 | 'pdf': r.content,
62 | 'url': url
63 | }
64 | except:
65 | logger.error("Failed to open url: {}".format(url))
66 |
67 |
68 | def get_pdf_from_direct_url(self, url, auth=None):
69 | return self.fetch(url, auth=auth)
70 |
71 |
72 | def get_pdf_from_sci_hub(self, identifier, auth=None):
73 | '''Fetch pdf from sci-hub based on doi or url
74 |
75 | Args:
76 | identifier (str): DOI or url
77 | auth (tuple): ("user", "passwd")
78 |
79 | Returns:
80 | A dict OR None
81 | '''
82 | for base_url in self._get_available_scihub_urls():
83 | r = self.sess.get(base_url + '/' + identifier, auth=auth)
84 | soup = BeautifulSoup(r.content, 'html.parser')
85 |
86 | pdf_div_names = ['iframe', 'embed']
87 | for pdf_div_name in pdf_div_names:
88 | pdf_div = soup.find(pdf_div_name)
89 |                 if pdf_div is not None:
90 | break
91 | try:
92 | url_parts = urlsplit(pdf_div.get('src'))
93 | if url_parts[1]:
94 | if url_parts[0]:
95 | pdf_url = urlunsplit((url_parts[0], url_parts[1], url_parts[2], '', ''))
96 | else:
97 | pdf_url = urlunsplit(('https', url_parts[1], url_parts[2], '', ''))
98 | else:
99 | pdf_url = urlunsplit(('https', urlsplit(base_url)[1], url_parts[2], '', ''))
100 |
101 | return self.fetch(pdf_url, auth)
102 | except:
103 | pass
104 |
105 | logger.info("Failed to fetch pdf with all sci-hub urls")
106 |
107 | def _save(self, content, path):
108 | with open(path, "wb") as f:
109 | f.write(content)
110 |
111 |
112 | if __name__ == "__main__":
113 | doi = "10.1145/3308558.3313562"
114 |
115 | pdf_download = pdfDownload()
116 | pdf_download.set_proxy("127.0.1:1123")
117 |
118 | pdf_dict = pdf_download.get_pdf_from_sci_hub(doi)
119 | if pdf_dict:
120 | print(pdf_dict['url'])
121 |         pdf_download._save(pdf_dict['pdf'], "/home/admin/tmp.pdf")
122 |
123 | # pdf_dict2 = pdf_download.get_pdf_from_direct_url("https://arxiv.org/pdf/2208.05419.pdf")
124 | # if pdf_dict2:
125 | # print(pdf_dict2['url'])
126 |     #     pdf_download._save(pdf_dict2['pdf'], "/home/admin/tmp2.pdf")
127 |
128 |
--------------------------------------------------------------------------------
/easy_literature/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import re
4 | from tqdm import tqdm
5 | from .downloads import get_paper_info_from_paperid, get_paper_pdf_from_paperid, classify
6 |
7 |
8 | logging.basicConfig()
9 | logger = logging.getLogger('utils')
10 | logger.setLevel(logging.INFO)
11 |
12 |
13 | class patternRecognizer(object):
14 | def __init__(self, regular_rule):
15 | self.pattern = re.compile(regular_rule)
16 |
17 | def match(self, string):
18 | return self.pattern.match(string)
19 |
20 | def findall(self, string):
21 | return self.pattern.findall(string)
22 |
23 | def multiple_replace(self, content, **replace_dict):
24 | def replace_(value):
25 | match = value.group()
26 | if match in replace_dict.keys():
27 | return replace_dict[match]
28 | else:
29 |                 return match + " **Not correct. Check it; a manual update & download may be needed.**"
30 |
31 | replace_content = self.pattern.sub(replace_, content)
32 |
33 | return replace_content
34 |
35 |
36 | def note_modified(pattern_recog, md_file, **replace_dict):
37 | with open(md_file, 'r') as f:
38 | content = f.read()
39 |
40 | replaced_content = pattern_recog.multiple_replace(content, **replace_dict)
41 |
42 | with open(md_file, 'w') as f:
43 | f.write(''.join(replaced_content))
44 |
45 |
46 | def get_pdf_paths(pdf_root):
47 | pdf_paths = []
48 | for root, _, files in os.walk(pdf_root):
49 | for file in files:
50 | if file.lower().endswith('.pdf'):
51 | pdf_paths.append(os.path.join(root, file))
52 |
53 | return pdf_paths
54 |
55 |
56 | def get_pdf_paths_from_notes(md_root, reg):
57 |
58 | md_files = []
59 | for root, _, files in os.walk(md_root):
60 | for file in files:
61 | if file.lower().endswith('md') or file.lower().endswith('markdown'):
62 | md_files.append(os.path.join(root, file))
63 |
64 | pdf_paths_from_notes = []
65 | for md_file in md_files:
66 | with open(md_file, 'r') as f:
67 | content = f.read()
68 | m = reg.findall(content)
69 | m = [i.split("(")[-1].split(')')[0] for i in m]
70 | pdf_paths_from_notes.extend(m)
71 |
72 | return pdf_paths_from_notes
73 |
74 |
75 | def get_pdf_paths_from_notes_dict(md_root, reg):
76 | pdf_paths_from_notes_dict = {}
77 | if os.path.isdir(md_root):
78 | md_files = []
79 | for root, _, files in os.walk(md_root):
80 | for file in files:
81 | if file.lower().endswith('md') or file.lower().endswith('markdown'):
82 | md_files.append(os.path.join(root, file))
83 |
84 | for md_file in md_files:
85 | with open(md_file, 'r') as f:
86 | content = f.read()
87 | m = reg.findall(content)
88 | m = [i.split("(")[-1].split(')')[0] for i in m]
89 | pdf_paths_from_notes_dict[md_file] = m
90 | else:
91 | with open(md_root, 'r') as f:
92 | content = f.read()
93 | m = reg.findall(content)
94 | m = [i.split("(")[-1].split(')')[0] for i in m]
95 | pdf_paths_from_notes_dict[md_root] = m
96 |
97 | return pdf_paths_from_notes_dict
98 |
99 |
100 | def classify_identifier(identifier):
101 |     """Return True if the note entry uses double braces ({{...}}), meaning the
102 |     PDF should also be downloaded; single-brace entries only fetch metadata."""
103 | if identifier.endswith("}}"):
104 | return True
105 | else:
106 | return False
107 |
108 |
109 | def get_update_content(m, note_file, pdfs_path, proxy, gproxy_mode):
110 |
111 | replace_dict = dict()
112 | for literature in tqdm(m):
113 | pdf = classify_identifier(literature)
114 |
115 | literature_id = literature.split('{')[-1].split('}')[0]
116 | bib = get_paper_info_from_paperid(literature_id, proxy=proxy, gproxy_mode=gproxy_mode)
117 |
118 | if bib:
119 | try:
120 | pdf_name = bib['title']
121 |                 # remove whitespace characters such as \n, \t, \r
122 | pdf_name = re.sub(r'[\n\t\r]', '', pdf_name)
123 | # remove multiple blank spaces
124 | pdf_name = re.sub(r' +', ' ', pdf_name)
125 | pdf_name = re.sub(r'[.]', '', pdf_name)
126 |
127 | pdf_name = '_'.join(pdf_name.split(' ')) + '.pdf'
128 |
129 | # remove the special characters in the pdf name: / \ : * ? " < > |
130 | pdf_name = re.sub(r'[\\/:*?"<>|]', '', pdf_name)
131 | pdf_path = os.path.join(pdfs_path, pdf_name)
132 |
133 | logger.info(f"The pdf path to be saved: {pdf_path}")
134 | if pdf:
135 | id_type = classify(literature_id)
136 | if id_type == "title":
137 | for pattern_str in [r'10\.(?!1101)[0-9]{4}/', r'10\.1101/', r'[0-9]{2}[0-1][0-9]\.[0-9]{3,}', r'.*/[0-9]{2}[0-1][0-9]{4}']:
138 | res = re.search(pattern_str, bib['url']) # search for the arxiv id in the url
139 | if res:
140 | literature_id = res.group(0)
141 | if bib['pdf_link'] is None:
142 | bib['pdf_link'] = f'https://arxiv.org/pdf/{literature_id}.pdf'
143 | logger.info(f"The paper's arxiv url: {bib['url']}; The converted arxiv id: {literature_id}; The pdf link: {bib['pdf_link']}.")
144 | if not os.path.exists(pdf_path):
145 | logger.info(f"PDF link: {bib['pdf_link']}")
146 | get_paper_pdf_from_paperid(literature_id, pdf_path, direct_url=bib['pdf_link'], proxy=proxy)
147 | if not os.path.exists(pdf_path):
148 | get_paper_pdf_from_paperid(literature_id, pdf_path, proxy=proxy)
149 | else:
150 | if not os.path.exists(pdf_path):
151 | logger.info(f"PDF link: {bib['pdf_link']}")
152 | get_paper_pdf_from_paperid(literature_id, pdf_path, direct_url=bib['pdf_link'], proxy=proxy)
153 | if not os.path.exists(pdf_path):
154 | get_paper_pdf_from_paperid(literature_id, pdf_path, proxy=proxy)
155 | if os.path.exists(pdf_path):
156 |                     replaced_literature = "- **{}**. {} et al. **{}**, **{}**, **Number of Citations: **{}, ([pdf]({}))([link]({})).".format(
157 | bib['title'], bib["author"].split(" and ")[0], bib['journal'],
158 | bib['year'], bib['cited_count'], os.path.relpath(pdf_path, note_file).split('/',1)[-1],
159 | bib['url'])
160 | else:
161 |                     logger.info("Cannot find a download source for literature id {}. You may need to download this paper manually; a template entry has been generated in the markdown file. Put the pdf file into the folder you specified and add its file name to the '(pdf)' part of your markdown entry.".format(literature_id))
162 |                     replaced_literature = "- **{}**. {} et al. **{}**, **{}**, **Number of Citations: **{}, ([pdf]({}))([link]({})).".format(
163 | bib['title'], bib["author"].split(" and ")[0], bib['journal'],
164 | bib['year'], bib['cited_count'], f'{pdfs_path}/your_pdf_name.pdf', bib['url']
165 | )
166 | replace_dict[literature] = replaced_literature
167 | except:
168 |
169 |                 logger.info("Failed to process literature id {}. You may need to download this paper manually; a template entry has been generated in the markdown file. Put the pdf file into the folder you specified and add its file name to the '(pdf)' part of your markdown entry.".format(literature_id))
170 |                 replaced_literature = "- **{}**. {} et al. **{}**, **{}**, **Number of Citations: **{}, ([pdf]({}))([link]({})).".format(
171 | bib['title'], bib["author"].split(" and ")[0], bib['journal'],
172 | bib['year'], bib['cited_count'], f'{pdfs_path}/your_pdf_name.pdf', bib['url']
173 | )
174 | replace_dict[literature] = replaced_literature
175 | else:
176 |             logger.info("Cannot find the literature {}. You may need to download this paper manually; a template entry has been generated in the markdown file. Put the pdf file into the folder you specified and add its file name to the '(pdf)' part of your markdown entry.".format(literature_id))
177 | replaced_literature = "- **{}**. ([pdf]({})).".format(
178 | literature_id, f'{pdfs_path}/your_pdf_name.pdf'
179 | )
180 | replace_dict[literature] = replaced_literature
181 | return replace_dict
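A small illustration (not part of the repository) of how the helpers above parse a note entry, using the entry pattern from `easyliter.py`:

```python
from easy_literature.utils import patternRecognizer, classify_identifier

recognizer = patternRecognizer(r'- \{.{3,}\}')   # the entry pattern used in easyliter.py
entry = recognizer.findall("- {{Heterogeneous Graph Attention Network}}")[0]

print(classify_identifier(entry))                # True: double braces -> also download the PDF
print(entry.split('{')[-1].split('}')[0])        # the extracted identifier / title
```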
--------------------------------------------------------------------------------
/easyliter.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: easyliter
3 | Version: 1.0.5
4 | Summary: EasyLiterature is an open-sourced, Python-based command line tool for automatic literature management. Simply list the paper titles (or ids) you want to read in a markdown file and it will automatically collect and refine their information in the markdown file, download the pdfs to your local machine, and link the pdfs to your papers in the markdown file. You can forever keep your notes within the pdfs and mds on your local machine or cloud drive.
5 | Home-page: https://github.com/Psycoy/EasyLiterature
6 | Author: Oliver
7 | Author-email: olivernova1998@gmail.com
8 | License: AGPLv3
9 | Keywords: title,bibtex,arxiv,doi,science,scientific-journals
10 | Classifier: License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)
11 | Classifier: Intended Audience :: Science/Research
12 | Classifier: Programming Language :: Python :: 3
13 | Classifier: Topic :: Text Processing :: Markup
14 | Description-Content-Type: text/markdown
15 | License-File: LICENSE
16 |
17 | # EasyLiterature
18 | **EasyLiterature** is a Python-based command line tool for automatic literature management. Welcome star or contribute!
19 |
20 | Simply list the paper titles (or ids) you want to read in a markdown file and it will automatically `collect and refine their information in the markdown file`, `download the pdfs to your local machine`, and `link the pdfs to your papers in the markdown file`. You can forever keep your notes within the pdfs and mds on your local machine or cloud drive.
21 |
22 |
23 |
24 | **A demo of the entries in your markdown note:**
25 |
26 |
27 |
28 |
29 |
30 | Inspired by [Mu Li](https://www.bilibili.com/video/BV1nA41157y4), adapted from [autoLiterature](https://github.com/wilmerwang/autoLiterature).
31 | Compared to autoLiterature, **EasyLiterature** is much easier to use and supports a wider range of features, such as `title-based paper matching`, `paper search and download on Google Scholar and DBLP` (the two main sites for scholars), `citation statistics`, and a `manual information update assistant`. **EasyLiterature covers almost all papers thanks to the support of Google Scholar and DBLP!**
32 |
33 | ___
34 |
35 | **中文版介绍:**
36 |
37 | **EasyLiterature** 是一个基于python的命令行文件管理工具,永久开源,欢迎star或contribute。
38 |
39 | 之前沐神(李沐)做过一期视频讲如何阅读文献和整理,我觉得讲得非常好,[链接](https://www.bilibili.com/video/BV1nA41157y4)。EasyLiterature基本基于沐神所述的这一流程实现,并丰富了其他功能。
40 |
41 | 简单来说,在 Markdown 文件中简单列出想要阅读的论文标题(或ID),它会自动收集并在Markdown文件中完善相关信息,下载论文的PDF到本地机器,并将PDF链接到Markdown文件中的论文。通过这样的流程,我们可以实现永久保存实时编辑的论文PDF和Markdown中的笔记,无论是在本地机器还是云端,并且方便论文一站式分类和管理。
42 |
43 |
44 |
45 | **markdown文件中的论文信息条目(示意):**
46 |
47 |
48 |
49 |
50 |
51 | 与之前的实现相比,EasyLiterature兼容之前实现的所有功能,并且支持更多功能,比如:1. 基于标题的论文匹配;2. Google Scholar和DBLP(全球两大主要paper数据库)的论文搜索和下载;3. 引用统计;4. 手动信息更新助手;5. 容错搜索匹配;等等。之前的实现由于数据库的限制,很多文章都找不到。**EasyLiterature得益于增加了Google Scholar和DBLP的支持,几乎覆盖了所有论文!**
52 |
53 |
54 |
55 | ## 1. A Simple Usage Example (一个简单的使用示例)
56 | 1. Have Python (preferably >= 3.7) installed on your local machine.
57 | 2. Run `pip install easyliter` in your command line to install.
58 | 3. Prepare your markdown note file (e.g., `Note.md`).
**Attention:** You may need to download a markdown editor to create/edit this file. I am using [Typora](https://typora.io/), which is not totally free. You can also choose other alternatives.
59 | 4. List the formatted paper titles in your markdown note file according to Section 4 below (Recognition Rules), e.g.,
60 | \- {{BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.}}
61 | \- {{Xlnet: Generalized autoregressive pretraining for language understanding.}}
62 | **(pay attention to the space after ‘\-’)**
63 | 5. Create a folder to store the downloaded pdfs (e.g., `PDFs/`).
64 | 6. Run `easyliter -i <path to your md file> -o <path to your pdf folder>`.
65 | 
(Replace `<path to your md file>` with the actual path to your markdown note file, and `<path to your pdf folder>` with the actual path to your pdf folder.)
66 |
e.g., `easyliter -i "/home/Note.md" -o "/home/PDFs"`
67 | 7. You should be able to see the updated information and the downloaded pdf files if no error is reported.
68 | 8. This is a simple and common use case; a consolidated command sketch of these steps is shown below. For other features, please read the sections below carefully and follow the instructions.
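To make the steps concrete, here is a minimal end-to-end sketch of steps 2–6 as shell commands. The file and folder names (`Note.md`, `PDFs/`) are just the example names used above, and the heredoc is only one of many ways to create the note file (any markdown editor works):

```bash
# Step 2: install the tool
pip install easyliter

# Steps 3-4: create a note file listing the papers to fetch
cat > Note.md << 'EOF'
- {{BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.}}
- {{Xlnet: Generalized autoregressive pretraining for language understanding.}}
EOF

# Step 5: create a folder for the downloaded PDFs
mkdir -p PDFs

# Step 6: collect the information and download the PDFs
easyliter -i Note.md -o PDFs
```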
69 |
87 |
88 | ## 2. Install
89 | ### pip install
90 | ```bash
91 | pip install easyliter
92 | # or
93 | pip3 install easyliter
94 | ```
95 |
96 | ### Install from source (to get the up-to-date version)
97 | ```bash
98 | git clone https://github.com/Psycoy/EasyLiterature.git
99 | cd EasyLiterature
100 | pip install -e .
101 | ```
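Either way, a quick sanity check is to print the built-in help, which also lists the options described in the next section:

```bash
# Print the usage message to confirm that the easyliter entry point is installed
easyliter -h
```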
102 |
103 | ## 3. Arguments
104 | ```bash
105 | easyliter
106 |
107 | optional arguments:
108 |
109 | -h, --help show this help message and exit
110 |
111 | -i INPUT, --input INPUT
112 | The path to the note file or note file folder.
113 |
114 | -o OUTPUT, --output OUTPUT
115 | Folder path to save paper pdfs and images. NOTE: MUST BE A FOLDER.
116 |
117 | -p PROXY, --proxy PROXY
118 | The proxy, e.g., 127.0.0.1:1080. If this argument is specified, Google Scholar will automatically use a free proxy (not necessarily the specified proxy address). To use other proxies for Google Scholar, specify the -gp option. If you want to set up the proxies manually, change the behaviour in GoogleScholar.set_proxy(). See more at https://scholarly.readthedocs.io/en/stable/ProxyGenerator.html.
119 |
120 | -gp GPROXY_MODE, --gproxy_mode GPROXY_MODE
121 | The proxy type used for scholarly, e.g., free, single, Scraper. (Note: "free" will automatically choose a free proxy address to use, which costs nothing but may not be fast; "single" will use the proxy address you specify; "Scraper" is not free to use and requires buying an API key.)
122 |
123 | -d, --delete
124 | Delete unreferenced attachments in notes. Use with caution: when used, -i must be a folder path that includes all notes.
125 |
126 | -m MIGRATION, --migration MIGRATION
127 | The pdf folder path you want to reconnect to.
128 | ```
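As an illustration of the proxy options, the commands below add them to a normal run. This is only a sketch: `Note.md` and `PDFs` are example paths, `127.0.0.1:1080` is the example address from the help text above, and it is assumed here (not stated explicitly in the help) that the `single` mode picks up the address supplied via `-p`:

```bash
# General proxy option; per the help text, Google Scholar will then
# fall back to an automatically chosen free proxy
easyliter -i Note.md -o PDFs -p 127.0.0.1:1080

# Ask scholarly to use the single, explicitly specified proxy instead
easyliter -i Note.md -o PDFs -p 127.0.0.1:1080 -gp single
```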
129 |
130 |
131 | ## 4. Recognition Rules:
132 | - If the notes file contains `- {paper_id}`, it will download the information of that literature, but not the PDF.
133 | - If the notes file contains `- {{paper_id}}`, it will download both the information of that literature and the PDF.
134 |
135 | - Note: `paper_id` supports the `article title`, a published article's `doi`, and a preprint's `arxiv_id`, `biorxiv_id`, or `medrxiv_id`. EasyLiterature tries all the possible sources online and automatically identifies the papers to collect and download from multiple databases, so it covers almost all existing papers (see the example entries below).
136 | 
143 |
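To illustrate these rules, the lines below append one info-only entry and two info-plus-PDF entries to `Note.md`. The identifiers are examples only: `1810.04805` is the arXiv id of the BERT paper, and `10.1000/182` is a placeholder DOI used to show the format; the exact id strings the tool accepts follow the rules above.

```bash
# Single braces: fetch the paper information only (no PDF)
echo '- {1810.04805}' >> Note.md

# Double braces: fetch the information and download the PDF
echo '- {{Xlnet: Generalized autoregressive pretraining for language understanding.}}' >> Note.md
echo '- {{10.1000/182}}' >> Note.md   # placeholder DOI, replace with a real one
```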
144 |
145 | ## 5. Usage
146 | ### 5.1. Basic Usage
147 | Assuming `input` is the folder path of the literature notes (.md files) and `output` is the folder path where you want to save the PDFs:
148 | 
150 |
151 | ```bash
152 | # Update all md files in the input folder
154 | easyliter -i input -o output
155 | 
156 | # Only update the input/example.md file
158 | easyliter -i input/example.md -o output
159 | 
160 | # -d is optional: when -i is a folder path, -d deletes pdf files in the PDF folder that are not referenced by the literature notes
162 | easyliter -i input -o output -d
163 | ```
164 |
165 | ### 5.2. Migrating Notes and PDF Files
166 | When you need to move the literature notes or the PDF folder, the PDF links in the notes may break. You can use `-m` to re-link the PDF files with the literature notes.
167 | 
169 |
170 | ```bash
171 | # Re-link all md files in the input folder to the moved PDF folder
173 | easyliter -i input -m movedPDFs/
174 | 
175 | # Only re-link the input/example.md file
177 | easyliter -i input/example.md -m movedPDFs/
178 | ```
179 |
180 | ## 6. Notes
181 | 
182 | 1. For users in mainland China, the Google Scholar features may need a VPN to work (the citation function is based on Google Scholar). Without a VPN, some features may be unavailable, but the tool remains usable.
183 | 
186 | 2. If Google Scholar is not working (usually because the Google Scholar API has been requested too frequently), try setting a proxy for it. Check the help for the `-p` and `-gp` options using `easyliter -h`, and see the 'Using proxies' section of https://scholarly.readthedocs.io/en/stable/quickstart.html for more details.
187 | 
189 |
--------------------------------------------------------------------------------
/easyliter.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | LICENSE
2 | README.md
3 | setup.py
4 | easy_literature/DBLP.py
5 | easy_literature/GoogleScholar.py
6 | easy_literature/Scholarly.py
7 | easy_literature/__init__.py
8 | easy_literature/arxiv.py
9 | easy_literature/crossref.py
10 | easy_literature/dblp_source.py
11 | easy_literature/downloads.py
12 | easy_literature/easyliter.py
13 | easy_literature/medbiorxiv.py
14 | easy_literature/pdfs.py
15 | easy_literature/utils.py
16 | easyliter.egg-info/PKG-INFO
17 | easyliter.egg-info/SOURCES.txt
18 | easyliter.egg-info/dependency_links.txt
19 | easyliter.egg-info/entry_points.txt
20 | easyliter.egg-info/requires.txt
21 | easyliter.egg-info/top_level.txt
--------------------------------------------------------------------------------
/easyliter.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/easyliter.egg-info/entry_points.txt:
--------------------------------------------------------------------------------
1 | [console_scripts]
2 | easyliter = easy_literature.easyliter:main
3 |
--------------------------------------------------------------------------------
/easyliter.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4>=4.11.1
2 | feedparser>=6.0.10
3 | urllib3>=1.26.11
4 | requests>=2.28.1
5 | tqdm>=4.64.0
6 | Unidecode>=1.3.4
7 | bibtexparser==1.4.0
8 | pandas
9 | scholarly
10 |
--------------------------------------------------------------------------------
/easyliter.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | easy_literature
2 |
--------------------------------------------------------------------------------
/figures/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinjieNi/EasyLiterature/84ee3e8731430756c3b464d5906c8a1c4378e862/figures/.DS_Store
--------------------------------------------------------------------------------
/figures/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JinjieNi/EasyLiterature/84ee3e8731430756c3b464d5906c8a1c4378e862/figures/demo.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4>=4.11.1
2 | feedparser>=6.0.10
3 | urllib3>=1.26.11
4 | requests>=2.28.1
5 | tqdm>=4.64.0
6 | Unidecode>=1.3.4
7 | pandas
8 | scholarly
9 | bibtexparser==1.4.0
10 | socksio
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | with open('README.md', 'r', encoding='UTF-8') as f:
4 | README_MD = f.read()
5 |
6 | setup(
7 | name="easyliter",
8 | version="1.0.5",
9 |     description="EasyLiterature is an open-source, Python-based command-line tool for automatic literature management. Simply list the paper titles (or ids) you want to read in a markdown file and it will automatically collect and refine their information in the markdown file, download the PDFs to your local machine, and link each PDF to its paper in the markdown file. You can forever keep your notes within the pdfs and mds on your local machine or cloud drive.",
10 | long_description=README_MD,
11 | long_description_content_type='text/markdown',
12 | url="https://github.com/Psycoy/EasyLiterature",
13 | classifiers=[
14 | "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
15 | "Intended Audience :: Science/Research",
16 | "Programming Language :: Python :: 3",
17 | "Topic :: Text Processing :: Markup",
18 | ],
19 | install_requires=["beautifulsoup4>=4.11.1", "feedparser>=6.0.10",
20 | "urllib3>=1.26.11","requests>=2.28.1",
21 | "tqdm>=4.64.0", "Unidecode>=1.3.4", "bibtexparser==1.4.0", "pandas", "scholarly"],
22 | entry_points={
23 | "console_scripts": [
24 | "easyliter = easy_literature.easyliter:main",
25 | ]
26 | },
27 | packages=find_packages(),
28 | license="AGPLv3",
29 | author="Oliver",
30 | author_email="jinjieni@outlook.com",
31 | keywords=["title", "bibtex", "arxiv", "doi", "science", "scientific-journals"],
32 | )
33 |
--------------------------------------------------------------------------------