├── .gitignore ├── code ├── zotero.py ├── oadoi.py ├── wikidata.py ├── doaj.py ├── orcid.py └── pyApiToolkit.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | *.pyc 3 | includes 4 | .project 5 | *.pyo 6 | *~ 7 | *.db 8 | staticfiles 9 | .env 10 | *.sublime-project 11 | *.sublime-workspace 12 | data/ 13 | notes.md 14 | include.py 15 | -------------------------------------------------------------------------------- /code/zotero.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | https://github.com/urschrei/pyzotero 6 | https://www.zotero.org/support/dev/client_coding/javascript_api 7 | http://pyzotero.readthedocs.io/en/latest/ 8 | """ 9 | 10 | import pyApiToolkit as at 11 | import os 12 | from pyzotero import zotero 13 | import include 14 | 15 | __author__ = "Stefan Kasberger" 16 | __copyright__ = "Copyright 2017" 17 | __license__ = "MIT" 18 | __version__ = "0.1" 19 | __maintainer__ = "Stefan Kasberger" 20 | __email__ = "mail@stefankasberger.at" 21 | __status__ = "Development" # 'Development', 'Production' or 'Prototype' 22 | 23 | ### GLOBAL ### 24 | 25 | DELAY_TIME = 5 # in seconds 26 | TS = at.get_timestring() 27 | # TS = '2015-10-28-14-59' 28 | 29 | ### FUNCTIONS ### 30 | 31 | ### MAIN ### 32 | 33 | if __name__ == "__main__": 34 | startTime = at.start_timer() 35 | 36 | rootFolder = at.get_root_folder() 37 | config = include.data['zotero'] 38 | at.setup_environment() 39 | data = {} 40 | 41 | # zotero API 42 | library_type = 'user' 43 | #library_type = 'group' 44 | zot = zotero.Zotero(config['zoteroID'], library_type, config['apiKey']) 45 | items = zot.top(limit=5) 46 | 47 | for item in items: 48 | print(item['data']) 49 | 50 | at.stop_timer(startTime) 51 | -------------------------------------------------------------------------------- /code/oadoi.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Documentation: https://oadoi.org/api 6 | """ 7 | 8 | import pyApiToolkit as at 9 | import os 10 | import include 11 | 12 | __author__ = "Stefan Kasberger" 13 | __copyright__ = "Copyright 2017" 14 | __license__ = "MIT" 15 | __version__ = "0.1" 16 | __maintainer__ = "Stefan Kasberger" 17 | __email__ = "mail@stefankasberger.at" 18 | __status__ = "Development" # 'Development', 'Production' or 'Prototype' 19 | 20 | ### GLOBAL ### 21 | 22 | DELAY_TIME = 5 # in seconds 23 | TS = at.get_timestring() 24 | # TS = '2015-10-28-14-59' 25 | 26 | ### FUNCTIONS ### 27 | 28 | def request_dois(dois): 29 | data = {} 30 | 31 | for doi in dois: 32 | data[doi] = at.request_query(baseUrl+doi+'?email='+config['email']) 33 | return data 34 | 35 | def save_to_files(data, rootFolder): 36 | i = 0 37 | for id in data: 38 | at.save_to_json(data[id], rootFolder+'/data/raw/json/oadoi_'+str(i)+'.json') 39 | i+=1 40 | 41 | ### MAIN ### 42 | 43 | if __name__ == "__main__": 44 | startTime = at.start_timer() 45 | 46 | rootFolder = at.get_root_folder() 47 | baseUrl = 'https://api.oadoi.org/' 48 | config = include.data['oadoi'] 49 | at.setup_environment() 50 | 51 | df = at.open_csv(rootFolder+'/data/raw/csv/oadoi.csv') 52 | 53 | # oadoi API 54 | data = request_dois(df['doi']) 55 | 56 | save_to_files(data, rootFolder) 57 | 58 | at.stop_timer(startTime) 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python Open Science API toolkit 2 | Python 3 scripts to access, create, distribute and publish open research data or data about open science works. 3 | 4 | **Requirements** 5 | - Pandas 6 | 7 | 8 | **include.py** 9 | 10 | This is a config file, where you store your personal information in a dictionaries. See more details at the specific wrappers on how to use this. This is the main part of the toolkit, where the used functionalities are developed in. 11 | 12 | ## pyApiToolkit.py 13 | 14 | Basic functionalities, which are used by the other scripts listed further below. 15 | 16 | ## API Wrapper 17 | 18 | ### oadoi.py 19 | 20 | Python wrapper to access the [oadoi.org API](https://oadoi.org/api). 21 | 22 | **include.py** 23 | 24 | ``` 25 | data = { 26 | 'oadoi': { 27 | 'email': 'EMAIL' 28 | } 29 | } 30 | ``` 31 | 32 | ### wikidata.py 33 | Python wrapper to access the [wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) API via the [wikidataintegrator](https://github.com/SuLab/WikidataIntegrator) module (must be installed to work). 34 | 35 | You have to have a wikidata account for this. 36 | 37 | **include.py** 38 | 39 | ``` 40 | data = { 41 | 'wikidata': { 42 | 'user': 'USERNAME', 43 | 'password': 'PASSWORD' 44 | } 45 | } 46 | ``` 47 | 48 | ### orcid.py 49 | Python wrapper to access the [ORCID](https://orcid.org/) API. 50 | 51 | ### zotero.py 52 | 53 | Wrapper for the [Zotero API](https://www.zotero.org/support/dev/client_coding/javascript_api). Ìt uses the [pyZotero](https://github.com/urschrei/pyzotero) python module, which must be installed to work. 54 | 55 | **include.py** 56 | 57 | ``` 58 | data = { 59 | 'zotero': { 60 | 'apiKey': 'API_KEY', 61 | 'zoteroID': 'ZOTEROID' 62 | } 63 | } 64 | ``` 65 | 66 | ### doaj.py 67 | Wrapper to access the [Digital Open Access Journal API](https://doaj.org/api/v1/docs). 68 | 69 | -------------------------------------------------------------------------------- /code/wikidata.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | """ 6 | 7 | import pyApiToolkit as at 8 | import os 9 | from wikidataintegrator import wdi_core, wdi_login 10 | import include 11 | 12 | __author__ = "Stefan Kasberger" 13 | __copyright__ = "Copyright 2017" 14 | __license__ = "MIT" 15 | __version__ = "0.1" 16 | __maintainer__ = "Stefan Kasberger" 17 | __email__ = "mail@stefankasberger.at" 18 | __status__ = "Development" # 'Development', 'Production' or 'Prototype' 19 | 20 | ### GLOBAL ### 21 | 22 | DELAY_TIME = 5 # in seconds 23 | TS = at.get_timestring() 24 | # TS = '2015-10-28-14-59' 25 | 26 | ### FUNCTIONS ### 27 | 28 | def login(user, password): 29 | login_instance = wdi_login.WDLogin(user=user, pwd=password) 30 | 31 | def query_item(itemID): 32 | return wdi_core.WDItemEngine(wd_item_id=itemID) 33 | 34 | def get_data(results): 35 | return results.get_wd_json_representation() 36 | 37 | def write_item(): 38 | # Search for and then edit/create new item 39 | wd_item = wdi_core.WDItemEngine(item_name='', domain='genes', data=[entrez_gene_id]) 40 | wd_item.write(login_instance) 41 | 42 | def save_to_files(data, rootFolder): 43 | for item in data: 44 | at.save_to_json(data[item], rootFolder+'/data/raw/json/wikidata_'+item+'.json') 45 | 46 | ### MAIN ### 47 | 48 | if __name__ == "__main__": 49 | startTime = at.start_timer() 50 | 51 | rootFolder = at.get_root_folder() 52 | config = include.data['wikidata'] 53 | at.setup_environment() 54 | data = {} 55 | 56 | # wikidata API 57 | login(config['user'], config['password']) 58 | 59 | df = at.open_csv(rootFolder+'/data/raw/csv/wikidata.csv') 60 | 61 | for item in df['item']: 62 | results = query_item(item) 63 | data[item] = get_data(results) 64 | 65 | save_to_files(data, rootFolder) 66 | 67 | #at.create_sqlite3_db(rootFolder+'/data/sqlite3/wikidata.db') 68 | 69 | at.stop_timer(startTime) 70 | -------------------------------------------------------------------------------- /code/doaj.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Documentation: https://doaj.org/api/v1/docs 6 | """ 7 | 8 | import pyApiToolkit as at 9 | import os 10 | 11 | __author__ = "Stefan Kasberger" 12 | __copyright__ = "Copyright 2017" 13 | __license__ = "MIT" 14 | __version__ = "0.1" 15 | __maintainer__ = "Stefan Kasberger" 16 | __email__ = "mail@stefankasberger.at" 17 | __status__ = "Development" # 'Development', 'Production' or 'Prototype' 18 | 19 | ### GLOBAL ### 20 | 21 | DELAY_TIME = 5 # in seconds 22 | TS = at.get_timestring() 23 | BASE_URL = 'http://doaj.org/api/v1/' 24 | # TS = '2015-10-28-14-59' 25 | 26 | ### FUNCTIONS ### 27 | 28 | def search_journals(query): 29 | return at.request_query(BASE_URL+'search/journals/'+query) 30 | 31 | def search_articles(query): 32 | return at.request_query(BASE_URL+'search/articles/'+query) 33 | 34 | def retrieve_article(articleId): 35 | return at.request_query(BASE_URL+'articles/'+articleId) 36 | 37 | def retrieve_journal_by_id(journalId): 38 | return at.request_query(BASE_URL+'journals/'+journalId) 39 | 40 | ### MAIN ### 41 | 42 | if __name__ == "__main__": 43 | startTime = at.start_timer() 44 | 45 | rootFolder = at.get_root_folder() 46 | at.setup_environment() 47 | 48 | 49 | # search articles 50 | #df = at.open_csv(rootFolder+'/data/raw/csv/doaj_searcharticles.csv') 51 | query = 'libya' 52 | dataSA = search_articles(query) 53 | at.save_to_json(dataSA, rootFolder+'/data/raw/json/doaj_searcharticles_'+query+'.json') 54 | 55 | # retrieve article 56 | #df = at.open_csv(rootFolder+'/data/raw/csv/doaj_articles.csv') 57 | articleId = '000011857dbc42afb0f1a8c7e35ab46f' 58 | dataRA = retrieve_article(articleId) 59 | at.save_to_json(dataRA, rootFolder+'/data/raw/json/doaj_retrievearticle_'+articleId+'.json') 60 | 61 | # search journals 62 | #df = at.open_csv(rootFolder+'/data/raw/csv/doaj_searchjournals.csv') 63 | query = 'geography' 64 | dataSJ = search_journals(query) 65 | at.save_to_json(dataSJ, rootFolder+'/data/raw/json/doaj_searchjournals_'+query+'.json') 66 | 67 | # retrieve journal by ID 68 | #df = at.open_csv(rootFolder+'/data/raw/csv/doaj_journalID.csv') 69 | #journalId = '2503-250X' 70 | #dataRJ = retrieve_journal_by_id(journalId) 71 | #at.save_to_json(dataRJ, rootFolder+'/data/raw/json/doaj_retrievejournal_'+journalId+'.json') 72 | 73 | at.stop_timer(startTime) 74 | -------------------------------------------------------------------------------- /code/orcid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests 5 | import pyApiToolkit as at 6 | 7 | __author__ = "Stefan Kasberger" 8 | __copyright__ = "Copyright 2017" 9 | __license__ = "MIT" 10 | __version__ = "0.1" 11 | __maintainer__ = "Stefan Kasberger" 12 | __email__ = "mail@stefankasberger.at" 13 | __status__ = "Development" # 'Development', 'Production' or 'Prototype' 14 | 15 | ### GLOBAL ### 16 | 17 | BASE_URL = 'http://pub.orcid.org/' 18 | API_VERSION = 'v1.1/' 19 | 20 | # make request to DKAN API 21 | def request_api(query, datatype): 22 | """ 23 | """ 24 | if datatype == 'json': 25 | acceptkey = 'application/orcid+json' # header: "Accept: application/orcid+json" 26 | elif datatype == 'html': 27 | acceptkey = 'text/html' # header: "Accept: text/html" 28 | else: 29 | acceptkey = 'application/orcid+xml' # header: "Accept: application/orcid+xml" 30 | 31 | headers = {'Accept': acceptkey} 32 | 33 | resp = requests.get(query, headers=headers) 34 | 35 | if resp.status_code != 200: 36 | # This means something went wrong. 37 | raise ApiError('GET /tasks/ {}'.format(resp.status_code)) 38 | 39 | return resp.text 40 | 41 | # get all public fields from the bio 42 | def get_orcidId(orcidId, datatype): 43 | """ 44 | Returns the fields set as "Public" in the bio portion of the ORCID Record 45 | for the scholar represented by the specified orcidId. When used with an 46 | access token and the Member API, limited-access data is also returned. 47 | http://pub.orcid.org/v1.1/0000-0001-7857-2795 48 | """ 49 | return request_api(BASE_URL+API_VERSION+orcidId, datatype) 50 | 51 | # get all public fields from the bio 52 | def get_works(orcidId, datatype): 53 | """ 54 | Returns the "works" research activities that are set as "Public" in the 55 | ORCID Record for the scholar represented by the specified orcidId. When 56 | used with an access token and the Member API, limited-access "works" are 57 | also returned. 58 | http://pub.orcid.org/v1.1/0000-0001-7857-2795/orcid-works 59 | """ 60 | return request_api(BASE_URL+API_VERSION+orcidId+'/orcid-works', datatype) 61 | 62 | # get all public fields from the bio 63 | def get_profile(orcidId, datatype): 64 | """ 65 | Returns the fields set as "Public" in the bio portion of the ORCID Record 66 | for the scholar represented by the specified orcidId. When used with an 67 | access token and the Member API, limited-access data is also returned. 68 | http://pub.orcid.org/v1.1/0000-0001-7857-2795/orcid-profile 69 | """ 70 | return request_api(BASE_URL+API_VERSION+orcidId+'/orcid-profile', datatype) 71 | 72 | def request_ids(orcidIds): 73 | for id in orcidIds: 74 | data[id] = get_profile(id, datatype) 75 | return data 76 | 77 | def save_to_file(data, datatype): 78 | if datatype == 'xml': 79 | for id in data: 80 | at.save_to_file(data[id], rootFolder+'/data/raw/xml/orcid_'+id+'.xml') 81 | 82 | # main 83 | if __name__ == '__main__': 84 | startTime = at.start_timer() 85 | data = {} 86 | rootFolder = at.get_root_folder() 87 | datatype = 'xml' 88 | baseUrl = 'http://pub.orcid.org/' 89 | at.setup_environment() 90 | 91 | df = at.open_csv(rootFolder+'/data/raw/csv/orcid.csv') 92 | 93 | #read_csv 94 | data = request_ids(df['orcid_id']) 95 | 96 | save_to_file(data, datatype) 97 | 98 | at.stop_timer(startTime) 99 | -------------------------------------------------------------------------------- /code/pyApiToolkit.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | """ 6 | 7 | import requests 8 | import json 9 | import csv 10 | import os 11 | import sys 12 | import sqlite3 13 | import pandas as pd 14 | from datetime import datetime 15 | #import xml.etree.ElementTree as ET 16 | #from sqlalchemy import Column, ForeignKey, Integer, String 17 | #from sqlalchemy.exc import SQLAlchemyError 18 | #from sqlalchemy.ext.declarative import declarative_base 19 | #from sqlalchemy.orm import relationship 20 | #from sqlalchemy import create_engine 21 | 22 | 23 | def get_root_folder(): 24 | return os.path.dirname(os.getcwd()) 25 | 26 | def setup_environment(): 27 | """Sets up the folder structure and working environment. 28 | """ 29 | rootFolder = get_root_folder() 30 | folderRawJSON = rootFolder + '/data/raw/json/' 31 | folderRawXML = rootFolder + '/data/raw/xml/' 32 | folderRawCSV = rootFolder + '/data/raw/csv/' 33 | folderSQLite3 = rootFolder + '/data/sqlite3/' 34 | if not os.path.exists(folderRawCSV): 35 | os.makedirs(folderRawCSV) 36 | if not os.path.exists(folderRawJSON): 37 | os.makedirs(folderRawJSON) 38 | if not os.path.exists(folderRawXML): 39 | os.makedirs(folderRawXML) 40 | if not os.path.exists(folderSQLite3): 41 | os.makedirs(folderSQLite3) 42 | 43 | def get_timestring(): 44 | return datetime.now().strftime('%Y-%m-%d-%H-%M') 45 | 46 | def save_to_file(data, filename): 47 | """Saves file on specified place on harddrive. 48 | 49 | Args: 50 | data: string to save. 51 | filename: string of the filepath. 52 | """ 53 | try: 54 | f = open(filename, 'w') 55 | text_file = open(filename, "w") 56 | text_file.write(data) 57 | text_file.close() 58 | except: 59 | print('Error writing', filename) 60 | return False 61 | 62 | 63 | def save_to_json(data, filename): 64 | try: 65 | data = json.dumps(data, indent=2, sort_keys=True) 66 | except: 67 | print('Error opening', filename) 68 | return None 69 | save_to_file(data, filename) 70 | 71 | def read_file(filename): 72 | """Reads file and returns the text. 73 | 74 | Args: 75 | filename: name of the file 76 | 77 | Returns: 78 | string: content of file as string 79 | """ 80 | f = open(filename, 'w') 81 | string = f.read() 82 | 83 | return string 84 | 85 | def create_sqlite3_db(filename): 86 | return sqlite3.connect(filename) 87 | 88 | def execute_sqlite3_query(query): 89 | c = conn.cursor() 90 | c.execute(query) 91 | 92 | def commit_sqlite3(conn): 93 | conn.commit() 94 | 95 | def close_sqlite3_conn(conn): 96 | conn.close() 97 | 98 | def start_timer(): 99 | startTime = datetime.now() 100 | print('start:', startTime) 101 | return startTime 102 | 103 | def stop_timer(startTime): 104 | print('runtime:', (datetime.now() - startTime)) 105 | 106 | def request_query(query): 107 | resp = requests.get(query) 108 | 109 | if resp.status_code != 200: 110 | # This means something went wrong. 111 | raise ApiError('GET /tasks/ {}'.format(resp.status_code)) 112 | 113 | return resp.json() 114 | 115 | def open_csv(filename): 116 | data = [] 117 | df = pd.read_csv(filename) 118 | 119 | return df 120 | 121 | # sqlalchemy http://pythoncentral.io/introductory-tutorial-python-sqlalchemy/ 122 | # http://docs.sqlalchemy.org/en/rel_1_1/ 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | --------------------------------------------------------------------------------