├── .gitignore
├── datakit
│   ├── __init__.py
│   └── datakit.py
├── docker-compose.yml
├── Dockerfile
├── jp
│   └── python
│       ├── stock
│       │   └── ni255_daily.py
│       ├── bank
│       │   └── all.py
│       ├── postalcode
│       │   └── all.py
│       ├── population
│       │   └── yearly.py
│       ├── land
│       │   └── points_yearly.py
│       └── currency
│           └── jpy_usd_daily.py
├── README.md
└── generate.sh

/.gitignore:
--------------------------------------------------------------------------------
1 | /data
2 | /raw_data
3 | __pycache__/
--------------------------------------------------------------------------------
/datakit/__init__.py:
--------------------------------------------------------------------------------
1 | from .datakit import *
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 |   importer:
4 |     build: .
5 |     volumes:
6 |       - .:/src
7 |     working_dir: /src
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM continuumio/miniconda3
2 | 
3 | RUN conda install pandas -y
4 | RUN conda install -c anaconda pytables -y
5 | RUN pip install requests xlrd
6 | 
7 | ENV PYTHONPATH /src
8 | COPY . /src
9 | 
10 | CMD ["tail", "-f", "/dev/null"]
--------------------------------------------------------------------------------
/datakit/datakit.py:
--------------------------------------------------------------------------------
1 | import zipfile
2 | 
3 | import requests
4 | 
5 | 
6 | def download(url, file_path):
7 |     """Download url and write the response body to file_path."""
8 |     r = requests.get(url)
9 |     with open(file_path, 'wb') as file:
10 |         file.write(r.content)
11 | 
12 | 
13 | def unzip(original_file_path, dist_dir):
14 |     """Extract the zip archive at original_file_path into dist_dir."""
15 |     with zipfile.ZipFile(original_file_path, 'r') as zip_ref:
16 |         zip_ref.extractall(dist_dir)
--------------------------------------------------------------------------------
/jp/python/stock/ni255_daily.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import requests
3 | 
4 | URL = 'http://k-db.com/indices/I101?download=csv'
5 | OUTPUT_FILE_PATH = '/src/data/output.csv'
6 | 
7 | # GET raw data
8 | r = requests.get(URL)
9 | with open('/tmp/raw.csv', 'wb') as file:
10 |     file.write(r.content)
11 | 
12 | # CONVERT data into a UTF-8 CSV
13 | with open(OUTPUT_FILE_PATH, 'w', newline='') as output:
14 |     with codecs.open('/tmp/raw.csv', 'r', 'shiftjis') as source:
15 |         output.write(source.read())
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # datakit
2 | datakit is currently a collection of scripts for preprocessing open data. Preprocessing open data is something nearly all of us do, yet even when we share the same goal, the source code is rarely shared. Maintaining that work as an open source project seems like a good idea.
3 | 
4 | We appreciate these values:
5 | - Latest data
6 | - Cleaned data (unixtime, UTF-8, no duplicates, ...)
7 | - Support for major formats (CSV, JSON, XML, HDF)
8 | - Community based
9 | 
10 | # Try it on your local machine
11 |     $ mkdir data
12 |     $ docker-compose run importer bash -l -c "python jp/python/bank/all.py"
13 |     $ cat data/output.csv
14 | 
15 | # Future work
16 | - Provide a framework for preprocessing data.
17 | - Allow users to select which data columns are included and how they are composed.
18 | 
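19 | # Write your own importer
20 | An importer is a plain script that reuses the datakit helpers; generate.sh runs each importer and renames data/output.csv afterwards. Below is a minimal sketch of a new importer (the URL and file names are placeholders, not a real data source):
21 | 
22 |     import datakit
23 | 
24 |     datakit.download('http://example.com/source.zip', '/tmp/raw.zip')
25 |     datakit.unzip('/tmp/raw.zip', '/tmp')
26 |     # clean the extracted file here and write it as a UTF-8 CSV
27 |     # to /src/data/output.csv
28 | 
29 | Put the script under jp/python/<category>/ and add a matching line to generate.sh.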
--------------------------------------------------------------------------------
/jp/python/bank/all.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import csv
3 | 
4 | import datakit
5 | 
6 | URL = 'http://ykaku.com/ginkokensaku/ginkositen.zip'
7 | OUTPUT_FILE_PATH = '/src/data/output.csv'
8 | 
9 | # GET raw data
10 | datakit.download(URL, '/tmp/raw.zip')
11 | 
12 | # UNZIP data
13 | datakit.unzip('/tmp/raw.zip', '/tmp')
14 | 
15 | # CONVERT data into a UTF-8 CSV
16 | with open(OUTPUT_FILE_PATH, 'w', newline='') as output:
17 |     writer = csv.writer(output, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
18 |     with codecs.open('/tmp/ginkositen.txt', 'r', 'shiftjis') as source:
19 |         reader = csv.reader(source, delimiter=',', quotechar='"')
20 |         for row in reader:
21 |             writer.writerow([x.strip() for x in row])
--------------------------------------------------------------------------------
/generate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | mkdir -p data
3 | 
4 | docker-compose run importer bash -l -c "python jp/python/bank/all.py"
5 | mv data/output.csv data/bank_all.csv
6 | 
7 | docker-compose run importer bash -l -c "python jp/python/currency/jpy_usd_daily.py"
8 | mv data/output.csv data/currency_jpy_usd_daily.csv
9 | 
10 | docker-compose run importer bash -l -c "python jp/python/land/points_yearly.py"
11 | mv data/output.csv data/land_points_yearly.csv
12 | 
13 | docker-compose run importer bash -l -c "python jp/python/population/yearly.py"
14 | mv data/output.csv data/population_yearly.csv
15 | 
16 | docker-compose run importer bash -l -c "python jp/python/postalcode/all.py"
17 | mv data/output.csv data/postalcode_all.csv
18 | 
19 | docker-compose run importer bash -l -c "python jp/python/stock/ni255_daily.py"
20 | mv data/output.csv data/stock_ni255_daily.csv
--------------------------------------------------------------------------------
/jp/python/postalcode/all.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import csv
3 | import zipfile
4 | 
5 | import requests
6 | 
7 | URL = 'http://www.post.japanpost.jp/zipcode/dl/kogaki/zip/ken_all.zip'
8 | OUTPUT_FILE_PATH = '/src/data/output.csv'
9 | 
10 | # GET raw data
11 | r = requests.get(URL)
12 | with open('/tmp/raw.zip', 'wb') as file:
13 |     file.write(r.content)
14 | 
15 | # UNZIP data
16 | with zipfile.ZipFile('/tmp/raw.zip', 'r') as zip_ref:
17 |     zip_ref.extractall('/tmp')
18 | 
19 | # CONVERT data into a UTF-8 CSV
20 | with open(OUTPUT_FILE_PATH, 'w', newline='') as output:
21 |     writer = csv.writer(output, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
22 |     with codecs.open('/tmp/KEN_ALL.CSV', 'r', 'shiftjis') as source:
23 |         reader = csv.reader(source, delimiter=',', quotechar='"')
24 |         for row in reader:
25 |             writer.writerow([x.strip() for x in row])
--------------------------------------------------------------------------------
/jp/python/population/yearly.py:
--------------------------------------------------------------------------------
1 | import csv
2 | 
3 | import xlrd
4 | 
5 | # Download "(参考表) 全国人口の推移" (reference table: national population trends)
6 | # from http://www.e-stat.go.jp/SG1/estat/List.do?lid=000001189063
7 | DOWNLOAD_FILE = "./raw_data/05k2-2.xls"
8 | OUTPUT_FILE_PATH = '/src/data/output.csv'
9 | 
10 | # READ the yearly data rows (zero-indexed sheet rows 13-19)
11 | wb = xlrd.open_workbook(DOWNLOAD_FILE)
12 | sh = wb.sheet_by_index(0)
13 | rows = [sh.row_values(rx) for rx in range(13, 20)]
14 | 
15 | # CONVERT data into CSV
16 | with open(OUTPUT_FILE_PATH, 'w', newline='') as output:
17 |     writer = csv.writer(output, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
18 |     writer.writerow(['年', '月初人口', '増減数', '増減率', '出生児数', '死亡者数', '自然増減', '入国者数', '出国者数', '社会増減'])
19 |     for row in rows:
20 |         writer.writerow([row[i] for i in [1, 3, 5, 6, 7, 8, 9, 10, 11, 12]])
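21 | 
22 | # A minimal alternative sketch (untested): the image also ships pandas with
23 | # the xlrd engine (see the Dockerfile), so the same block of rows could be
24 | # read with
25 | #
26 | #     import pandas as pd
27 | #     df = pd.read_excel(DOWNLOAD_FILE, sheet_name=0, header=None,
28 | #                        skiprows=13, nrows=7)
29 | #
30 | # after which the same column selection as above would still apply.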
--------------------------------------------------------------------------------
/jp/python/land/points_yearly.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import csv
3 | import zipfile
4 | 
5 | # Download from http://nlftp.mlit.go.jp/ksj/gml/datalist/KsjTmplt-L01-v2_3.html
6 | DOWNLOAD_FILE = './raw_data/L01-29P-48-01.0a.zip'
7 | OUTPUT_FILE_PATH = '/src/data/output.csv'
8 | 
9 | # UNZIP data
10 | with zipfile.ZipFile(DOWNLOAD_FILE, 'r') as zip_ref:
11 |     zip_ref.extractall('/tmp')
12 | 
13 | # CONVERT data into a UTF-8 CSV
14 | with open(OUTPUT_FILE_PATH, 'w', newline='') as output:
15 |     writer = csv.writer(output, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
16 |     with codecs.open('/tmp/L01-29P-2K.csv', 'r', 'shiftjis') as source:
17 |         try:
18 |             reader = csv.reader(source, delimiter=',', quotechar='"')
19 |             for row in reader:
20 |                 writer.writerow([x.strip() for x in row])
21 |         except UnicodeDecodeError:
22 |             print('UnicodeDecodeError: some rows could not be decoded as Shift_JIS')
--------------------------------------------------------------------------------
/jp/python/currency/jpy_usd_daily.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import csv
3 | from datetime import datetime, timezone
4 | 
5 | import requests
6 | 
7 | # '2014-12-23' => '1419292800' (midnight UTC)
8 | def str2unixtime(s):
9 |     try:
10 |         dt = datetime.strptime(s, '%Y-%m-%d').replace(tzinfo=timezone.utc)
11 |         return str(int(dt.timestamp()))
12 |     except ValueError:
13 |         # e.g. the header row, whose first field is not a date
14 |         return ''
15 | 
16 | # Please replace the api_key below with your own Quandl key
17 | URL = 'https://www.quandl.com/api/v3/datasets/CUR/JPY.csv?api_key=mCkqGja_5orzQJxF5RhQ'
18 | OUTPUT_FILE_PATH = '/src/data/output.csv'
19 | 
20 | # GET raw data
21 | r = requests.get(URL)
22 | with open('/tmp/raw.csv', 'wb') as file:
23 |     file.write(r.content)
24 | 
25 | # CONVERT data into CSV, converting the date column to unixtime
26 | with open(OUTPUT_FILE_PATH, 'w', newline='') as output:
27 |     writer = csv.writer(output, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
28 |     with codecs.open('/tmp/raw.csv', 'r') as source:
29 |         reader = csv.reader(source, delimiter=',', quotechar='"')
30 |         for row in reader:
31 |             writer.writerow([str2unixtime(row[0]), row[1]])
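32 | 
33 | # Each output row is '<unixtime>,<rate>', assuming the Quandl CUR/JPY CSV
34 | # layout of a date in the first column and the rate in the second; the
35 | # header row comes through with an empty first field.
--------------------------------------------------------------------------------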