├── 5-5.ipynb ├── 2-2-1.ipynb ├── check_version.ipynb ├── 3.ipynb ├── 5-1.ipynb ├── explain_loc_iloc_get_loc.ipynb ├── 7-1.ipynb ├── 5-6.ipynb └── 2-2-2.ipynb /5-5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 例5-5. 特徴量ハッシング(別名「ハッシングトリック」)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import json\n", 18 | "\n", 19 | "# 最初の10,000件のレビューを読み込み\n", 20 | "with open('data/yelp/yelp_academic_dataset_review.json') as f:\n", 21 | " js = []\n", 22 | " for i in range(10000):\n", 23 | " js.append(json.loads(f.readline()))\n", 24 | "\n", 25 | "review_df = pd.DataFrame(js)\n", 26 | "# mにbusiness_idのユニーク数を代入\n", 27 | "m = len(review_df['business_id'].unique())\n", 28 | "\n", 29 | "m" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "['9yKzy9PApeiPPOUJEtnvkg',\n", 41 | " 'ZRJwVLyzEJq1VAihDhYiow',\n", 42 | " '6oRAC4uyJCsJl1X0WZpVSA',\n", 43 | " '_1QQZuf4zZOyFCvXc0o6Vg',\n", 44 | " '6ozycU1RpktNG2-1BroVtw']" 45 | ] 46 | }, 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "from sklearn.feature_extraction import FeatureHasher\n", 54 | "h = FeatureHasher(n_features=m, input_type='string')\n", 55 | "f = h.transform(review_df['business_id'])\n", 56 | "\n", 57 | "# 変換後の特徴量が解釈が困難であることを確認\n", 58 | "review_df['business_id'].unique().tolist()[0:5]" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 7, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "array([[0., 0., 0., ..., 0., 0., 0.],\n", 70 | " [0., 0., 0., ..., 0., 0., 0.],\n", 71 | " [0., 0., 0., ..., 0., 0., 0.],\n", 72 | " ...,\n", 
73 | " [0., 0., 0., ..., 0., 0., 0.],\n", 74 | " [0., 0., 0., ..., 0., 0., 0.],\n", 75 | " [0., 0., 0., ..., 0., 0., 0.]])" 76 | ] 77 | }, 78 | "execution_count": 7, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "f.toarray()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 8, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "Our pandas Series, in bytes: 790104\n", 97 | "Our hashed numpy array, in bytes: 56\n" 98 | ] 99 | } 100 | ], 101 | "source": [ 102 | "# 変換後の特徴量のストレージサイズが大きく減っていることを確認\n", 103 | "from sys import getsizeof\n", 104 | "print('Our pandas Series, in bytes: ', getsizeof(review_df['business_id']))\n", 105 | "print('Our hashed numpy array, in bytes: ', getsizeof(f))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.7.0" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 2 137 | } 138 | -------------------------------------------------------------------------------- /2-2-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 例2-1. The Echo Nest データセットの再生回数の二値化" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 4, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/html": [ 18 | "
\n", 19 | "\n", 32 | "\n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | "
012
0b80344d063b5ccb3212f76538f3d9e43d87dca9eSOAKIMP12A8C1309951
1b80344d063b5ccb3212f76538f3d9e43d87dca9eSOAPDEY12A81C210A91
2b80344d063b5ccb3212f76538f3d9e43d87dca9eSOBBMDR12A8C13253B1
3b80344d063b5ccb3212f76538f3d9e43d87dca9eSOBFNSP12AF72A0E221
4b80344d063b5ccb3212f76538f3d9e43d87dca9eSOBFOVM12A58A7D4941
\n", 74 | "
" 75 | ], 76 | "text/plain": [ 77 | " 0 1 2\n", 78 | "0 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOAKIMP12A8C130995 1\n", 79 | "1 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOAPDEY12A81C210A9 1\n", 80 | "2 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBBMDR12A8C13253B 1\n", 81 | "3 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBFNSP12AF72A0E22 1\n", 82 | "4 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBFOVM12A58A7D494 1" 83 | ] 84 | }, 85 | "execution_count": 4, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "import pandas as pd\n", 92 | "listen_count = pd.read_csv('data/millionsong/train_triplets.txt.zip', header=None, delimiter='\\t', compression='zip')\n", 93 | "\n", 94 | "# このデータはユーザID、曲ID、再生回数の3つの列で構成されます。\n", 95 | "# 再生回数 0 を含まないため、単に再生回数の列をすべて 1 で上書きすることで\n", 96 | "# 再生回数を二値化できます。\n", 97 | "listen_count.iloc[:, 2] = 1\n", 98 | "\n", 99 | "listen_count.head()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [] 108 | } 109 | ], 110 | "metadata": { 111 | "kernelspec": { 112 | "display_name": "Python 3", 113 | "language": "python", 114 | "name": "python3" 115 | }, 116 | "language_info": { 117 | "codemirror_mode": { 118 | "name": "ipython", 119 | "version": 3 120 | }, 121 | "file_extension": ".py", 122 | "mimetype": "text/x-python", 123 | "name": "python", 124 | "nbconvert_exporter": "python", 125 | "pygments_lexer": "ipython3", 126 | "version": "3.7.0" 127 | } 128 | }, 129 | "nbformat": 4, 130 | "nbformat_minor": 2 131 | } 132 | -------------------------------------------------------------------------------- /check_version.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 21, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "'1.15.3'" 12 | ] 13 | }, 14 | "execution_count": 21, 15 | 
"metadata": {}, 16 | "output_type": "execute_result" 17 | } 18 | ], 19 | "source": [ 20 | "import numpy as np\n", 21 | "np.version.full_version" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 22, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/plain": [ 32 | "'1.1.0'" 33 | ] 34 | }, 35 | "execution_count": 22, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "import scipy as sp\n", 42 | "sp.version.full_version" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 23, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "text/plain": [ 53 | "'3.0.0'" 54 | ] 55 | }, 56 | "execution_count": 23, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "import matplotlib\n", 63 | "matplotlib.__version__" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 24, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "'0.20.0'" 75 | ] 76 | }, 77 | "execution_count": 24, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "import sklearn\n", 84 | "sklearn.__version__" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 20, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "\n", 97 | "INSTALLED VERSIONS\n", 98 | "------------------\n", 99 | "commit: None\n", 100 | "python: 3.7.0.final.0\n", 101 | "python-bits: 64\n", 102 | "OS: Darwin\n", 103 | "OS-release: 17.7.0\n", 104 | "machine: x86_64\n", 105 | "processor: i386\n", 106 | "byteorder: little\n", 107 | "LC_ALL: None\n", 108 | "LANG: ja_JP.UTF-8\n", 109 | "LOCALE: ja_JP.UTF-8\n", 110 | "\n", 111 | "pandas: 0.23.4\n", 112 | "pytest: None\n", 113 | "pip: 18.0\n", 114 | "setuptools: 40.2.0\n", 115 | "Cython: None\n", 116 | "numpy: 1.15.3\n", 117 | "scipy: 1.1.0\n", 118 | "pyarrow: 
None\n", 119 | "xarray: None\n", 120 | "IPython: 7.0.1\n", 121 | "sphinx: None\n", 122 | "patsy: None\n", 123 | "dateutil: 2.7.3\n", 124 | "pytz: 2018.5\n", 125 | "blosc: None\n", 126 | "bottleneck: None\n", 127 | "tables: None\n", 128 | "numexpr: None\n", 129 | "feather: None\n", 130 | "matplotlib: 3.0.0\n", 131 | "openpyxl: None\n", 132 | "xlrd: None\n", 133 | "xlwt: None\n", 134 | "xlsxwriter: None\n", 135 | "lxml: None\n", 136 | "bs4: None\n", 137 | "html5lib: None\n", 138 | "sqlalchemy: None\n", 139 | "pymysql: None\n", 140 | "psycopg2: None\n", 141 | "jinja2: 2.10\n", 142 | "s3fs: None\n", 143 | "fastparquet: None\n", 144 | "pandas_gbq: None\n", 145 | "pandas_datareader: None\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "import pandas as pd\n", 151 | "pd.show_versions()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "Python 3", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.7.0" 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 2 183 | } 184 | -------------------------------------------------------------------------------- /3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 例3-1. 
nグラムの計算" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "29222 368943 881620\n" 20 | ] 21 | }, 22 | { 23 | "data": { 24 | "text/plain": [ 25 | "['0', '00', '000', '007', '00a', '00am', '00pm', '01', '02', '03']" 26 | ] 27 | }, 28 | "execution_count": 1, 29 | "metadata": {}, 30 | "output_type": "execute_result" 31 | } 32 | ], 33 | "source": [ 34 | "import pandas as pd\n", 35 | "import json\n", 36 | "from sklearn.feature_extraction.text import CountVectorizer\n", 37 | "\n", 38 | "# 最初の 10,000 件のレビューを読み込む\n", 39 | "with open('data/yelp/yelp_academic_dataset_review.json') as f:\n", 40 | " js = []\n", 41 | " for i in range(10000):\n", 42 | " js.append(json.loads(f.readline()))\n", 43 | "review_df = pd.DataFrame(js)\n", 44 | "\n", 45 | "# scikit-learn の CountVectorizer を使ってユニグラム(BoW)、\n", 46 | "# バイグラム、トライグラムの特徴量変換器を作成する。\n", 47 | "# CountVectorizer はデフォルトでは1文字の単語を無視するが、\n", 48 | "# これは意味のない単語を除外するため実用的である。\n", 49 | "# ただしここでは全ての単語を含むように設定している。\n", 50 | "bow_converter = CountVectorizer(token_pattern='(?u)\\\\b\\\\w+\\\\b')\n", 51 | "bigram_converter = CountVectorizer(ngram_range=(2,2), token_pattern='(?u)\\\\b\\\\w+\\\\b')\n", 52 | "trigram_converter = CountVectorizer(ngram_range=(3,3), token_pattern='(?u)\\\\b\\\\w+\\\\b')\n", 53 | "\n", 54 | "# 変換器を適用し、語彙数を確認する\n", 55 | "bow_converter.fit(review_df['text'])\n", 56 | "words = bow_converter.get_feature_names()\n", 57 | "\n", 58 | "bigram_converter.fit(review_df['text'])\n", 59 | "bigrams = bigram_converter.get_feature_names()\n", 60 | "\n", 61 | "trigram_converter.fit(review_df['text'])\n", 62 | "trigrams = trigram_converter.get_feature_names()\n", 63 | "\n", 64 | "print (len(words), len(bigrams), len(trigrams))\n", 65 | "\n", 66 | "# n-グラムを確認する\n", 67 | "words[:10]" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [ 
75 | { 76 | "data": { 77 | "text/plain": [ 78 | "['zuzu was',\n", 79 | " 'zuzus room',\n", 80 | " 'zweigel wine',\n", 81 | " 'zwiebel kräuter',\n", 82 | " 'zy world',\n", 83 | " 'zzed in',\n", 84 | " 'éclairs napoleons',\n", 85 | " 'école lenôtre',\n", 86 | " 'ém all',\n", 87 | " 'òc châm']" 88 | ] 89 | }, 90 | "execution_count": 3, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "bigrams[-10:]" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 4, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "['0 0 eye',\n", 108 | " '0 20 less',\n", 109 | " '0 39 oz',\n", 110 | " '0 39 pizza',\n", 111 | " '0 5 i',\n", 112 | " '0 50 to',\n", 113 | " '0 6 can',\n", 114 | " '0 75 oysters',\n", 115 | " '0 75 that',\n", 116 | " '0 75 to']" 117 | ] 118 | }, 119 | "execution_count": 4, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "trigrams[:10]" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## 例3-2. 
品詞タグ付けとチャンク化" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 2, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "['General', 'PROPN', 'NNP']\n", 145 | "['Manager', 'PROPN', 'NNP']\n", 146 | "['Scott', 'PROPN', 'NNP']\n", 147 | "['Petello', 'PROPN', 'NNP']\n", 148 | "['is', 'VERB', 'VBZ']\n", 149 | "['a', 'DET', 'DT']\n", 150 | "['good', 'ADJ', 'JJ']\n", 151 | "['egg', 'NOUN', 'NN']\n", 152 | "['!', 'PUNCT', '.']\n", 153 | "['!', 'PUNCT', '.']\n", 154 | "['!', 'PUNCT', '.']\n", 155 | "['Not', 'ADV', 'RB']\n", 156 | "['to', 'PART', 'TO']\n", 157 | "['go', 'VERB', 'VB']\n", 158 | "['into', 'ADP', 'IN']\n", 159 | "['detail', 'NOUN', 'NN']\n", 160 | "[',', 'PUNCT', ',']\n", 161 | "['but', 'CCONJ', 'CC']\n", 162 | "['let', 'VERB', 'VB']\n", 163 | "['me', 'PRON', 'PRP']\n", 164 | "['assure', 'VERB', 'VB']\n", 165 | "['you', 'PRON', 'PRP']\n", 166 | "['if', 'ADP', 'IN']\n", 167 | "['you', 'PRON', 'PRP']\n", 168 | "['have', 'VERB', 'VBP']\n", 169 | "['any', 'DET', 'DT']\n", 170 | "['issues', 'NOUN', 'NNS']\n", 171 | "['(', 'PUNCT', '-LRB-']\n", 172 | "['albeit', 'ADP', 'IN']\n", 173 | "['rare', 'ADJ', 'JJ']\n", 174 | "[')', 'PUNCT', '-RRB-']\n", 175 | "['speak', 'VERB', 'VBP']\n", 176 | "['with', 'ADP', 'IN']\n", 177 | "['Scott', 'PROPN', 'NNP']\n", 178 | "['and', 'CCONJ', 'CC']\n", 179 | "['treat', 'VERB', 'VB']\n", 180 | "['the', 'DET', 'DT']\n", 181 | "['guy', 'NOUN', 'NN']\n", 182 | "['with', 'ADP', 'IN']\n", 183 | "['some', 'DET', 'DT']\n", 184 | "['respect', 'NOUN', 'NN']\n", 185 | "['as', 'ADP', 'IN']\n", 186 | "['you', 'PRON', 'PRP']\n", 187 | "['state', 'VERB', 'VBP']\n", 188 | "['your', 'ADJ', 'PRP$']\n", 189 | "['case', 'NOUN', 'NN']\n", 190 | "['and', 'CCONJ', 'CC']\n", 191 | "['I', 'PRON', 'PRP']\n", 192 | "[\"'d\", 'VERB', 'MD']\n", 193 | "['be', 'VERB', 'VB']\n", 194 | "['surprised', 'ADJ', 'JJ']\n", 195 | "['if', 'ADP', 'IN']\n", 196 | "['you', 'PRON', 
'PRP']\n", 197 | "['do', 'VERB', 'VBP']\n", 198 | "[\"n't\", 'ADV', 'RB']\n", 199 | "['walk', 'VERB', 'VB']\n", 200 | "['out', 'ADV', 'RB']\n", 201 | "['totally', 'ADV', 'RB']\n", 202 | "['satisfied', 'ADJ', 'JJ']\n", 203 | "['as', 'ADP', 'IN']\n", 204 | "['I', 'PRON', 'PRP']\n", 205 | "['just', 'ADV', 'RB']\n", 206 | "['did', 'VERB', 'VBD']\n", 207 | "['.', 'PUNCT', '.']\n", 208 | "['Like', 'INTJ', 'UH']\n", 209 | "['I', 'PRON', 'PRP']\n", 210 | "['always', 'ADV', 'RB']\n", 211 | "['say', 'VERB', 'VBP']\n", 212 | "['.....', 'PUNCT', 'NFP']\n", 213 | "['\"', 'PUNCT', '``']\n", 214 | "['Mistakes', 'NOUN', 'NNS']\n", 215 | "['are', 'VERB', 'VBP']\n", 216 | "['inevitable', 'ADJ', 'JJ']\n", 217 | "[',', 'PUNCT', ',']\n", 218 | "['it', 'PRON', 'PRP']\n", 219 | "[\"'s\", 'VERB', 'VBZ']\n", 220 | "['how', 'ADV', 'WRB']\n", 221 | "['we', 'PRON', 'PRP']\n", 222 | "['recover', 'VERB', 'VBP']\n", 223 | "['from', 'ADP', 'IN']\n", 224 | "['them', 'PRON', 'PRP']\n", 225 | "['that', 'ADJ', 'WDT']\n", 226 | "['is', 'VERB', 'VBZ']\n", 227 | "['important', 'ADJ', 'JJ']\n", 228 | "['\"', 'PUNCT', \"''\"]\n", 229 | "['!', 'PUNCT', '.']\n", 230 | "['!', 'PUNCT', '.']\n", 231 | "['!', 'PUNCT', '.']\n", 232 | "['\\n\\n', 'SPACE', '_SP']\n", 233 | "['Thanks', 'NOUN', 'NNS']\n", 234 | "['to', 'ADP', 'IN']\n", 235 | "['Scott', 'PROPN', 'NNP']\n", 236 | "['and', 'CCONJ', 'CC']\n", 237 | "['his', 'ADJ', 'PRP$']\n", 238 | "['awesome', 'ADJ', 'JJ']\n", 239 | "['staff', 'NOUN', 'NN']\n", 240 | "['.', 'PUNCT', '.']\n", 241 | "['You', 'PRON', 'PRP']\n", 242 | "[\"'ve\", 'VERB', 'VB']\n", 243 | "['got', 'VERB', 'VBN']\n", 244 | "['a', 'DET', 'DT']\n", 245 | "['customer', 'NOUN', 'NN']\n", 246 | "['for', 'ADP', 'IN']\n", 247 | "['life', 'NOUN', 'NN']\n", 248 | "['!', 'PUNCT', '.']\n", 249 | "['!', 'PUNCT', '.']\n", 250 | "['..........', 'PUNCT', 'NFP']\n", 251 | "[':', 'PUNCT', ':']\n", 252 | "['^', 'PUNCT', 'NFP']\n", 253 | "[')', 'PUNCT', '-RRB-']\n", 254 | "[General Manager Scott Petello, a good 
egg, detail, me, you, you, any issues, Scott, the guy, some respect, you, your case, I, you, I, I, Mistakes, it, we, them, Thanks, Scott, his awesome staff, You, a customer, life]\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "import pandas as pd\n", 260 | "import json\n", 261 | "\n", 262 | "# 最初の10レビューを読み込む\n", 263 | "with open('data/yelp/yelp_academic_dataset_review.json') as f:\n", 264 | " js = []\n", 265 | " for i in range(10):\n", 266 | " js.append(json.loads(f.readline()))\n", 267 | "review_df = pd.DataFrame(js)\n", 268 | "\n", 269 | "# まずは spaCy を使った方法\n", 270 | "import spacy\n", 271 | "# 言語モデル(英語)を読み込む\n", 272 | "nlp = spacy.load('en')\n", 273 | "\n", 274 | "# spaCy の言語モデルを使ってテキストから Pandas Series を作成する\n", 275 | "doc_df = review_df['text'].apply(nlp)\n", 276 | "\n", 277 | "# spaCy は粗い品詞タグを .pos_ で、細かい品詞タグを .tag_ で提供します\n", 278 | "for doc in doc_df[4]:\n", 279 | " print([doc.text, doc.pos_, doc.tag_])\n", 280 | "\n", 281 | "# spaCy は基本的な名詞句も .noun_chunks で提供します\n", 282 | "print([chunk for chunk in doc_df[4].noun_chunks])" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 7, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/plain": [ 293 | "[('General', 'NNP'),\n", 294 | " ('Manager', 'NNP'),\n", 295 | " ('Scott', 'NNP'),\n", 296 | " ('Petello', 'NNP'),\n", 297 | " ('is', 'VBZ'),\n", 298 | " ('a', 'DT'),\n", 299 | " ('good', 'JJ'),\n", 300 | " ('egg', 'NN'),\n", 301 | " ('Not', 'RB'),\n", 302 | " ('to', 'TO'),\n", 303 | " ('go', 'VB'),\n", 304 | " ('into', 'IN'),\n", 305 | " ('detail', 'NN'),\n", 306 | " ('but', 'CC'),\n", 307 | " ('let', 'VB'),\n", 308 | " ('me', 'PRP'),\n", 309 | " ('assure', 'VB'),\n", 310 | " ('you', 'PRP'),\n", 311 | " ('if', 'IN'),\n", 312 | " ('you', 'PRP'),\n", 313 | " ('have', 'VBP'),\n", 314 | " ('any', 'DT'),\n", 315 | " ('issues', 'NNS'),\n", 316 | " ('albeit', 'IN'),\n", 317 | " ('rare', 'NN'),\n", 318 | " ('speak', 'NN'),\n", 319 | " ('with', 'IN'),\n", 320 | "
('Scott', 'NNP'),\n", 321 | " ('and', 'CC'),\n", 322 | " ('treat', 'VB'),\n", 323 | " ('the', 'DT'),\n", 324 | " ('guy', 'NN'),\n", 325 | " ('with', 'IN'),\n", 326 | " ('some', 'DT'),\n", 327 | " ('respect', 'NN'),\n", 328 | " ('as', 'IN'),\n", 329 | " ('you', 'PRP'),\n", 330 | " ('state', 'NN'),\n", 331 | " ('your', 'PRP$'),\n", 332 | " ('case', 'NN'),\n", 333 | " ('and', 'CC'),\n", 334 | " ('I', 'PRP'),\n", 335 | " (\"'d\", 'MD'),\n", 336 | " ('be', 'VB'),\n", 337 | " ('surprised', 'VBN'),\n", 338 | " ('if', 'IN'),\n", 339 | " ('you', 'PRP'),\n", 340 | " ('do', 'VBP'),\n", 341 | " (\"n't\", 'RB'),\n", 342 | " ('walk', 'VB'),\n", 343 | " ('out', 'RP'),\n", 344 | " ('totally', 'RB'),\n", 345 | " ('satisfied', 'JJ'),\n", 346 | " ('as', 'IN'),\n", 347 | " ('I', 'PRP'),\n", 348 | " ('just', 'RB'),\n", 349 | " ('did', 'VBD'),\n", 350 | " ('Like', 'IN'),\n", 351 | " ('I', 'PRP'),\n", 352 | " ('always', 'RB'),\n", 353 | " ('say', 'VBP'),\n", 354 | " ('..', 'VBP'),\n", 355 | " ('Mistakes', 'NNS'),\n", 356 | " ('are', 'VBP'),\n", 357 | " ('inevitable', 'JJ'),\n", 358 | " ('it', 'PRP'),\n", 359 | " (\"'s\", 'VBZ'),\n", 360 | " ('how', 'WRB'),\n", 361 | " ('we', 'PRP'),\n", 362 | " ('recover', 'VBP'),\n", 363 | " ('from', 'IN'),\n", 364 | " ('them', 'PRP'),\n", 365 | " ('that', 'WDT'),\n", 366 | " ('is', 'VBZ'),\n", 367 | " ('important', 'JJ'),\n", 368 | " ('Thanks', 'NNS'),\n", 369 | " ('to', 'TO'),\n", 370 | " ('Scott', 'NNP'),\n", 371 | " ('and', 'CC'),\n", 372 | " ('his', 'PRP$'),\n", 373 | " ('awesome', 'JJ'),\n", 374 | " ('staff', 'NN'),\n", 375 | " ('You', 'PRP'),\n", 376 | " (\"'ve\", 'VBP'),\n", 377 | " ('got', 'VBN'),\n", 378 | " ('a', 'DT'),\n", 379 | " ('customer', 'NN'),\n", 380 | " ('for', 'IN'),\n", 381 | " ('life', 'NN'),\n", 382 | " ('^', 'NN')]" 383 | ] 384 | }, 385 | "execution_count": 7, 386 | "metadata": {}, 387 | "output_type": "execute_result" 388 | } 389 | ], 390 | "source": [ 391 | "# TextBlob ライブラリを使って同じことができる\n", 392 | "from textblob import 
TextBlob\n", 393 | "\n", 394 | "# TextBlob はデフォルトでは PatternTagger を使ってタグ付けを行う。\n", 395 | "# これは今回の例ではうまくいくが、文法の正しくない文章を含む場合は \n", 396 | "# NLTKTagger を使うことをおすすめする。\n", 397 | "blob_df = review_df['text'].apply(TextBlob)\n", 398 | "\n", 399 | "blob_df[4].tags" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 8, 405 | "metadata": {}, 406 | "outputs": [ 407 | { 408 | "name": "stdout", 409 | "output_type": "stream", 410 | "text": [ 411 | "['general manager', 'scott petello', 'good egg', 'scott', \"n't walk\", '... ..', 'mistakes', 'thanks', 'scott', 'awesome staff', '... ... ...']\n" 412 | ] 413 | } 414 | ], 415 | "source": [ 416 | "print([np for np in blob_df[4].noun_phrases])" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [] 425 | } 426 | ], 427 | "metadata": { 428 | "kernelspec": { 429 | "display_name": "Python 3", 430 | "language": "python", 431 | "name": "python3" 432 | }, 433 | "language_info": { 434 | "codemirror_mode": { 435 | "name": "ipython", 436 | "version": 3 437 | }, 438 | "file_extension": ".py", 439 | "mimetype": "text/x-python", 440 | "name": "python", 441 | "nbconvert_exporter": "python", 442 | "pygments_lexer": "ipython3", 443 | "version": "3.7.0" 444 | } 445 | }, 446 | "nbformat": 4, 447 | "nbformat_minor": 2 448 | } 449 | -------------------------------------------------------------------------------- /5-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 例5-1. 
One-Hotエンコーディングとダミーコーディングを利用した線形回帰モデリング" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/plain": [ 18 | "3333.3333333333335" 19 | ] 20 | }, 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "output_type": "execute_result" 24 | } 25 | ], 26 | "source": [ 27 | "import pandas as pd\n", 28 | "from sklearn import linear_model\n", 29 | "\n", 30 | "# 3つの都市におけるアパートの家賃のデータセットを設定\n", 31 | "df = pd.DataFrame({\n", 32 | " 'City': ['SF', 'SF', 'SF', 'NYC', 'NYC', 'NYC', 'Seattle', 'Seattle', 'Seattle'],\n", 33 | " 'Rent': [3999, 4000, 4001, 3499, 3500, 3501, 2499, 2500, 2501]\n", 34 | "})\n", 35 | "\n", 36 | "df['Rent'].mean()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/html": [ 47 | "
\n", 48 | "\n", 61 | "\n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | "
Rentcity_NYCcity_SFcity_Seattle
03999010
14000010
24001010
33499100
43500100
53501100
62499001
72500001
82501001
\n", 137 | "
" 138 | ], 139 | "text/plain": [ 140 | " Rent city_NYC city_SF city_Seattle\n", 141 | "0 3999 0 1 0\n", 142 | "1 4000 0 1 0\n", 143 | "2 4001 0 1 0\n", 144 | "3 3499 1 0 0\n", 145 | "4 3500 1 0 0\n", 146 | "5 3501 1 0 0\n", 147 | "6 2499 0 0 1\n", 148 | "7 2500 0 0 1\n", 149 | "8 2501 0 0 1" 150 | ] 151 | }, 152 | "execution_count": 2, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "# One-Hotエンコーディングをカテゴリ値であるcity列に適用\n", 159 | "# 特徴量をOne-Hotエンコーディングで生成した列に、ターゲット変数を家賃に指定し、線形回帰モデルを学習\n", 160 | "one_hot_df = pd.get_dummies(df, prefix=['city'])\n", 161 | "one_hot_df" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 6, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "array([ 166.66666667, 666.66666667, -833.33333333])" 173 | ] 174 | }, 175 | "execution_count": 6, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "model = linear_model.LinearRegression()\n", 182 | "model.fit(one_hot_df[['city_NYC', 'city_SF', 'city_Seattle']], one_hot_df['Rent'])\n", 183 | "model.coef_" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 7, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/plain": [ 194 | "3333.3333333333335" 195 | ] 196 | }, 197 | "execution_count": 7, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "model.intercept_" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 8, 209 | "metadata": { 210 | "scrolled": true 211 | }, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/html": [ 216 | "
\n", 217 | "\n", 230 | "\n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | "
Rentcity_SFcity_Seattle
0399910
1400010
2400110
3349900
4350000
5350100
6249901
7250001
8250101
\n", 296 | "
" 297 | ], 298 | "text/plain": [ 299 | " Rent city_SF city_Seattle\n", 300 | "0 3999 1 0\n", 301 | "1 4000 1 0\n", 302 | "2 4001 1 0\n", 303 | "3 3499 0 0\n", 304 | "4 3500 0 0\n", 305 | "5 3501 0 0\n", 306 | "6 2499 0 1\n", 307 | "7 2500 0 1\n", 308 | "8 2501 0 1" 309 | ] 310 | }, 311 | "execution_count": 8, 312 | "metadata": {}, 313 | "output_type": "execute_result" 314 | } 315 | ], 316 | "source": [ 317 | "# ダミーコーディングを利用して線形回帰モデルを学習\n", 318 | "dummy_df = pd.get_dummies(df, prefix=['city'], drop_first=True)\n", 319 | "dummy_df" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 9, 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "data": { 329 | "text/plain": [ 330 | "array([ 500., -1000.])" 331 | ] 332 | }, 333 | "execution_count": 9, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "model.fit(dummy_df[['city_SF', 'city_Seattle']], dummy_df['Rent'])\n", 340 | "model.coef_" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 10, 346 | "metadata": {}, 347 | "outputs": [ 348 | { 349 | "data": { 350 | "text/plain": [ 351 | "3500.0" 352 | ] 353 | }, 354 | "execution_count": 10, 355 | "metadata": {}, 356 | "output_type": "execute_result" 357 | } 358 | ], 359 | "source": [ 360 | "model.intercept_" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "## 例5-2. Effectコーディングを用いた線形回帰モデル" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 11, 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "data": { 377 | "text/html": [ 378 | "
\n", 379 | "\n", 392 | "\n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | "
Rentcity_SFcity_Seattle
039991.00.0
140001.00.0
240011.00.0
33499-1.0-1.0
43500-1.0-1.0
53501-1.0-1.0
624990.01.0
725000.01.0
825010.01.0
\n", 458 | "
" 459 | ], 460 | "text/plain": [ 461 | " Rent city_SF city_Seattle\n", 462 | "0 3999 1.0 0.0\n", 463 | "1 4000 1.0 0.0\n", 464 | "2 4001 1.0 0.0\n", 465 | "3 3499 -1.0 -1.0\n", 466 | "4 3500 -1.0 -1.0\n", 467 | "5 3501 -1.0 -1.0\n", 468 | "6 2499 0.0 1.0\n", 469 | "7 2500 0.0 1.0\n", 470 | "8 2501 0.0 1.0" 471 | ] 472 | }, 473 | "execution_count": 11, 474 | "metadata": {}, 475 | "output_type": "execute_result" 476 | } 477 | ], 478 | "source": [ 479 | "effect_df = dummy_df.copy()\n", 480 | "effect_df.loc[3:5, ['city_SF', 'city_Seattle']] = -1.0\n", 481 | "effect_df" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 12, 487 | "metadata": {}, 488 | "outputs": [ 489 | { 490 | "data": { 491 | "text/plain": [ 492 | "array([ 666.66666667, -833.33333333])" 493 | ] 494 | }, 495 | "execution_count": 12, 496 | "metadata": {}, 497 | "output_type": "execute_result" 498 | } 499 | ], 500 | "source": [ 501 | "model.fit(effect_df[['city_SF', 'city_Seattle']], effect_df['Rent'])\n", 502 | "model.coef_" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 13, 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "data": { 512 | "text/plain": [ 513 | "3333.3333333333335" 514 | ] 515 | }, 516 | "execution_count": 13, 517 | "metadata": {}, 518 | "output_type": "execute_result" 519 | } 520 | ], 521 | "source": [ 522 | "model.intercept_" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [] 531 | } 532 | ], 533 | "metadata": { 534 | "kernelspec": { 535 | "display_name": "Python 3", 536 | "language": "python", 537 | "name": "python3" 538 | }, 539 | "language_info": { 540 | "codemirror_mode": { 541 | "name": "ipython", 542 | "version": 3 543 | }, 544 | "file_extension": ".py", 545 | "mimetype": "text/x-python", 546 | "name": "python", 547 | "nbconvert_exporter": "python", 548 | "pygments_lexer": "ipython3", 549 | "version": "3.7.0" 550 | } 551 | }, 
552 | "nbformat": 4, 553 | "nbformat_minor": 2 554 | } 555 | -------------------------------------------------------------------------------- /explain_loc_iloc_get_loc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "
\n", 12 | "\n", 25 | "\n", 26 | " \n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | "
bab
10123
11456
11789
\n", 55 | "
" 56 | ], 57 | "text/plain": [ 58 | " b a b\n", 59 | "10 1 2 3\n", 60 | "11 4 5 6\n", 61 | "11 7 8 9" 62 | ] 63 | }, 64 | "execution_count": 2, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "import pandas as pd\n", 71 | "\n", 72 | "# DataFrame基礎知識\n", 73 | "# pandasにはIndexクラスがある\n", 74 | "# pandas.DataFrameには、indexとcolumnsというプロパティがあり、\n", 75 | "# indexとcolumnsプロパティはIndexクラスが設定される\n", 76 | "# pandas.Index https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Index.html\n", 77 | "\n", 78 | "df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=[10, 11, 11],columns=['b', 'a', 'b'])\n", 79 | "df" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 21, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "b 1\n", 91 | "a 2\n", 92 | "b 3\n", 93 | "Name: 10, dtype: int64" 94 | ] 95 | }, 96 | "execution_count": 21, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "# locは、1次元の場合、index値で指定(pd.DataFrameは、indexとcolumnsというプロパティを持っている)\n", 103 | "# 指定する値が1つで、該当件数が1件の場合pd.Seriesで返される\n", 104 | "df.loc[10]" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "data": { 114 | "text/html": [ 115 | "
\n", 116 | "\n", 129 | "\n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | "
bab
10123
\n", 147 | "
" 148 | ], 149 | "text/plain": [ 150 | " b a b\n", 151 | "10 1 2 3" 152 | ] 153 | }, 154 | "execution_count": 4, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "# locは、1次元の場合、index値で指定\n", 161 | "# 指定する値がリストの場合は、該当件数が1件でもpd.DataFrameで返される\n", 162 | "df.loc[[10]]" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 5, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "data": { 172 | "text/html": [ 173 | "
\n", 174 | "\n", 187 | "\n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | "
bab
11456
11789
\n", 211 | "
" 212 | ], 213 | "text/plain": [ 214 | " b a b\n", 215 | "11 4 5 6\n", 216 | "11 7 8 9" 217 | ] 218 | }, 219 | "execution_count": 5, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "# locは、1次元の場合、index値で指定\n", 226 | "# 指定する値が1つでも、該当件数が2件以上の場合pd.DataFrameで返される\n", 227 | "df.loc[11]" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 6, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "10 2\n", 239 | "11 5\n", 240 | "11 8\n", 241 | "Name: a, dtype: int64" 242 | ] 243 | }, 244 | "execution_count": 6, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "# locは、2次元の場合、1次元目がindex値、2次元目がcolumns値で指定\n", 251 | "# 該当件数が1件の場合は、pd.Seriesになる\n", 252 | "df.loc[:, 'a']" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 7, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/html": [ 263 | "
\n", 264 | "\n", 277 | "\n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | "
a
102
115
118
\n", 299 | "
" 300 | ], 301 | "text/plain": [ 302 | " a\n", 303 | "10 2\n", 304 | "11 5\n", 305 | "11 8" 306 | ] 307 | }, 308 | "execution_count": 7, 309 | "metadata": {}, 310 | "output_type": "execute_result" 311 | } 312 | ], 313 | "source": [ 314 | "# locは、2次元の場合、1次元目がindex値、2次元目がcolumns値で指定\n", 315 | "# リストで指定した場合は、該当件数が1件でもpd.DataFrameになる\n", 316 | "df.loc[:, ['a']]" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 8, 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "data": { 326 | "text/html": [ 327 | "
\n", 328 | "\n", 341 | "\n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | "
bb
1013
1146
1179
\n", 367 | "
" 368 | ], 369 | "text/plain": [ 370 | " b b\n", 371 | "10 1 3\n", 372 | "11 4 6\n", 373 | "11 7 9" 374 | ] 375 | }, 376 | "execution_count": 8, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "# locは、2次元の場合、1次元目がindex値、2次元目がcolumns値で指定\n", 383 | "# 指定する値が1つでも、該当件数が2件以上の場合pd.DataFrameで返される\n", 384 | "df.loc[:, 'b']" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 17, 390 | "metadata": {}, 391 | "outputs": [ 392 | { 393 | "data": { 394 | "text/plain": [ 395 | "2" 396 | ] 397 | }, 398 | "execution_count": 17, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "# locは、2次元の場合、1次元目がindex値、2次元目がcolumns値で指定\n", 405 | "# 返り値が1つだと値の型に応じた返り値になる\n", 406 | "df.loc[10, 'a']" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 18, 412 | "metadata": {}, 413 | "outputs": [ 414 | { 415 | "data": { 416 | "text/plain": [ 417 | "numpy.int64" 418 | ] 419 | }, 420 | "execution_count": 18, 421 | "metadata": {}, 422 | "output_type": "execute_result" 423 | } 424 | ], 425 | "source": [ 426 | "type(df.loc[10, 'a'])" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 58, 432 | "metadata": {}, 433 | "outputs": [ 434 | { 435 | "data": { 436 | "text/plain": [ 437 | "b 1\n", 438 | "a 2\n", 439 | "b 3\n", 440 | "Name: 10, dtype: int64" 441 | ] 442 | }, 443 | "execution_count": 58, 444 | "metadata": {}, 445 | "output_type": "execute_result" 446 | } 447 | ], 448 | "source": [ 449 | "# ilocは、1次元の場合、行番号で指定\n", 450 | "df.iloc[0]" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 59, 456 | "metadata": {}, 457 | "outputs": [ 458 | { 459 | "data": { 460 | "text/html": [ 461 | "
\n", 462 | "\n", 475 | "\n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | "
bab
10123
\n", 493 | "
" 494 | ], 495 | "text/plain": [ 496 | " b a b\n", 497 | "10 1 2 3" 498 | ] 499 | }, 500 | "execution_count": 59, 501 | "metadata": {}, 502 | "output_type": "execute_result" 503 | } 504 | ], 505 | "source": [ 506 | "df.iloc[[0]]" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 60, 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "data": { 516 | "text/plain": [ 517 | "10 1\n", 518 | "11 4\n", 519 | "11 7\n", 520 | "Name: b, dtype: int64" 521 | ] 522 | }, 523 | "execution_count": 60, 524 | "metadata": {}, 525 | "output_type": "execute_result" 526 | } 527 | ], 528 | "source": [ 529 | "# ilocは、2次元の場合、行番号と列番号で指定\n", 530 | "df.iloc[:, 0]" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 61, 536 | "metadata": {}, 537 | "outputs": [ 538 | { 539 | "data": { 540 | "text/html": [ 541 | "
\n", 542 | "\n", 555 | "\n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | "
b
101
114
117
\n", 577 | "
" 578 | ], 579 | "text/plain": [ 580 | " b\n", 581 | "10 1\n", 582 | "11 4\n", 583 | "11 7" 584 | ] 585 | }, 586 | "execution_count": 61, 587 | "metadata": {}, 588 | "output_type": "execute_result" 589 | } 590 | ], 591 | "source": [ 592 | "df.iloc[:, [0]]" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 62, 598 | "metadata": {}, 599 | "outputs": [ 600 | { 601 | "data": { 602 | "text/plain": [ 603 | "0" 604 | ] 605 | }, 606 | "execution_count": 62, 607 | "metadata": {}, 608 | "output_type": "execute_result" 609 | } 610 | ], 611 | "source": [ 612 | "# get_loc関数は、pd.Indexクラスが持っている関数\n", 613 | "# Indexの値を検索して該当したインデックス番号を返す関数\n", 614 | "\n", 615 | "df.index.get_loc(10)" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 63, 621 | "metadata": {}, 622 | "outputs": [ 623 | { 624 | "data": { 625 | "text/plain": [ 626 | "slice(1, 3, None)" 627 | ] 628 | }, 629 | "execution_count": 63, 630 | "metadata": {}, 631 | "output_type": "execute_result" 632 | } 633 | ], 634 | "source": [ 635 | "# 連続だとsliceになる\n", 636 | "df.index.get_loc(11)" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 64, 642 | "metadata": {}, 643 | "outputs": [ 644 | { 645 | "data": { 646 | "text/plain": [ 647 | "1" 648 | ] 649 | }, 650 | "execution_count": 64, 651 | "metadata": {}, 652 | "output_type": "execute_result" 653 | } 654 | ], 655 | "source": [ 656 | "df.columns.get_loc('a')" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 65, 662 | "metadata": {}, 663 | "outputs": [ 664 | { 665 | "data": { 666 | "text/plain": [ 667 | "array([ True, False, True])" 668 | ] 669 | }, 670 | "execution_count": 65, 671 | "metadata": {}, 672 | "output_type": "execute_result" 673 | } 674 | ], 675 | "source": [ 676 | "# 非連続だとarrayになる\n", 677 | "df.columns.get_loc('b')" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": null, 683 | "metadata": {}, 684 | "outputs": [], 685 | "source": [] 686 | 
} 687 | ], 688 | "metadata": { 689 | "kernelspec": { 690 | "display_name": "Python 3", 691 | "language": "python", 692 | "name": "python3" 693 | }, 694 | "language_info": { 695 | "codemirror_mode": { 696 | "name": "ipython", 697 | "version": 3 698 | }, 699 | "file_extension": ".py", 700 | "mimetype": "text/x-python", 701 | "name": "python", 702 | "nbconvert_exporter": "python", 703 | "pygments_lexer": "ipython3", 704 | "version": "3.7.0" 705 | } 706 | }, 707 | "nbformat": 4, 708 | "nbformat_minor": 2 709 | } 710 | -------------------------------------------------------------------------------- /7-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 例7-1. k-meansの適用例を実行するコード" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/plain": [ 18 | "(-0.07086790846959401,\n", 19 | " 1.0713297683670353,\n", 20 | " -0.08072533667047069,\n", 21 | " 1.082584430361733)" 22 | ] 23 | }, 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "output_type": "execute_result" 27 | }, 28 | { 29 | "data": { 30 | "image/png": "\n", 31 | "text/plain": [ 32 | "
" 33 | ] 34 | }, 35 | "metadata": { 36 | "needs_background": "light" 37 | }, 38 | "output_type": "display_data" 39 | } 40 | ], 41 | "source": [ 42 | "%matplotlib inline\n", 43 | "import numpy as np\n", 44 | "from sklearn.cluster import KMeans\n", 45 | "from sklearn.datasets import make_blobs\n", 46 | "import matplotlib.pyplot as plt\n", 47 | "\n", 48 | "n_data = 1000\n", 49 | "seed = 1\n", 50 | "n_centers = 4\n", 51 | "\n", 52 | "# 4つの二変量正規分布に従うデータを生成し、k-meansを実行する\n", 53 | "blobs, blob_labels = make_blobs(n_samples=n_data, n_features=2, centers=n_centers, random_state=seed)\n", 54 | "clusters_blob = KMeans(n_clusters=n_centers, random_state=seed).fit_predict(blobs)\n", 55 | "\n", 56 | "# 2次元の一様分布に従うデータを生成し、k-meansを実行する\n", 57 | "uniform = np.random.rand(n_data, 2)\n", 58 | "clusters_uniform = KMeans(n_clusters=n_centers, random_state=seed).fit_predict(uniform)\n", 59 | "\n", 60 | "# 結果を可視化するためのMatplotlibのおまじない\n", 61 | "figure = plt.figure()\n", 62 | "plt.subplot(221)\n", 63 | "plt.scatter(blobs[:, 0], blobs[:, 1], c=blob_labels, cmap='gist_rainbow')\n", 64 | "plt.title('(a) Four randomly generated blobs', fontsize=14)\n", 65 | "plt.axis('off')\n", 66 | "\n", 67 | "plt.subplot(222)\n", 68 | "plt.scatter(blobs[:, 0], blobs[:, 1], c=clusters_blob, cmap='gist_rainbow')\n", 69 | "plt.title('(b) Clusters found via K-means', fontsize=14)\n", 70 | "plt.axis('off')\n", 71 | "\n", 72 | "plt.subplot(223)\n", 73 | "plt.scatter(uniform[:, 0], uniform[:, 1])\n", 74 | "plt.title('(c) 1000 randomly generated points', fontsize=14)\n", 75 | "plt.axis('off')\n", 76 | "\n", 77 | "plt.subplot(224)\n", 78 | "plt.scatter(uniform[:, 0], uniform[:, 1], c=clusters_uniform, cmap='gist_rainbow')\n", 79 | "plt.title('(d) Clusters found via K-means', fontsize=14)\n", 80 | "plt.axis('off')" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | 
"display_name": "Python 3", 94 | "language": "python", 95 | "name": "python3" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": ".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.7.0" 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 2 112 | } 113 | -------------------------------------------------------------------------------- /5-6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 例5-6. ビンカウンティングの例" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 10, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/plain": [ 18 | "(8208, 24)" 19 | ] 20 | }, 21 | "execution_count": 10, 22 | "metadata": {}, 23 | "output_type": "execute_result" 24 | } 25 | ], 26 | "source": [ 27 | "import pandas as pd\n", 28 | "# train_subsetを読み込み(サンプルコードの対象のデータ件数が、8208件です。)\n", 29 | "df = pd.read_csv('data/avazu/train_subset.csv')\n", 30 | "\n", 31 | "df.shape" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 11, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "906" 43 | ] 44 | }, 45 | "execution_count": 11, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "\n", 52 | "# device_idが何種類あるか計算\n", 53 | "len(df['device_id'].unique())" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 12, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/html": [ 64 | "
\n", 65 | "\n", 78 | "\n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 
| " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 
386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " 
\n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | "
clicksno_clickstotal_clicksN+N-log_N+
a99f214a1279587871570.1787060.8212940.217591
c357dbff212140.1428570.8571430.166667
25635c832021.0000000.000000inf
e62f12612130.6666670.3333332.000000
135f7d9a2021.0000000.000000inf
9af874782021.0000000.000000inf
77cf1a271011.0000000.000000inf
d62216cc1011.0000000.000000inf
fcc5c7c01011.0000000.000000inf
7181509e1011.0000000.000000inf
2a32a3ca1011.0000000.000000inf
2ad16ba31011.0000000.000000inf
938f494b1011.0000000.000000inf
24dbae831011.0000000.000000inf
ca9b95aa1011.0000000.000000inf
7c3698991011.0000000.000000inf
3bf8c26c1011.0000000.000000inf
453781281011.0000000.000000inf
023ca1f91011.0000000.000000inf
3b9ab74d1011.0000000.000000inf
9eb9a9721011.0000000.000000inf
59bcd1ae1011.0000000.000000inf
7f8c00b41011.0000000.000000inf
8d61d7eb1011.0000000.000000inf
b441c41f1011.0000000.000000inf
e317838f1011.0000000.000000inf
2e7d4b651011.0000000.000000inf
70d97ece1011.0000000.000000inf
9809e6c91011.0000000.000000inf
cb73ba551011.0000000.000000inf
.....................
0c3bbac00110.0000001.0000000.000000
34c9f9080110.0000001.0000000.000000
41a1ae5f0110.0000001.0000000.000000
5d03585e0110.0000001.0000000.000000
7d242dfd0110.0000001.0000000.000000
9ffa05630110.0000001.0000000.000000
2fd589900110.0000001.0000000.000000
01e47a3d0110.0000001.0000000.000000
bcd5195e0110.0000001.0000000.000000
a318236b0110.0000001.0000000.000000
1168ce020110.0000001.0000000.000000
83c34e930110.0000001.0000000.000000
c8c310320110.0000001.0000000.000000
004270bf0110.0000001.0000000.000000
e6d0facc0110.0000001.0000000.000000
844524dc0110.0000001.0000000.000000
c90a30a10110.0000001.0000000.000000
5015495b0110.0000001.0000000.000000
a12e8a450110.0000001.0000000.000000
7b5c2c3b0110.0000001.0000000.000000
4a6b5af30110.0000001.0000000.000000
2c8ae68c0110.0000001.0000000.000000
84565c920110.0000001.0000000.000000
2a6fe2a50110.0000001.0000000.000000
e0f700060110.0000001.0000000.000000
cef4c8cc0110.0000001.0000000.000000
7f4b1f1e0110.0000001.0000000.000000
7efe14f00110.0000001.0000000.000000
02b99e770110.0000001.0000000.000000
cbb50c1c0110.0000001.0000000.000000
\n", 642 | "

906 rows × 6 columns

\n", 643 | "
" 644 | ], 645 | "text/plain": [ 646 | " clicks no_clicks total_clicks N+ N- log_N+\n", 647 | "a99f214a 1279 5878 7157 0.178706 0.821294 0.217591\n", 648 | "c357dbff 2 12 14 0.142857 0.857143 0.166667\n", 649 | "25635c83 2 0 2 1.000000 0.000000 inf\n", 650 | "e62f1261 2 1 3 0.666667 0.333333 2.000000\n", 651 | "135f7d9a 2 0 2 1.000000 0.000000 inf\n", 652 | "9af87478 2 0 2 1.000000 0.000000 inf\n", 653 | "77cf1a27 1 0 1 1.000000 0.000000 inf\n", 654 | "d62216cc 1 0 1 1.000000 0.000000 inf\n", 655 | "fcc5c7c0 1 0 1 1.000000 0.000000 inf\n", 656 | "7181509e 1 0 1 1.000000 0.000000 inf\n", 657 | "2a32a3ca 1 0 1 1.000000 0.000000 inf\n", 658 | "2ad16ba3 1 0 1 1.000000 0.000000 inf\n", 659 | "938f494b 1 0 1 1.000000 0.000000 inf\n", 660 | "24dbae83 1 0 1 1.000000 0.000000 inf\n", 661 | "ca9b95aa 1 0 1 1.000000 0.000000 inf\n", 662 | "7c369899 1 0 1 1.000000 0.000000 inf\n", 663 | "3bf8c26c 1 0 1 1.000000 0.000000 inf\n", 664 | "45378128 1 0 1 1.000000 0.000000 inf\n", 665 | "023ca1f9 1 0 1 1.000000 0.000000 inf\n", 666 | "3b9ab74d 1 0 1 1.000000 0.000000 inf\n", 667 | "9eb9a972 1 0 1 1.000000 0.000000 inf\n", 668 | "59bcd1ae 1 0 1 1.000000 0.000000 inf\n", 669 | "7f8c00b4 1 0 1 1.000000 0.000000 inf\n", 670 | "8d61d7eb 1 0 1 1.000000 0.000000 inf\n", 671 | "b441c41f 1 0 1 1.000000 0.000000 inf\n", 672 | "e317838f 1 0 1 1.000000 0.000000 inf\n", 673 | "2e7d4b65 1 0 1 1.000000 0.000000 inf\n", 674 | "70d97ece 1 0 1 1.000000 0.000000 inf\n", 675 | "9809e6c9 1 0 1 1.000000 0.000000 inf\n", 676 | "cb73ba55 1 0 1 1.000000 0.000000 inf\n", 677 | "... ... ... ... ... ... 
...\n", 678 | "0c3bbac0 0 1 1 0.000000 1.000000 0.000000\n", 679 | "34c9f908 0 1 1 0.000000 1.000000 0.000000\n", 680 | "41a1ae5f 0 1 1 0.000000 1.000000 0.000000\n", 681 | "5d03585e 0 1 1 0.000000 1.000000 0.000000\n", 682 | "7d242dfd 0 1 1 0.000000 1.000000 0.000000\n", 683 | "9ffa0563 0 1 1 0.000000 1.000000 0.000000\n", 684 | "2fd58990 0 1 1 0.000000 1.000000 0.000000\n", 685 | "01e47a3d 0 1 1 0.000000 1.000000 0.000000\n", 686 | "bcd5195e 0 1 1 0.000000 1.000000 0.000000\n", 687 | "a318236b 0 1 1 0.000000 1.000000 0.000000\n", 688 | "1168ce02 0 1 1 0.000000 1.000000 0.000000\n", 689 | "83c34e93 0 1 1 0.000000 1.000000 0.000000\n", 690 | "c8c31032 0 1 1 0.000000 1.000000 0.000000\n", 691 | "004270bf 0 1 1 0.000000 1.000000 0.000000\n", 692 | "e6d0facc 0 1 1 0.000000 1.000000 0.000000\n", 693 | "844524dc 0 1 1 0.000000 1.000000 0.000000\n", 694 | "c90a30a1 0 1 1 0.000000 1.000000 0.000000\n", 695 | "5015495b 0 1 1 0.000000 1.000000 0.000000\n", 696 | "a12e8a45 0 1 1 0.000000 1.000000 0.000000\n", 697 | "7b5c2c3b 0 1 1 0.000000 1.000000 0.000000\n", 698 | "4a6b5af3 0 1 1 0.000000 1.000000 0.000000\n", 699 | "2c8ae68c 0 1 1 0.000000 1.000000 0.000000\n", 700 | "84565c92 0 1 1 0.000000 1.000000 0.000000\n", 701 | "2a6fe2a5 0 1 1 0.000000 1.000000 0.000000\n", 702 | "e0f70006 0 1 1 0.000000 1.000000 0.000000\n", 703 | "cef4c8cc 0 1 1 0.000000 1.000000 0.000000\n", 704 | "7f4b1f1e 0 1 1 0.000000 1.000000 0.000000\n", 705 | "7efe14f0 0 1 1 0.000000 1.000000 0.000000\n", 706 | "02b99e77 0 1 1 0.000000 1.000000 0.000000\n", 707 | "cbb50c1c 0 1 1 0.000000 1.000000 0.000000\n", 708 | "\n", 709 | "[906 rows x 6 columns]" 710 | ] 711 | }, 712 | "execution_count": 12, 713 | "metadata": {}, 714 | "output_type": "execute_result" 715 | } 716 | ], 717 | "source": [ 718 | "def click_counting(x, bin_column):\n", 719 | " clicks = pd.Series(x[x['click'] > 0][bin_column].value_counts(), name='clicks')\n", 720 | " no_clicks = pd.Series(x[x['click'] < 1][bin_column].value_counts(), 
name='no_clicks')\n", 721 | "\n", 722 | " counts = pd.DataFrame([clicks,no_clicks]).T.fillna('0')\n", 723 | " counts['total_clicks'] = counts['clicks'].astype('int64') + counts['no_clicks'].astype('int64')\n", 724 | " return counts\n", 725 | "\n", 726 | "def bin_counting(counts):\n", 727 | " counts['N+'] = counts['clicks'].astype('int64').divide(counts['total_clicks'].astype('int64'))\n", 728 | " counts['N-'] = counts['no_clicks'].astype('int64').divide(counts['total_clicks'].astype('int64'))\n", 729 | " counts['log_N+'] = counts['N+'].divide(counts['N-'])\n", 730 | " # Bin Countingのプロパティを返すだけの場合、ここでフィルタリングを実行\n", 731 | " bin_counts = counts.filter(items= ['N+', 'N-', 'log_N+'])\n", 732 | " return counts, bin_counts\n", 733 | "\n", 734 | "# device_idを対象としたビンカウンティング\n", 735 | "bin_column = 'device_id'\n", 736 | "device_clicks = click_counting(df.filter(items=[bin_column, 'click']), bin_column)\n", 737 | "device_all, device_bin_counts = bin_counting(device_clicks)\n", 738 | "\n", 739 | "device_all" 740 | ] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": 13, 745 | "metadata": {}, 746 | "outputs": [ 747 | { 748 | "data": { 749 | "text/html": [ 750 | "
\n", 751 | "\n", 764 | "\n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " 
\n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 
| " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | "
N+N-log_N+
a99f214a0.1787060.8212940.217591
c357dbff0.1428570.8571430.166667
25635c831.0000000.000000inf
e62f12610.6666670.3333332.000000
135f7d9a1.0000000.000000inf
9af874781.0000000.000000inf
77cf1a271.0000000.000000inf
d62216cc1.0000000.000000inf
fcc5c7c01.0000000.000000inf
7181509e1.0000000.000000inf
2a32a3ca1.0000000.000000inf
2ad16ba31.0000000.000000inf
938f494b1.0000000.000000inf
24dbae831.0000000.000000inf
ca9b95aa1.0000000.000000inf
7c3698991.0000000.000000inf
3bf8c26c1.0000000.000000inf
453781281.0000000.000000inf
023ca1f91.0000000.000000inf
3b9ab74d1.0000000.000000inf
9eb9a9721.0000000.000000inf
59bcd1ae1.0000000.000000inf
7f8c00b41.0000000.000000inf
8d61d7eb1.0000000.000000inf
b441c41f1.0000000.000000inf
e317838f1.0000000.000000inf
2e7d4b651.0000000.000000inf
70d97ece1.0000000.000000inf
9809e6c91.0000000.000000inf
cb73ba551.0000000.000000inf
............
0c3bbac00.0000001.0000000.000000
34c9f9080.0000001.0000000.000000
41a1ae5f0.0000001.0000000.000000
5d03585e0.0000001.0000000.000000
7d242dfd0.0000001.0000000.000000
9ffa05630.0000001.0000000.000000
2fd589900.0000001.0000000.000000
01e47a3d0.0000001.0000000.000000
bcd5195e0.0000001.0000000.000000
a318236b0.0000001.0000000.000000
1168ce020.0000001.0000000.000000
83c34e930.0000001.0000000.000000
c8c310320.0000001.0000000.000000
004270bf0.0000001.0000000.000000
e6d0facc0.0000001.0000000.000000
844524dc0.0000001.0000000.000000
c90a30a10.0000001.0000000.000000
5015495b0.0000001.0000000.000000
a12e8a450.0000001.0000000.000000
7b5c2c3b0.0000001.0000000.000000
4a6b5af30.0000001.0000000.000000
2c8ae68c0.0000001.0000000.000000
84565c920.0000001.0000000.000000
2a6fe2a50.0000001.0000000.000000
e0f700060.0000001.0000000.000000
cef4c8cc0.0000001.0000000.000000
7f4b1f1e0.0000001.0000000.000000
7efe14f00.0000001.0000000.000000
02b99e770.0000001.0000000.000000
cbb50c1c0.0000001.0000000.000000
\n", 1142 | "

906 rows × 3 columns

\n", 1143 | "
" 1144 | ], 1145 | "text/plain": [ 1146 | " N+ N- log_N+\n", 1147 | "a99f214a 0.178706 0.821294 0.217591\n", 1148 | "c357dbff 0.142857 0.857143 0.166667\n", 1149 | "25635c83 1.000000 0.000000 inf\n", 1150 | "e62f1261 0.666667 0.333333 2.000000\n", 1151 | "135f7d9a 1.000000 0.000000 inf\n", 1152 | "9af87478 1.000000 0.000000 inf\n", 1153 | "77cf1a27 1.000000 0.000000 inf\n", 1154 | "d62216cc 1.000000 0.000000 inf\n", 1155 | "fcc5c7c0 1.000000 0.000000 inf\n", 1156 | "7181509e 1.000000 0.000000 inf\n", 1157 | "2a32a3ca 1.000000 0.000000 inf\n", 1158 | "2ad16ba3 1.000000 0.000000 inf\n", 1159 | "938f494b 1.000000 0.000000 inf\n", 1160 | "24dbae83 1.000000 0.000000 inf\n", 1161 | "ca9b95aa 1.000000 0.000000 inf\n", 1162 | "7c369899 1.000000 0.000000 inf\n", 1163 | "3bf8c26c 1.000000 0.000000 inf\n", 1164 | "45378128 1.000000 0.000000 inf\n", 1165 | "023ca1f9 1.000000 0.000000 inf\n", 1166 | "3b9ab74d 1.000000 0.000000 inf\n", 1167 | "9eb9a972 1.000000 0.000000 inf\n", 1168 | "59bcd1ae 1.000000 0.000000 inf\n", 1169 | "7f8c00b4 1.000000 0.000000 inf\n", 1170 | "8d61d7eb 1.000000 0.000000 inf\n", 1171 | "b441c41f 1.000000 0.000000 inf\n", 1172 | "e317838f 1.000000 0.000000 inf\n", 1173 | "2e7d4b65 1.000000 0.000000 inf\n", 1174 | "70d97ece 1.000000 0.000000 inf\n", 1175 | "9809e6c9 1.000000 0.000000 inf\n", 1176 | "cb73ba55 1.000000 0.000000 inf\n", 1177 | "... ... ... 
...\n", 1178 | "0c3bbac0 0.000000 1.000000 0.000000\n", 1179 | "34c9f908 0.000000 1.000000 0.000000\n", 1180 | "41a1ae5f 0.000000 1.000000 0.000000\n", 1181 | "5d03585e 0.000000 1.000000 0.000000\n", 1182 | "7d242dfd 0.000000 1.000000 0.000000\n", 1183 | "9ffa0563 0.000000 1.000000 0.000000\n", 1184 | "2fd58990 0.000000 1.000000 0.000000\n", 1185 | "01e47a3d 0.000000 1.000000 0.000000\n", 1186 | "bcd5195e 0.000000 1.000000 0.000000\n", 1187 | "a318236b 0.000000 1.000000 0.000000\n", 1188 | "1168ce02 0.000000 1.000000 0.000000\n", 1189 | "83c34e93 0.000000 1.000000 0.000000\n", 1190 | "c8c31032 0.000000 1.000000 0.000000\n", 1191 | "004270bf 0.000000 1.000000 0.000000\n", 1192 | "e6d0facc 0.000000 1.000000 0.000000\n", 1193 | "844524dc 0.000000 1.000000 0.000000\n", 1194 | "c90a30a1 0.000000 1.000000 0.000000\n", 1195 | "5015495b 0.000000 1.000000 0.000000\n", 1196 | "a12e8a45 0.000000 1.000000 0.000000\n", 1197 | "7b5c2c3b 0.000000 1.000000 0.000000\n", 1198 | "4a6b5af3 0.000000 1.000000 0.000000\n", 1199 | "2c8ae68c 0.000000 1.000000 0.000000\n", 1200 | "84565c92 0.000000 1.000000 0.000000\n", 1201 | "2a6fe2a5 0.000000 1.000000 0.000000\n", 1202 | "e0f70006 0.000000 1.000000 0.000000\n", 1203 | "cef4c8cc 0.000000 1.000000 0.000000\n", 1204 | "7f4b1f1e 0.000000 1.000000 0.000000\n", 1205 | "7efe14f0 0.000000 1.000000 0.000000\n", 1206 | "02b99e77 0.000000 1.000000 0.000000\n", 1207 | "cbb50c1c 0.000000 1.000000 0.000000\n", 1208 | "\n", 1209 | "[906 rows x 3 columns]" 1210 | ] 1211 | }, 1212 | "execution_count": 13, 1213 | "metadata": {}, 1214 | "output_type": "execute_result" 1215 | } 1216 | ], 1217 | "source": [ 1218 | "device_bin_counts" 1219 | ] 1220 | }, 1221 | { 1222 | "cell_type": "code", 1223 | "execution_count": 14, 1224 | "metadata": {}, 1225 | "outputs": [ 1226 | { 1227 | "data": { 1228 | "text/plain": [ 1229 | "906" 1230 | ] 1231 | }, 1232 | "execution_count": 14, 1233 | "metadata": {}, 1234 | "output_type": "execute_result" 1235 | } 1236 | ], 1237 | 
"source": [ 1238 | "len(device_bin_counts)" 1239 | ] 1240 | }, 1241 | { 1242 | "cell_type": "code", 1243 | "execution_count": 15, 1244 | "metadata": {}, 1245 | "outputs": [ 1246 | { 1247 | "data": { 1248 | "text/html": [ 1249 | "
\n", 1250 | "\n", 1263 | "\n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | "
clicksno_clickstotal_clicksN+N-log_N+
a99f214a1279587871570.1787060.8212940.217591
c357dbff212140.1428570.8571430.166667
a167aa830770.0000001.0000000.000000
d2bbb6400660.0000001.0000000.000000
\n", 1314 | "
" 1315 | ], 1316 | "text/plain": [ 1317 | " clicks no_clicks total_clicks N+ N- log_N+\n", 1318 | "a99f214a 1279 5878 7157 0.178706 0.821294 0.217591\n", 1319 | "c357dbff 2 12 14 0.142857 0.857143 0.166667\n", 1320 | "a167aa83 0 7 7 0.000000 1.000000 0.000000\n", 1321 | "d2bbb640 0 6 6 0.000000 1.000000 0.000000" 1322 | ] 1323 | }, 1324 | "execution_count": 15, 1325 | "metadata": {}, 1326 | "output_type": "execute_result" 1327 | } 1328 | ], 1329 | "source": [ 1330 | "device_all.sort_values(by = 'total_clicks', ascending=False).head(4)" 1331 | ] 1332 | }, 1333 | { 1334 | "cell_type": "code", 1335 | "execution_count": null, 1336 | "metadata": {}, 1337 | "outputs": [], 1338 | "source": [] 1339 | } 1340 | ], 1341 | "metadata": { 1342 | "kernelspec": { 1343 | "display_name": "Python 3", 1344 | "language": "python", 1345 | "name": "python3" 1346 | }, 1347 | "language_info": { 1348 | "codemirror_mode": { 1349 | "name": "ipython", 1350 | "version": 3 1351 | }, 1352 | "file_extension": ".py", 1353 | "mimetype": "text/x-python", 1354 | "name": "python", 1355 | "nbconvert_exporter": "python", 1356 | "pygments_lexer": "ipython3", 1357 | "version": "3.7.0" 1358 | } 1359 | }, 1360 | "nbformat": 4, 1361 | "nbformat_minor": 2 1362 | } 1363 | -------------------------------------------------------------------------------- /2-2-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 例2-2. Yelp データセット内の店舗に対するレビュー件数の可視化" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 7, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/html": [ 18 | "
\n", 19 | "\n", 32 | "\n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | "
business_idcategoriescityfull_addresslatitudelongitudenameneighborhoodsopenreview_countstarsstatetype
0rncjoVoEFUJGCUoC1JgnUA[Accountants, Professional Services, Tax Servi...Peoria8466 W Peoria Ave\\nSte 6\\nPeoria, AZ 8534533.581867-112.241596Peoria Income Tax Service[]True35.0AZbusiness
10FNFSzCFP_rGUoJx8W7tJg[Sporting Goods, Bikes, Shopping]Phoenix2149 W Wood Dr\\nPhoenix, AZ 8502933.604054-112.105933Bike Doctor[]True55.0AZbusiness
23f_lyB6vFK48ukH6ScvLHg[]Phoenix1134 N Central Ave\\nPhoenix, AZ 8500433.460526-112.073933Valley Permaculture Alliance[]True45.0AZbusiness
3usAsSV36QmUej8--yvN-dg[Food, Grocery]Phoenix845 W Southern Ave\\nPhoenix, AZ 8504133.392210-112.085377Food City[]True53.5AZbusiness
4PzOqRohWw7F7YEPBz6AubA[Food, Bagels, Delis, Restaurants]Glendale Az6520 W Happy Valley Rd\\nSte 101\\nGlendale Az, ...33.712797-112.200264Hot Bagels & Deli[]True143.5AZbusiness
\n", 134 | "
" 135 | ], 136 | "text/plain": [ 137 | " business_id categories \\\n", 138 | "0 rncjoVoEFUJGCUoC1JgnUA [Accountants, Professional Services, Tax Servi... \n", 139 | "1 0FNFSzCFP_rGUoJx8W7tJg [Sporting Goods, Bikes, Shopping] \n", 140 | "2 3f_lyB6vFK48ukH6ScvLHg [] \n", 141 | "3 usAsSV36QmUej8--yvN-dg [Food, Grocery] \n", 142 | "4 PzOqRohWw7F7YEPBz6AubA [Food, Bagels, Delis, Restaurants] \n", 143 | "\n", 144 | " city full_address latitude \\\n", 145 | "0 Peoria 8466 W Peoria Ave\\nSte 6\\nPeoria, AZ 85345 33.581867 \n", 146 | "1 Phoenix 2149 W Wood Dr\\nPhoenix, AZ 85029 33.604054 \n", 147 | "2 Phoenix 1134 N Central Ave\\nPhoenix, AZ 85004 33.460526 \n", 148 | "3 Phoenix 845 W Southern Ave\\nPhoenix, AZ 85041 33.392210 \n", 149 | "4 Glendale Az 6520 W Happy Valley Rd\\nSte 101\\nGlendale Az, ... 33.712797 \n", 150 | "\n", 151 | " longitude name neighborhoods open review_count \\\n", 152 | "0 -112.241596 Peoria Income Tax Service [] True 3 \n", 153 | "1 -112.105933 Bike Doctor [] True 5 \n", 154 | "2 -112.073933 Valley Permaculture Alliance [] True 4 \n", 155 | "3 -112.085377 Food City [] True 5 \n", 156 | "4 -112.200264 Hot Bagels & Deli [] True 14 \n", 157 | "\n", 158 | " stars state type \n", 159 | "0 5.0 AZ business \n", 160 | "1 5.0 AZ business \n", 161 | "2 5.0 AZ business \n", 162 | "3 3.5 AZ business \n", 163 | "4 3.5 AZ business " 164 | ] 165 | }, 166 | "execution_count": 7, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "import pandas as pd\n", 173 | "import json\n", 174 | "\n", 175 | "# 店舗についてのデータを読み込む\n", 176 | "with open('data/yelp/yelp_academic_dataset_business.json') as biz_file:\n", 177 | " biz_df = pd.DataFrame([json.loads(x) for x in biz_file.readlines()])\n", 178 | "\n", 179 | "biz_df.head()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 8, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "Text(0, 0.5, 'Occurrence')" 191 
| ] 192 | }, 193 | "execution_count": 8, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | }, 197 | { 198 | "data": { 199 | "image/png": "\n", 200 | "text/plain": [ 201 | "
" 202 | ] 203 | }, 204 | "metadata": {}, 205 | "output_type": "display_data" 206 | } 207 | ], 208 | "source": [ 209 | "%matplotlib inline\n", 210 | "import matplotlib.pyplot as plt\n", 211 | "import seaborn as sns\n", 212 | "\n", 213 | "# レビュー件数のヒストグラムを描画\n", 214 | "sns.set_style('whitegrid')\n", 215 | "fig, ax = plt.subplots()\n", 216 | "biz_df['review_count'].hist(ax=ax, bins=100)\n", 217 | "ax.set_yscale('log')\n", 218 | "ax.tick_params(labelsize=14)\n", 219 | "ax.set_xlabel('Review Count', fontsize=14)\n", 220 | "ax.set_ylabel('Occurrence', fontsize=14)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "## 例 2-3. 固定幅によるカウントの離散化" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 9, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "array([37, 12, 72, 9, 75, 5, 79, 64, 16, 1, 76, 71, 6, 25, 50, 20, 18,\n", 239 | " 84, 11, 28])" 240 | ] 241 | }, 242 | "execution_count": 9, 243 | "metadata": {}, 244 | "output_type": "execute_result" 245 | } 246 | ], 247 | "source": [ 248 | "import numpy as np\n", 249 | "\n", 250 | "np.random.seed(seed=1)\n", 251 | "\n", 252 | "# 0から99までの整数を一様分布からランダムに20個生成する\n", 253 | "small_counts = np.random.randint(0, 100, 20)\n", 254 | "\n", 255 | "small_counts" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 10, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "array([3, 1, 7, 0, 7, 0, 7, 6, 1, 0, 7, 7, 0, 2, 5, 2, 1, 8, 1, 2])" 267 | ] 268 | }, 269 | "execution_count": 10, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "# 除算により 0-9 までの階級を割り当てる\n", 276 | "np.floor_divide(small_counts, 10)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 11, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "data": { 286 | "text/plain": [ 287 | "array([2., 3., 4., 1., 0., 2., 2., 
3., 3., 4., 4., 1., 1., 3., 2., 2., 4.])" 288 | ] 289 | }, 290 | "execution_count": 11, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "# 複数の桁にまたがるカウントデータの配列\n", 297 | "large_counts = [296, 8286, 64011, 80, 3, 725, 867, 2215, 7689, 11495, 91897, \n", 298 | " 44, 28, 7971, 926, 122, 22222]\n", 299 | "# 対数変換により指数幅の階級を割り当てる\n", 300 | "np.floor(np.log10(large_counts))" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "## 例 2-4. Yelp ビジネスレビュー件数の十分位数を計算する" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 12, 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "data": { 317 | "text/plain": [ 318 | "0.1 3.0\n", 319 | "0.2 3.0\n", 320 | "0.3 4.0\n", 321 | "0.4 5.0\n", 322 | "0.5 6.0\n", 323 | "0.6 8.0\n", 324 | "0.7 12.0\n", 325 | "0.8 23.0\n", 326 | "0.9 50.0\n", 327 | "Name: review_count, dtype: float64" 328 | ] 329 | }, 330 | "execution_count": 12, 331 | "metadata": {}, 332 | "output_type": "execute_result" 333 | } 334 | ], 335 | "source": [ 336 | "deciles = biz_df['review_count'].quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])\n", 337 | "deciles" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 13, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "text/plain": [ 348 | "Text(0, 0.5, 'Occurrence')" 349 | ] 350 | }, 351 | "execution_count": 13, 352 | "metadata": {}, 353 | "output_type": "execute_result" 354 | }, 355 | { 356 | "data": { 357 | "image/png": "\n", 358 | "text/plain": [ 359 | "
" 360 | ] 361 | }, 362 | "metadata": {}, 363 | "output_type": "display_data" 364 | } 365 | ], 366 | "source": [ 367 | "# ヒストグラムに十分位数を上書きする\n", 368 | "sns.set_style('whitegrid')\n", 369 | "fig, ax = plt.subplots()\n", 370 | "biz_df['review_count'].hist(ax=ax, bins=100)\n", 371 | "for pos in deciles:\n", 372 | " handle = plt.axvline(pos, color='r')\n", 373 | "ax.legend([handle], ['deciles'], fontsize=14)\n", 374 | "ax.set_yscale('log')\n", 375 | "ax.set_xscale('log')\n", 376 | "ax.tick_params(labelsize=14)\n", 377 | "ax.set_xlabel('Review Count', fontsize=14)\n", 378 | "ax.set_ylabel('Occurrence', fontsize=14)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "## 例 2-5. 分位数によるカウントの離散化" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 14, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "data": { 395 | "text/plain": [ 396 | "array([1, 2, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 2, 1, 0, 3])" 397 | ] 398 | }, 399 | "execution_count": 14, 400 | "metadata": {}, 401 | "output_type": "execute_result" 402 | } 403 | ], 404 | "source": [ 405 | "# 例2-3の large_count を引き続き使う\n", 406 | "import pandas as pd\n", 407 | "\n", 408 | "# 四分位数に変換\n", 409 | "pd.qcut(large_counts, 4, labels=False)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 15, 415 | "metadata": {}, 416 | "outputs": [ 417 | { 418 | "data": { 419 | "text/plain": [ 420 | "0.25 122.0\n", 421 | "0.50 926.0\n", 422 | "0.75 8286.0\n", 423 | "dtype: float64" 424 | ] 425 | }, 426 | "execution_count": 15, 427 | "metadata": {}, 428 | "output_type": "execute_result" 429 | } 430 | ], 431 | "source": [ 432 | "# 分位数の計算\n", 433 | "large_counts_series = pd.Series(large_counts)\n", 434 | "large_counts_series.quantile([0.25, 0.5, 0.75])" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "## 例 2-6. 
対数変換の前後でレビュー件数のヒストグラムを比較する" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 16, 447 | "metadata": {}, 448 | "outputs": [ 449 | { 450 | "data": { 451 | "text/plain": [ 452 | "Text(0, 0.5, 'Occurrence')" 453 | ] 454 | }, 455 | "execution_count": 16, 456 | "metadata": {}, 457 | "output_type": "execute_result" 458 | }, 459 | { 460 | "data": { 461 | "image/png": "\n", 462 | "text/plain": [ 463 | "
" 464 | ] 465 | }, 466 | "metadata": {}, 467 | "output_type": "display_data" 468 | } 469 | ], 470 | "source": [ 471 | "import numpy as np\n", 472 | "\n", 473 | "# 例2-2で読み込んだ Yelp データセットの\n", 474 | "# データフレーム biz_df を使用して、レビュー件数を対数変換する。\n", 475 | "# レビュー件数 0 を対数変換してマイナス無限大になるのを防ぐために\n", 476 | "# 対数変換の前に生データに 1 を加算していることに注意。\n", 477 | "biz_df['log_review_count'] = np.log10(biz_df['review_count'] + 1)\n", 478 | "\n", 479 | "fig, (ax1, ax2) = plt.subplots(2,1)\n", 480 | "biz_df['review_count'].hist(ax=ax1, bins=100)\n", 481 | "ax1.tick_params(labelsize=14)\n", 482 | "ax1.set_xlabel('review_count', fontsize=14)\n", 483 | "ax1.set_ylabel('Occurrence', fontsize=14)\n", 484 | "\n", 485 | "biz_df['log_review_count'].hist(ax=ax2, bins=100)\n", 486 | "ax2.tick_params(labelsize=14)\n", 487 | "ax2.set_xlabel('log10(review_count))', fontsize=14)\n", 488 | "ax2.set_ylabel('Occurrence', fontsize=14)" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [] 497 | } 498 | ], 499 | "metadata": { 500 | "kernelspec": { 501 | "display_name": "Python 3", 502 | "language": "python", 503 | "name": "python3" 504 | }, 505 | "language_info": { 506 | "codemirror_mode": { 507 | "name": "ipython", 508 | "version": 3 509 | }, 510 | "file_extension": ".py", 511 | "mimetype": "text/x-python", 512 | "name": "python", 513 | "nbconvert_exporter": "python", 514 | "pygments_lexer": "ipython3", 515 | "version": "3.7.0" 516 | } 517 | }, 518 | "nbformat": 4, 519 | "nbformat_minor": 2 520 | } 521 | --------------------------------------------------------------------------------