├── 5-5.ipynb
├── 2-2-1.ipynb
├── check_version.ipynb
├── 3.ipynb
├── 5-1.ipynb
├── explain_loc_iloc_get_loc.ipynb
├── 7-1.ipynb
├── 5-6.ipynb
└── 2-2-2.ipynb
/5-5.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 例5-5. 特徴量ハッシング(別名「ハッシングトリック」)"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd\n",
17 | "import json\n",
18 | "\n",
19 | "# Load the first 10,000 reviews; the file holds one JSON object per line.\n",
20 | "# Decode explicitly as UTF-8 (independent of the platform default) and\n",
21 | "# stop early instead of failing if the file has fewer lines.\n",
22 | "with open('data/yelp/yelp_academic_dataset_review.json', encoding='utf-8') as f:\n",
23 | "    js = [json.loads(line) for _, line in zip(range(10000), f)]\n",
24 | "\n",
25 | "review_df = pd.DataFrame(js)\n",
26 | "# m = number of distinct business_id values\n",
27 | "m = len(review_df['business_id'].unique())\n",
28 | "\n",
29 | "m"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 2,
35 | "metadata": {},
36 | "outputs": [
37 | {
38 | "data": {
39 | "text/plain": [
40 | "['9yKzy9PApeiPPOUJEtnvkg',\n",
41 | " 'ZRJwVLyzEJq1VAihDhYiow',\n",
42 | " '6oRAC4uyJCsJl1X0WZpVSA',\n",
43 | " '_1QQZuf4zZOyFCvXc0o6Vg',\n",
44 | " '6ozycU1RpktNG2-1BroVtw']"
45 | ]
46 | },
47 | "execution_count": 2,
48 | "metadata": {},
49 | "output_type": "execute_result"
50 | }
51 | ],
52 | "source": [
53 | "from sklearn.feature_extraction import FeatureHasher\n",
54 | "h = FeatureHasher(n_features=m, input_type='string')\n",
55 | "# input_type='string' expects each sample to be an iterable of strings: wrap every id in a one-element list so the whole id is hashed as a single token, not character by character\n",
56 | "f = h.transform([[business_id] for business_id in review_df['business_id']])\n",
57 | "# The hashed features are hard to interpret; for reference, show the first five original business ids\n",
58 | "review_df['business_id'].unique().tolist()[0:5]"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 7,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "data": {
68 | "text/plain": [
69 | "array([[0., 0., 0., ..., 0., 0., 0.],\n",
70 | " [0., 0., 0., ..., 0., 0., 0.],\n",
71 | " [0., 0., 0., ..., 0., 0., 0.],\n",
72 | " ...,\n",
73 | " [0., 0., 0., ..., 0., 0., 0.],\n",
74 | " [0., 0., 0., ..., 0., 0., 0.],\n",
75 | " [0., 0., 0., ..., 0., 0., 0.]])"
76 | ]
77 | },
78 | "execution_count": 7,
79 | "metadata": {},
80 | "output_type": "execute_result"
81 | }
82 | ],
83 | "source": [
84 | "f.toarray()"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 8,
90 | "metadata": {},
91 | "outputs": [
92 | {
93 | "name": "stdout",
94 | "output_type": "stream",
95 | "text": [
96 | "Our pandas Series, in bytes: 790104\n",
97 | "Our hashed numpy array, in bytes: 56\n"
98 | ]
99 | }
100 | ],
101 | "source": [
102 | "# Compare storage sizes. sys.getsizeof(f) would only measure the sparse-matrix wrapper object, not the values it stores, so sum the sparse matrix's data/indices/indptr buffers instead.\n",
103 | "from sys import getsizeof\n",
104 | "print('Our pandas Series, in bytes: ', getsizeof(review_df['business_id']))\n",
105 | "print('Our hashed numpy array, in bytes: ', f.data.nbytes + f.indices.nbytes + f.indptr.nbytes)"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": []
114 | }
115 | ],
116 | "metadata": {
117 | "kernelspec": {
118 | "display_name": "Python 3",
119 | "language": "python",
120 | "name": "python3"
121 | },
122 | "language_info": {
123 | "codemirror_mode": {
124 | "name": "ipython",
125 | "version": 3
126 | },
127 | "file_extension": ".py",
128 | "mimetype": "text/x-python",
129 | "name": "python",
130 | "nbconvert_exporter": "python",
131 | "pygments_lexer": "ipython3",
132 | "version": "3.7.0"
133 | }
134 | },
135 | "nbformat": 4,
136 | "nbformat_minor": 2
137 | }
138 |
--------------------------------------------------------------------------------
/2-2-1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 例2-1. The Echo Nest データセットの再生回数の二値化"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 4,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "data": {
17 | "text/html": [
18 | "
\n",
19 | "\n",
32 | "
\n",
33 | " \n",
34 | " \n",
35 | " | \n",
36 | " 0 | \n",
37 | " 1 | \n",
38 | " 2 | \n",
39 | "
\n",
40 | " \n",
41 | " \n",
42 | " \n",
43 | " | 0 | \n",
44 | " b80344d063b5ccb3212f76538f3d9e43d87dca9e | \n",
45 | " SOAKIMP12A8C130995 | \n",
46 | " 1 | \n",
47 | "
\n",
48 | " \n",
49 | " | 1 | \n",
50 | " b80344d063b5ccb3212f76538f3d9e43d87dca9e | \n",
51 | " SOAPDEY12A81C210A9 | \n",
52 | " 1 | \n",
53 | "
\n",
54 | " \n",
55 | " | 2 | \n",
56 | " b80344d063b5ccb3212f76538f3d9e43d87dca9e | \n",
57 | " SOBBMDR12A8C13253B | \n",
58 | " 1 | \n",
59 | "
\n",
60 | " \n",
61 | " | 3 | \n",
62 | " b80344d063b5ccb3212f76538f3d9e43d87dca9e | \n",
63 | " SOBFNSP12AF72A0E22 | \n",
64 | " 1 | \n",
65 | "
\n",
66 | " \n",
67 | " | 4 | \n",
68 | " b80344d063b5ccb3212f76538f3d9e43d87dca9e | \n",
69 | " SOBFOVM12A58A7D494 | \n",
70 | " 1 | \n",
71 | "
\n",
72 | " \n",
73 | "
\n",
74 | "
"
75 | ],
76 | "text/plain": [
77 | " 0 1 2\n",
78 | "0 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOAKIMP12A8C130995 1\n",
79 | "1 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOAPDEY12A81C210A9 1\n",
80 | "2 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBBMDR12A8C13253B 1\n",
81 | "3 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBFNSP12AF72A0E22 1\n",
82 | "4 b80344d063b5ccb3212f76538f3d9e43d87dca9e SOBFOVM12A58A7D494 1"
83 | ]
84 | },
85 | "execution_count": 4,
86 | "metadata": {},
87 | "output_type": "execute_result"
88 | }
89 | ],
90 | "source": [
91 | "import pandas as pd\n",
92 | "listen_count = pd.read_csv('data/millionsong/train_triplets.txt.zip', header=None, delimiter='\\t', compression='zip')\n",
93 | "\n",
94 | "# このデータはユーザID、曲ID、再生回数の3つの列で構成されます。\n",
95 | "# 再生回数 0 を含まないため、単に再生回数の列をすべて 1 で上書きすることで\n",
96 | "# 再生回数を二値化できます。\n",
97 | "listen_count.iloc[:, 2] = 1\n",
98 | "\n",
99 | "listen_count.head()"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {},
106 | "outputs": [],
107 | "source": []
108 | }
109 | ],
110 | "metadata": {
111 | "kernelspec": {
112 | "display_name": "Python 3",
113 | "language": "python",
114 | "name": "python3"
115 | },
116 | "language_info": {
117 | "codemirror_mode": {
118 | "name": "ipython",
119 | "version": 3
120 | },
121 | "file_extension": ".py",
122 | "mimetype": "text/x-python",
123 | "name": "python",
124 | "nbconvert_exporter": "python",
125 | "pygments_lexer": "ipython3",
126 | "version": "3.7.0"
127 | }
128 | },
129 | "nbformat": 4,
130 | "nbformat_minor": 2
131 | }
132 |
--------------------------------------------------------------------------------
/check_version.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 21,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "data": {
10 | "text/plain": [
11 | "'1.15.3'"
12 | ]
13 | },
14 | "execution_count": 21,
15 | "metadata": {},
16 | "output_type": "execute_result"
17 | }
18 | ],
19 | "source": [
20 | "import numpy as np\n",
21 | "np.version.full_version"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 22,
27 | "metadata": {},
28 | "outputs": [
29 | {
30 | "data": {
31 | "text/plain": [
32 | "'1.1.0'"
33 | ]
34 | },
35 | "execution_count": 22,
36 | "metadata": {},
37 | "output_type": "execute_result"
38 | }
39 | ],
40 | "source": [
41 | "import scipy as sp\n",
42 | "sp.version.full_version"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 23,
48 | "metadata": {},
49 | "outputs": [
50 | {
51 | "data": {
52 | "text/plain": [
53 | "'3.0.0'"
54 | ]
55 | },
56 | "execution_count": 23,
57 | "metadata": {},
58 | "output_type": "execute_result"
59 | }
60 | ],
61 | "source": [
62 | "import matplotlib\n",
63 | "matplotlib.__version__"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 24,
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/plain": [
74 | "'0.20.0'"
75 | ]
76 | },
77 | "execution_count": 24,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "import sklearn\n",
84 | "sklearn.__version__"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 20,
90 | "metadata": {},
91 | "outputs": [
92 | {
93 | "name": "stdout",
94 | "output_type": "stream",
95 | "text": [
96 | "\n",
97 | "INSTALLED VERSIONS\n",
98 | "------------------\n",
99 | "commit: None\n",
100 | "python: 3.7.0.final.0\n",
101 | "python-bits: 64\n",
102 | "OS: Darwin\n",
103 | "OS-release: 17.7.0\n",
104 | "machine: x86_64\n",
105 | "processor: i386\n",
106 | "byteorder: little\n",
107 | "LC_ALL: None\n",
108 | "LANG: ja_JP.UTF-8\n",
109 | "LOCALE: ja_JP.UTF-8\n",
110 | "\n",
111 | "pandas: 0.23.4\n",
112 | "pytest: None\n",
113 | "pip: 18.0\n",
114 | "setuptools: 40.2.0\n",
115 | "Cython: None\n",
116 | "numpy: 1.15.3\n",
117 | "scipy: 1.1.0\n",
118 | "pyarrow: None\n",
119 | "xarray: None\n",
120 | "IPython: 7.0.1\n",
121 | "sphinx: None\n",
122 | "patsy: None\n",
123 | "dateutil: 2.7.3\n",
124 | "pytz: 2018.5\n",
125 | "blosc: None\n",
126 | "bottleneck: None\n",
127 | "tables: None\n",
128 | "numexpr: None\n",
129 | "feather: None\n",
130 | "matplotlib: 3.0.0\n",
131 | "openpyxl: None\n",
132 | "xlrd: None\n",
133 | "xlwt: None\n",
134 | "xlsxwriter: None\n",
135 | "lxml: None\n",
136 | "bs4: None\n",
137 | "html5lib: None\n",
138 | "sqlalchemy: None\n",
139 | "pymysql: None\n",
140 | "psycopg2: None\n",
141 | "jinja2: 2.10\n",
142 | "s3fs: None\n",
143 | "fastparquet: None\n",
144 | "pandas_gbq: None\n",
145 | "pandas_datareader: None\n"
146 | ]
147 | }
148 | ],
149 | "source": [
150 | "import pandas as pd\n",
151 | "pd.show_versions()"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": []
160 | }
161 | ],
162 | "metadata": {
163 | "kernelspec": {
164 | "display_name": "Python 3",
165 | "language": "python",
166 | "name": "python3"
167 | },
168 | "language_info": {
169 | "codemirror_mode": {
170 | "name": "ipython",
171 | "version": 3
172 | },
173 | "file_extension": ".py",
174 | "mimetype": "text/x-python",
175 | "name": "python",
176 | "nbconvert_exporter": "python",
177 | "pygments_lexer": "ipython3",
178 | "version": "3.7.0"
179 | }
180 | },
181 | "nbformat": 4,
182 | "nbformat_minor": 2
183 | }
184 |
--------------------------------------------------------------------------------
/3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 例3-1. nグラムの計算"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stdout",
17 | "output_type": "stream",
18 | "text": [
19 | "29222 368943 881620\n"
20 | ]
21 | },
22 | {
23 | "data": {
24 | "text/plain": [
25 | "['0', '00', '000', '007', '00a', '00am', '00pm', '01', '02', '03']"
26 | ]
27 | },
28 | "execution_count": 1,
29 | "metadata": {},
30 | "output_type": "execute_result"
31 | }
32 | ],
33 | "source": [
34 | "import pandas as pd\n",
35 | "import json\n",
36 | "from sklearn.feature_extraction.text import CountVectorizer\n",
37 | "\n",
38 | "# Load the first 10,000 reviews; the file holds one JSON object per line.\n",
39 | "# Decode explicitly as UTF-8 and stop early if the file has fewer lines.\n",
40 | "with open('data/yelp/yelp_academic_dataset_review.json', encoding='utf-8') as f:\n",
41 | "    js = [json.loads(line) for _, line in zip(range(10000), f)]\n",
42 | "review_df = pd.DataFrame(js)\n",
43 | "\n",
44 | "# Build unigram (bag-of-words), bigram and trigram feature transformers\n",
45 | "# with scikit-learn's CountVectorizer.\n",
46 | "# By default CountVectorizer ignores single-character words, which is\n",
47 | "# practical because it drops words that carry little meaning.\n",
48 | "# Here the token pattern is overridden so that every word is kept.\n",
49 | "bow_converter = CountVectorizer(token_pattern='(?u)\\\\b\\\\w+\\\\b')\n",
50 | "bigram_converter = CountVectorizer(ngram_range=(2,2), token_pattern='(?u)\\\\b\\\\w+\\\\b')\n",
51 | "trigram_converter = CountVectorizer(ngram_range=(3,3), token_pattern='(?u)\\\\b\\\\w+\\\\b')\n",
52 | "\n",
53 | "# Fit each transformer on the review text and collect its vocabulary\n",
54 | "bow_converter.fit(review_df['text'])\n",
55 | "words = bow_converter.get_feature_names()\n",
56 | "\n",
57 | "bigram_converter.fit(review_df['text'])\n",
58 | "bigrams = bigram_converter.get_feature_names()\n",
59 | "\n",
60 | "trigram_converter.fit(review_df['text'])\n",
61 | "trigrams = trigram_converter.get_feature_names()\n",
62 | "\n",
63 | "# Vocabulary sizes: unigrams, bigrams, trigrams\n",
64 | "print(len(words), len(bigrams), len(trigrams))\n",
65 | "\n",
66 | "# Inspect the first few unigrams\n",
67 | "words[:10]"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 3,
73 | "metadata": {},
74 | "outputs": [
75 | {
76 | "data": {
77 | "text/plain": [
78 | "['zuzu was',\n",
79 | " 'zuzus room',\n",
80 | " 'zweigel wine',\n",
81 | " 'zwiebel kräuter',\n",
82 | " 'zy world',\n",
83 | " 'zzed in',\n",
84 | " 'éclairs napoleons',\n",
85 | " 'école lenôtre',\n",
86 | " 'ém all',\n",
87 | " 'òc châm']"
88 | ]
89 | },
90 | "execution_count": 3,
91 | "metadata": {},
92 | "output_type": "execute_result"
93 | }
94 | ],
95 | "source": [
96 | "bigrams[-10:]"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 4,
102 | "metadata": {},
103 | "outputs": [
104 | {
105 | "data": {
106 | "text/plain": [
107 | "['0 0 eye',\n",
108 | " '0 20 less',\n",
109 | " '0 39 oz',\n",
110 | " '0 39 pizza',\n",
111 | " '0 5 i',\n",
112 | " '0 50 to',\n",
113 | " '0 6 can',\n",
114 | " '0 75 oysters',\n",
115 | " '0 75 that',\n",
116 | " '0 75 to']"
117 | ]
118 | },
119 | "execution_count": 4,
120 | "metadata": {},
121 | "output_type": "execute_result"
122 | }
123 | ],
124 | "source": [
125 | "trigrams[:10]"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "## 例3-2. 品詞タグ付けとチャンク化"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 2,
138 | "metadata": {},
139 | "outputs": [
140 | {
141 | "name": "stdout",
142 | "output_type": "stream",
143 | "text": [
144 | "['General', 'PROPN', 'NNP']\n",
145 | "['Manager', 'PROPN', 'NNP']\n",
146 | "['Scott', 'PROPN', 'NNP']\n",
147 | "['Petello', 'PROPN', 'NNP']\n",
148 | "['is', 'VERB', 'VBZ']\n",
149 | "['a', 'DET', 'DT']\n",
150 | "['good', 'ADJ', 'JJ']\n",
151 | "['egg', 'NOUN', 'NN']\n",
152 | "['!', 'PUNCT', '.']\n",
153 | "['!', 'PUNCT', '.']\n",
154 | "['!', 'PUNCT', '.']\n",
155 | "['Not', 'ADV', 'RB']\n",
156 | "['to', 'PART', 'TO']\n",
157 | "['go', 'VERB', 'VB']\n",
158 | "['into', 'ADP', 'IN']\n",
159 | "['detail', 'NOUN', 'NN']\n",
160 | "[',', 'PUNCT', ',']\n",
161 | "['but', 'CCONJ', 'CC']\n",
162 | "['let', 'VERB', 'VB']\n",
163 | "['me', 'PRON', 'PRP']\n",
164 | "['assure', 'VERB', 'VB']\n",
165 | "['you', 'PRON', 'PRP']\n",
166 | "['if', 'ADP', 'IN']\n",
167 | "['you', 'PRON', 'PRP']\n",
168 | "['have', 'VERB', 'VBP']\n",
169 | "['any', 'DET', 'DT']\n",
170 | "['issues', 'NOUN', 'NNS']\n",
171 | "['(', 'PUNCT', '-LRB-']\n",
172 | "['albeit', 'ADP', 'IN']\n",
173 | "['rare', 'ADJ', 'JJ']\n",
174 | "[')', 'PUNCT', '-RRB-']\n",
175 | "['speak', 'VERB', 'VBP']\n",
176 | "['with', 'ADP', 'IN']\n",
177 | "['Scott', 'PROPN', 'NNP']\n",
178 | "['and', 'CCONJ', 'CC']\n",
179 | "['treat', 'VERB', 'VB']\n",
180 | "['the', 'DET', 'DT']\n",
181 | "['guy', 'NOUN', 'NN']\n",
182 | "['with', 'ADP', 'IN']\n",
183 | "['some', 'DET', 'DT']\n",
184 | "['respect', 'NOUN', 'NN']\n",
185 | "['as', 'ADP', 'IN']\n",
186 | "['you', 'PRON', 'PRP']\n",
187 | "['state', 'VERB', 'VBP']\n",
188 | "['your', 'ADJ', 'PRP$']\n",
189 | "['case', 'NOUN', 'NN']\n",
190 | "['and', 'CCONJ', 'CC']\n",
191 | "['I', 'PRON', 'PRP']\n",
192 | "[\"'d\", 'VERB', 'MD']\n",
193 | "['be', 'VERB', 'VB']\n",
194 | "['surprised', 'ADJ', 'JJ']\n",
195 | "['if', 'ADP', 'IN']\n",
196 | "['you', 'PRON', 'PRP']\n",
197 | "['do', 'VERB', 'VBP']\n",
198 | "[\"n't\", 'ADV', 'RB']\n",
199 | "['walk', 'VERB', 'VB']\n",
200 | "['out', 'ADV', 'RB']\n",
201 | "['totally', 'ADV', 'RB']\n",
202 | "['satisfied', 'ADJ', 'JJ']\n",
203 | "['as', 'ADP', 'IN']\n",
204 | "['I', 'PRON', 'PRP']\n",
205 | "['just', 'ADV', 'RB']\n",
206 | "['did', 'VERB', 'VBD']\n",
207 | "['.', 'PUNCT', '.']\n",
208 | "['Like', 'INTJ', 'UH']\n",
209 | "['I', 'PRON', 'PRP']\n",
210 | "['always', 'ADV', 'RB']\n",
211 | "['say', 'VERB', 'VBP']\n",
212 | "['.....', 'PUNCT', 'NFP']\n",
213 | "['\"', 'PUNCT', '``']\n",
214 | "['Mistakes', 'NOUN', 'NNS']\n",
215 | "['are', 'VERB', 'VBP']\n",
216 | "['inevitable', 'ADJ', 'JJ']\n",
217 | "[',', 'PUNCT', ',']\n",
218 | "['it', 'PRON', 'PRP']\n",
219 | "[\"'s\", 'VERB', 'VBZ']\n",
220 | "['how', 'ADV', 'WRB']\n",
221 | "['we', 'PRON', 'PRP']\n",
222 | "['recover', 'VERB', 'VBP']\n",
223 | "['from', 'ADP', 'IN']\n",
224 | "['them', 'PRON', 'PRP']\n",
225 | "['that', 'ADJ', 'WDT']\n",
226 | "['is', 'VERB', 'VBZ']\n",
227 | "['important', 'ADJ', 'JJ']\n",
228 | "['\"', 'PUNCT', \"''\"]\n",
229 | "['!', 'PUNCT', '.']\n",
230 | "['!', 'PUNCT', '.']\n",
231 | "['!', 'PUNCT', '.']\n",
232 | "['\\n\\n', 'SPACE', '_SP']\n",
233 | "['Thanks', 'NOUN', 'NNS']\n",
234 | "['to', 'ADP', 'IN']\n",
235 | "['Scott', 'PROPN', 'NNP']\n",
236 | "['and', 'CCONJ', 'CC']\n",
237 | "['his', 'ADJ', 'PRP$']\n",
238 | "['awesome', 'ADJ', 'JJ']\n",
239 | "['staff', 'NOUN', 'NN']\n",
240 | "['.', 'PUNCT', '.']\n",
241 | "['You', 'PRON', 'PRP']\n",
242 | "[\"'ve\", 'VERB', 'VB']\n",
243 | "['got', 'VERB', 'VBN']\n",
244 | "['a', 'DET', 'DT']\n",
245 | "['customer', 'NOUN', 'NN']\n",
246 | "['for', 'ADP', 'IN']\n",
247 | "['life', 'NOUN', 'NN']\n",
248 | "['!', 'PUNCT', '.']\n",
249 | "['!', 'PUNCT', '.']\n",
250 | "['..........', 'PUNCT', 'NFP']\n",
251 | "[':', 'PUNCT', ':']\n",
252 | "['^', 'PUNCT', 'NFP']\n",
253 | "[')', 'PUNCT', '-RRB-']\n",
254 | "[General Manager Scott Petello, a good egg, detail, me, you, you, any issues, Scott, the guy, some respect, you, your case, I, you, I, I, Mistakes, it, we, them, Thanks, Scott, his awesome staff, You, a customer, life]\n"
255 | ]
256 | }
257 | ],
258 | "source": [
259 | "import pandas as pd\n",
260 | "import json\n",
261 | "\n",
262 | "# Load the first 10 reviews; the file holds one JSON object per line.\n",
263 | "# Decode explicitly as UTF-8 and stop early if the file has fewer lines.\n",
264 | "with open('data/yelp/yelp_academic_dataset_review.json', encoding='utf-8') as f:\n",
265 | "    js = [json.loads(line) for _, line in zip(range(10), f)]\n",
266 | "review_df = pd.DataFrame(js)\n",
267 | "\n",
268 | "# First, the spaCy approach\n",
269 | "import spacy\n",
270 | "# Load the English language model\n",
271 | "nlp = spacy.load('en')\n",
272 | "\n",
273 | "# Run the spaCy pipeline on every review text, giving a pandas Series of parsed documents\n",
274 | "doc_df = review_df['text'].apply(nlp)\n",
275 | "\n",
276 | "# spaCy exposes the coarse-grained part-of-speech tag as .pos_ and the\n",
277 | "# fine-grained tag as .tag_ (e.g. PROPN vs. NNP in the output of this loop)\n",
278 | "for token in doc_df[4]:\n",
279 | "    print([token.text, token.pos_, token.tag_])\n",
280 | "\n",
281 | "# spaCy also exposes base noun phrases via .noun_chunks\n",
282 | "print([chunk for chunk in doc_df[4].noun_chunks])"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": 7,
288 | "metadata": {},
289 | "outputs": [
290 | {
291 | "data": {
292 | "text/plain": [
293 | "[('General', 'NNP'),\n",
294 | " ('Manager', 'NNP'),\n",
295 | " ('Scott', 'NNP'),\n",
296 | " ('Petello', 'NNP'),\n",
297 | " ('is', 'VBZ'),\n",
298 | " ('a', 'DT'),\n",
299 | " ('good', 'JJ'),\n",
300 | " ('egg', 'NN'),\n",
301 | " ('Not', 'RB'),\n",
302 | " ('to', 'TO'),\n",
303 | " ('go', 'VB'),\n",
304 | " ('into', 'IN'),\n",
305 | " ('detail', 'NN'),\n",
306 | " ('but', 'CC'),\n",
307 | " ('let', 'VB'),\n",
308 | " ('me', 'PRP'),\n",
309 | " ('assure', 'VB'),\n",
310 | " ('you', 'PRP'),\n",
311 | " ('if', 'IN'),\n",
312 | " ('you', 'PRP'),\n",
313 | " ('have', 'VBP'),\n",
314 | " ('any', 'DT'),\n",
315 | " ('issues', 'NNS'),\n",
316 | " ('albeit', 'IN'),\n",
317 | " ('rare', 'NN'),\n",
318 | " ('speak', 'NN'),\n",
319 | " ('with', 'IN'),\n",
320 | " ('Scott', 'NNP'),\n",
321 | " ('and', 'CC'),\n",
322 | " ('treat', 'VB'),\n",
323 | " ('the', 'DT'),\n",
324 | " ('guy', 'NN'),\n",
325 | " ('with', 'IN'),\n",
326 | " ('some', 'DT'),\n",
327 | " ('respect', 'NN'),\n",
328 | " ('as', 'IN'),\n",
329 | " ('you', 'PRP'),\n",
330 | " ('state', 'NN'),\n",
331 | " ('your', 'PRP$'),\n",
332 | " ('case', 'NN'),\n",
333 | " ('and', 'CC'),\n",
334 | " ('I', 'PRP'),\n",
335 | " (\"'d\", 'MD'),\n",
336 | " ('be', 'VB'),\n",
337 | " ('surprised', 'VBN'),\n",
338 | " ('if', 'IN'),\n",
339 | " ('you', 'PRP'),\n",
340 | " ('do', 'VBP'),\n",
341 | " (\"n't\", 'RB'),\n",
342 | " ('walk', 'VB'),\n",
343 | " ('out', 'RP'),\n",
344 | " ('totally', 'RB'),\n",
345 | " ('satisfied', 'JJ'),\n",
346 | " ('as', 'IN'),\n",
347 | " ('I', 'PRP'),\n",
348 | " ('just', 'RB'),\n",
349 | " ('did', 'VBD'),\n",
350 | " ('Like', 'IN'),\n",
351 | " ('I', 'PRP'),\n",
352 | " ('always', 'RB'),\n",
353 | " ('say', 'VBP'),\n",
354 | " ('..', 'VBP'),\n",
355 | " ('Mistakes', 'NNS'),\n",
356 | " ('are', 'VBP'),\n",
357 | " ('inevitable', 'JJ'),\n",
358 | " ('it', 'PRP'),\n",
359 | " (\"'s\", 'VBZ'),\n",
360 | " ('how', 'WRB'),\n",
361 | " ('we', 'PRP'),\n",
362 | " ('recover', 'VBP'),\n",
363 | " ('from', 'IN'),\n",
364 | " ('them', 'PRP'),\n",
365 | " ('that', 'WDT'),\n",
366 | " ('is', 'VBZ'),\n",
367 | " ('important', 'JJ'),\n",
368 | " ('Thanks', 'NNS'),\n",
369 | " ('to', 'TO'),\n",
370 | " ('Scott', 'NNP'),\n",
371 | " ('and', 'CC'),\n",
372 | " ('his', 'PRP$'),\n",
373 | " ('awesome', 'JJ'),\n",
374 | " ('staff', 'NN'),\n",
375 | " ('You', 'PRP'),\n",
376 | " (\"'ve\", 'VBP'),\n",
377 | " ('got', 'VBN'),\n",
378 | " ('a', 'DT'),\n",
379 | " ('customer', 'NN'),\n",
380 | " ('for', 'IN'),\n",
381 | " ('life', 'NN'),\n",
382 | " ('^', 'NN')]"
383 | ]
384 | },
385 | "execution_count": 7,
386 | "metadata": {},
387 | "output_type": "execute_result"
388 | }
389 | ],
390 | "source": [
391 | "# TextBlob ライブラリを使って同じことができる\n",
392 | "from textblob import TextBlob\n",
393 | "\n",
394 | "# TextBlob はデフォルトでは PatternTagger を使ってタグ付けを行う。\n",
395 | "# これは今回の例ではうまくいくが、文法の正しくない文章を含む場合は \n",
396 | "# NLTKTagger を使うことをおすすめする。\n",
397 | "blob_df = review_df['text'].apply(TextBlob)\n",
398 | "\n",
399 | "blob_df[4].tags"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": 8,
405 | "metadata": {},
406 | "outputs": [
407 | {
408 | "name": "stdout",
409 | "output_type": "stream",
410 | "text": [
411 | "['general manager', 'scott petello', 'good egg', 'scott', \"n't walk\", '... ..', 'mistakes', 'thanks', 'scott', 'awesome staff', '... ... ...']\n"
412 | ]
413 | }
414 | ],
415 | "source": [
416 | "print([phrase for phrase in blob_df[4].noun_phrases])"
417 | ]
418 | },
419 | {
420 | "cell_type": "code",
421 | "execution_count": null,
422 | "metadata": {},
423 | "outputs": [],
424 | "source": []
425 | }
426 | ],
427 | "metadata": {
428 | "kernelspec": {
429 | "display_name": "Python 3",
430 | "language": "python",
431 | "name": "python3"
432 | },
433 | "language_info": {
434 | "codemirror_mode": {
435 | "name": "ipython",
436 | "version": 3
437 | },
438 | "file_extension": ".py",
439 | "mimetype": "text/x-python",
440 | "name": "python",
441 | "nbconvert_exporter": "python",
442 | "pygments_lexer": "ipython3",
443 | "version": "3.7.0"
444 | }
445 | },
446 | "nbformat": 4,
447 | "nbformat_minor": 2
448 | }
449 |
--------------------------------------------------------------------------------
/5-1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 例5-1. One-Hotエンコーディングとダミーコーディングを利用した線形回帰モデリング"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "data": {
17 | "text/plain": [
18 | "3333.3333333333335"
19 | ]
20 | },
21 | "execution_count": 1,
22 | "metadata": {},
23 | "output_type": "execute_result"
24 | }
25 | ],
26 | "source": [
27 | "import pandas as pd\n",
28 | "from sklearn import linear_model\n",
29 | "\n",
30 | "# 3つの都市におけるアパートの家賃のデータセットを設定\n",
31 | "df = pd.DataFrame({\n",
32 | " 'City': ['SF', 'SF', 'SF', 'NYC', 'NYC', 'NYC', 'Seattle', 'Seattle', 'Seattle'],\n",
33 | " 'Rent': [3999, 4000, 4001, 3499, 3500, 3501, 2499, 2500, 2501]\n",
34 | "})\n",
35 | "\n",
36 | "df['Rent'].mean()"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "data": {
46 | "text/html": [
47 | "\n",
48 | "\n",
61 | "
\n",
62 | " \n",
63 | " \n",
64 | " | \n",
65 | " Rent | \n",
66 | " city_NYC | \n",
67 | " city_SF | \n",
68 | " city_Seattle | \n",
69 | "
\n",
70 | " \n",
71 | " \n",
72 | " \n",
73 | " | 0 | \n",
74 | " 3999 | \n",
75 | " 0 | \n",
76 | " 1 | \n",
77 | " 0 | \n",
78 | "
\n",
79 | " \n",
80 | " | 1 | \n",
81 | " 4000 | \n",
82 | " 0 | \n",
83 | " 1 | \n",
84 | " 0 | \n",
85 | "
\n",
86 | " \n",
87 | " | 2 | \n",
88 | " 4001 | \n",
89 | " 0 | \n",
90 | " 1 | \n",
91 | " 0 | \n",
92 | "
\n",
93 | " \n",
94 | " | 3 | \n",
95 | " 3499 | \n",
96 | " 1 | \n",
97 | " 0 | \n",
98 | " 0 | \n",
99 | "
\n",
100 | " \n",
101 | " | 4 | \n",
102 | " 3500 | \n",
103 | " 1 | \n",
104 | " 0 | \n",
105 | " 0 | \n",
106 | "
\n",
107 | " \n",
108 | " | 5 | \n",
109 | " 3501 | \n",
110 | " 1 | \n",
111 | " 0 | \n",
112 | " 0 | \n",
113 | "
\n",
114 | " \n",
115 | " | 6 | \n",
116 | " 2499 | \n",
117 | " 0 | \n",
118 | " 0 | \n",
119 | " 1 | \n",
120 | "
\n",
121 | " \n",
122 | " | 7 | \n",
123 | " 2500 | \n",
124 | " 0 | \n",
125 | " 0 | \n",
126 | " 1 | \n",
127 | "
\n",
128 | " \n",
129 | " | 8 | \n",
130 | " 2501 | \n",
131 | " 0 | \n",
132 | " 0 | \n",
133 | " 1 | \n",
134 | "
\n",
135 | " \n",
136 | "
\n",
137 | "
"
138 | ],
139 | "text/plain": [
140 | " Rent city_NYC city_SF city_Seattle\n",
141 | "0 3999 0 1 0\n",
142 | "1 4000 0 1 0\n",
143 | "2 4001 0 1 0\n",
144 | "3 3499 1 0 0\n",
145 | "4 3500 1 0 0\n",
146 | "5 3501 1 0 0\n",
147 | "6 2499 0 0 1\n",
148 | "7 2500 0 0 1\n",
149 | "8 2501 0 0 1"
150 | ]
151 | },
152 | "execution_count": 2,
153 | "metadata": {},
154 | "output_type": "execute_result"
155 | }
156 | ],
157 | "source": [
158 | "# One-Hotエンコーディングをカテゴリ値であるcity列に適用\n",
159 | "# 特徴量をOne-Hotエンコーディングで生成した列に、ターゲット変数を家賃に指定し、線形回帰モデルを学習\n",
160 | "one_hot_df = pd.get_dummies(df, prefix=['city'])\n",
161 | "one_hot_df"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": 6,
167 | "metadata": {},
168 | "outputs": [
169 | {
170 | "data": {
171 | "text/plain": [
172 | "array([ 166.66666667, 666.66666667, -833.33333333])"
173 | ]
174 | },
175 | "execution_count": 6,
176 | "metadata": {},
177 | "output_type": "execute_result"
178 | }
179 | ],
180 | "source": [
181 | "model = linear_model.LinearRegression()\n",
182 | "model.fit(one_hot_df[['city_NYC', 'city_SF', 'city_Seattle']], one_hot_df['Rent'])\n",
183 | "model.coef_"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 7,
189 | "metadata": {},
190 | "outputs": [
191 | {
192 | "data": {
193 | "text/plain": [
194 | "3333.3333333333335"
195 | ]
196 | },
197 | "execution_count": 7,
198 | "metadata": {},
199 | "output_type": "execute_result"
200 | }
201 | ],
202 | "source": [
203 | "model.intercept_"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 8,
209 | "metadata": {
210 | "scrolled": true
211 | },
212 | "outputs": [
213 | {
214 | "data": {
215 | "text/html": [
216 | "\n",
217 | "\n",
230 | "
\n",
231 | " \n",
232 | " \n",
233 | " | \n",
234 | " Rent | \n",
235 | " city_SF | \n",
236 | " city_Seattle | \n",
237 | "
\n",
238 | " \n",
239 | " \n",
240 | " \n",
241 | " | 0 | \n",
242 | " 3999 | \n",
243 | " 1 | \n",
244 | " 0 | \n",
245 | "
\n",
246 | " \n",
247 | " | 1 | \n",
248 | " 4000 | \n",
249 | " 1 | \n",
250 | " 0 | \n",
251 | "
\n",
252 | " \n",
253 | " | 2 | \n",
254 | " 4001 | \n",
255 | " 1 | \n",
256 | " 0 | \n",
257 | "
\n",
258 | " \n",
259 | " | 3 | \n",
260 | " 3499 | \n",
261 | " 0 | \n",
262 | " 0 | \n",
263 | "
\n",
264 | " \n",
265 | " | 4 | \n",
266 | " 3500 | \n",
267 | " 0 | \n",
268 | " 0 | \n",
269 | "
\n",
270 | " \n",
271 | " | 5 | \n",
272 | " 3501 | \n",
273 | " 0 | \n",
274 | " 0 | \n",
275 | "
\n",
276 | " \n",
277 | " | 6 | \n",
278 | " 2499 | \n",
279 | " 0 | \n",
280 | " 1 | \n",
281 | "
\n",
282 | " \n",
283 | " | 7 | \n",
284 | " 2500 | \n",
285 | " 0 | \n",
286 | " 1 | \n",
287 | "
\n",
288 | " \n",
289 | " | 8 | \n",
290 | " 2501 | \n",
291 | " 0 | \n",
292 | " 1 | \n",
293 | "
\n",
294 | " \n",
295 | "
\n",
296 | "
"
297 | ],
298 | "text/plain": [
299 | " Rent city_SF city_Seattle\n",
300 | "0 3999 1 0\n",
301 | "1 4000 1 0\n",
302 | "2 4001 1 0\n",
303 | "3 3499 0 0\n",
304 | "4 3500 0 0\n",
305 | "5 3501 0 0\n",
306 | "6 2499 0 1\n",
307 | "7 2500 0 1\n",
308 | "8 2501 0 1"
309 | ]
310 | },
311 | "execution_count": 8,
312 | "metadata": {},
313 | "output_type": "execute_result"
314 | }
315 | ],
316 | "source": [
317 | "# ダミーコーディングを利用して線形回帰モデルを学習\n",
318 | "dummy_df = pd.get_dummies(df, prefix=['city'], drop_first=True)\n",
319 | "dummy_df"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 9,
325 | "metadata": {},
326 | "outputs": [
327 | {
328 | "data": {
329 | "text/plain": [
330 | "array([ 500., -1000.])"
331 | ]
332 | },
333 | "execution_count": 9,
334 | "metadata": {},
335 | "output_type": "execute_result"
336 | }
337 | ],
338 | "source": [
339 | "model.fit(dummy_df[['city_SF', 'city_Seattle']], dummy_df['Rent'])\n",
340 | "model.coef_"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": 10,
346 | "metadata": {},
347 | "outputs": [
348 | {
349 | "data": {
350 | "text/plain": [
351 | "3500.0"
352 | ]
353 | },
354 | "execution_count": 10,
355 | "metadata": {},
356 | "output_type": "execute_result"
357 | }
358 | ],
359 | "source": [
360 | "model.intercept_"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "metadata": {},
366 | "source": [
367 | "## 例5-2. Effectコーディングを用いた線形回帰モデル"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": 11,
373 | "metadata": {},
374 | "outputs": [
375 | {
376 | "data": {
377 | "text/html": [
378 | "\n",
379 | "\n",
392 | "
\n",
393 | " \n",
394 | " \n",
395 | " | \n",
396 | " Rent | \n",
397 | " city_SF | \n",
398 | " city_Seattle | \n",
399 | "
\n",
400 | " \n",
401 | " \n",
402 | " \n",
403 | " | 0 | \n",
404 | " 3999 | \n",
405 | " 1.0 | \n",
406 | " 0.0 | \n",
407 | "
\n",
408 | " \n",
409 | " | 1 | \n",
410 | " 4000 | \n",
411 | " 1.0 | \n",
412 | " 0.0 | \n",
413 | "
\n",
414 | " \n",
415 | " | 2 | \n",
416 | " 4001 | \n",
417 | " 1.0 | \n",
418 | " 0.0 | \n",
419 | "
\n",
420 | " \n",
421 | " | 3 | \n",
422 | " 3499 | \n",
423 | " -1.0 | \n",
424 | " -1.0 | \n",
425 | "
\n",
426 | " \n",
427 | " | 4 | \n",
428 | " 3500 | \n",
429 | " -1.0 | \n",
430 | " -1.0 | \n",
431 | "
\n",
432 | " \n",
433 | " | 5 | \n",
434 | " 3501 | \n",
435 | " -1.0 | \n",
436 | " -1.0 | \n",
437 | "
\n",
438 | " \n",
439 | " | 6 | \n",
440 | " 2499 | \n",
441 | " 0.0 | \n",
442 | " 1.0 | \n",
443 | "
\n",
444 | " \n",
445 | " | 7 | \n",
446 | " 2500 | \n",
447 | " 0.0 | \n",
448 | " 1.0 | \n",
449 | "
\n",
450 | " \n",
451 | " | 8 | \n",
452 | " 2501 | \n",
453 | " 0.0 | \n",
454 | " 1.0 | \n",
455 | "
\n",
456 | " \n",
457 | "
\n",
458 | "
"
459 | ],
460 | "text/plain": [
461 | " Rent city_SF city_Seattle\n",
462 | "0 3999 1.0 0.0\n",
463 | "1 4000 1.0 0.0\n",
464 | "2 4001 1.0 0.0\n",
465 | "3 3499 -1.0 -1.0\n",
466 | "4 3500 -1.0 -1.0\n",
467 | "5 3501 -1.0 -1.0\n",
468 | "6 2499 0.0 1.0\n",
469 | "7 2500 0.0 1.0\n",
470 | "8 2501 0.0 1.0"
471 | ]
472 | },
473 | "execution_count": 11,
474 | "metadata": {},
475 | "output_type": "execute_result"
476 | }
477 | ],
478 | "source": [
479 | "effect_df = dummy_df.astype({'city_SF': 'float64', 'city_Seattle': 'float64'})  # new frame; cast the indicator columns to float up front so -1.0 can be stored without an implicit dtype change\n",
480 | "effect_df.loc[3:5, ['city_SF', 'city_Seattle']] = -1.0  # rows 3-5 (label slice, inclusive) are the NYC rows, the category dropped by drop_first; effect coding marks them with -1 in every column\n",
481 | "effect_df"
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": 12,
487 | "metadata": {},
488 | "outputs": [
489 | {
490 | "data": {
491 | "text/plain": [
492 | "array([ 666.66666667, -833.33333333])"
493 | ]
494 | },
495 | "execution_count": 12,
496 | "metadata": {},
497 | "output_type": "execute_result"
498 | }
499 | ],
500 | "source": [
501 | "model.fit(effect_df[['city_SF', 'city_Seattle']], effect_df['Rent'])\n",
502 | "model.coef_"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": 13,
508 | "metadata": {},
509 | "outputs": [
510 | {
511 | "data": {
512 | "text/plain": [
513 | "3333.3333333333335"
514 | ]
515 | },
516 | "execution_count": 13,
517 | "metadata": {},
518 | "output_type": "execute_result"
519 | }
520 | ],
521 | "source": [
522 | "model.intercept_"
523 | ]
524 | },
525 | {
526 | "cell_type": "code",
527 | "execution_count": null,
528 | "metadata": {},
529 | "outputs": [],
530 | "source": []
531 | }
532 | ],
533 | "metadata": {
534 | "kernelspec": {
535 | "display_name": "Python 3",
536 | "language": "python",
537 | "name": "python3"
538 | },
539 | "language_info": {
540 | "codemirror_mode": {
541 | "name": "ipython",
542 | "version": 3
543 | },
544 | "file_extension": ".py",
545 | "mimetype": "text/x-python",
546 | "name": "python",
547 | "nbconvert_exporter": "python",
548 | "pygments_lexer": "ipython3",
549 | "version": "3.7.0"
550 | }
551 | },
552 | "nbformat": 4,
553 | "nbformat_minor": 2
554 | }
555 |
--------------------------------------------------------------------------------
/explain_loc_iloc_get_loc.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "data": {
10 | "text/html": [
11 | "\n",
12 | "\n",
25 | "
\n",
26 | " \n",
27 | " \n",
28 | " | \n",
29 | " b | \n",
30 | " a | \n",
31 | " b | \n",
32 | "
\n",
33 | " \n",
34 | " \n",
35 | " \n",
36 | " | 10 | \n",
37 | " 1 | \n",
38 | " 2 | \n",
39 | " 3 | \n",
40 | "
\n",
41 | " \n",
42 | " | 11 | \n",
43 | " 4 | \n",
44 | " 5 | \n",
45 | " 6 | \n",
46 | "
\n",
47 | " \n",
48 | " | 11 | \n",
49 | " 7 | \n",
50 | " 8 | \n",
51 | " 9 | \n",
52 | "
\n",
53 | " \n",
54 | "
\n",
55 | "
"
56 | ],
57 | "text/plain": [
58 | " b a b\n",
59 | "10 1 2 3\n",
60 | "11 4 5 6\n",
61 | "11 7 8 9"
62 | ]
63 | },
64 | "execution_count": 2,
65 | "metadata": {},
66 | "output_type": "execute_result"
67 | }
68 | ],
69 | "source": [
70 | "import pandas as pd\n",
71 | "\n",
72 | "# DataFrame基礎知識\n",
73 | "# pandasにはIndexクラスがある\n",
74 | "# pandas.DataFrameには、indexとcolumnsというプロパティがあり、\n",
75 | "# indexとcolumnsプロパティはIndexクラスが設定される\n",
76 | "# pandas.Index https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Index.html\n",
77 | "\n",
78 | "df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=[10, 11, 11],columns=['b', 'a', 'b'])\n",
79 | "df"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 21,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "data": {
89 | "text/plain": [
90 | "b 1\n",
91 | "a 2\n",
92 | "b 3\n",
93 | "Name: 10, dtype: int64"
94 | ]
95 | },
96 | "execution_count": 21,
97 | "metadata": {},
98 | "output_type": "execute_result"
99 | }
100 | ],
101 | "source": [
102 | "# locは、1次元の場合、index値で指定(pd.DataFrameは、indexとcolumnsというプロパティを持っている)\n",
103 | "# 指定する値が1つで、該当件数が1件の場合pd.Seriesで返される\n",
104 | "df.loc[10]"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 4,
110 | "metadata": {},
111 | "outputs": [
112 | {
113 | "data": {
114 | "text/html": [
115 | "\n",
116 | "\n",
129 | "
\n",
130 | " \n",
131 | " \n",
132 | " | \n",
133 | " b | \n",
134 | " a | \n",
135 | " b | \n",
136 | "
\n",
137 | " \n",
138 | " \n",
139 | " \n",
140 | " | 10 | \n",
141 | " 1 | \n",
142 | " 2 | \n",
143 | " 3 | \n",
144 | "
\n",
145 | " \n",
146 | "
\n",
147 | "
"
148 | ],
149 | "text/plain": [
150 | " b a b\n",
151 | "10 1 2 3"
152 | ]
153 | },
154 | "execution_count": 4,
155 | "metadata": {},
156 | "output_type": "execute_result"
157 | }
158 | ],
159 | "source": [
160 | "# locは、1次元の場合、index値で指定\n",
161 | "# 指定する値がリストの場合は、該当件数が1件でもpd.DataFrameで返される\n",
162 | "df.loc[[10]]"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 5,
168 | "metadata": {},
169 | "outputs": [
170 | {
171 | "data": {
172 | "text/html": [
173 | "\n",
174 | "\n",
187 | "
\n",
188 | " \n",
189 | " \n",
190 | " | \n",
191 | " b | \n",
192 | " a | \n",
193 | " b | \n",
194 | "
\n",
195 | " \n",
196 | " \n",
197 | " \n",
198 | " | 11 | \n",
199 | " 4 | \n",
200 | " 5 | \n",
201 | " 6 | \n",
202 | "
\n",
203 | " \n",
204 | " | 11 | \n",
205 | " 7 | \n",
206 | " 8 | \n",
207 | " 9 | \n",
208 | "
\n",
209 | " \n",
210 | "
\n",
211 | "
"
212 | ],
213 | "text/plain": [
214 | " b a b\n",
215 | "11 4 5 6\n",
216 | "11 7 8 9"
217 | ]
218 | },
219 | "execution_count": 5,
220 | "metadata": {},
221 | "output_type": "execute_result"
222 | }
223 | ],
224 | "source": [
225 | "# locは、1次元の場合、index値で指定\n",
226 | "# 指定する値が1つでも、該当件数が2件以上の場合pd.DataFrameで返される\n",
227 | "df.loc[11]"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 6,
233 | "metadata": {},
234 | "outputs": [
235 | {
236 | "data": {
237 | "text/plain": [
238 | "10 2\n",
239 | "11 5\n",
240 | "11 8\n",
241 | "Name: a, dtype: int64"
242 | ]
243 | },
244 | "execution_count": 6,
245 | "metadata": {},
246 | "output_type": "execute_result"
247 | }
248 | ],
249 | "source": [
250 | "# locは、2次元の場合、1次元目がindex値、2次元目がcolumns値で指定\n",
251 | "# 該当件数が1件の場合は、pd.Seriesになる\n",
252 | "df.loc[:, 'a']"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 7,
258 | "metadata": {},
259 | "outputs": [
260 | {
261 | "data": {
262 | "text/html": [
263 | "\n",
264 | "\n",
277 | "
\n",
278 | " \n",
279 | " \n",
280 | " | \n",
281 | " a | \n",
282 | "
\n",
283 | " \n",
284 | " \n",
285 | " \n",
286 | " | 10 | \n",
287 | " 2 | \n",
288 | "
\n",
289 | " \n",
290 | " | 11 | \n",
291 | " 5 | \n",
292 | "
\n",
293 | " \n",
294 | " | 11 | \n",
295 | " 8 | \n",
296 | "
\n",
297 | " \n",
298 | "
\n",
299 | "
"
300 | ],
301 | "text/plain": [
302 | " a\n",
303 | "10 2\n",
304 | "11 5\n",
305 | "11 8"
306 | ]
307 | },
308 | "execution_count": 7,
309 | "metadata": {},
310 | "output_type": "execute_result"
311 | }
312 | ],
313 | "source": [
314 | "# locは、2次元の場合、1次元目がindex値、2次元目がcolumns値で指定\n",
315 | "# リストで指定した場合は、該当件数が1件でもpd.DataFrameになる\n",
316 | "df.loc[:, ['a']]"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 8,
322 | "metadata": {},
323 | "outputs": [
324 | {
325 | "data": {
326 | "text/html": [
327 | "\n",
328 | "\n",
341 | "
\n",
342 | " \n",
343 | " \n",
344 | " | \n",
345 | " b | \n",
346 | " b | \n",
347 | "
\n",
348 | " \n",
349 | " \n",
350 | " \n",
351 | " | 10 | \n",
352 | " 1 | \n",
353 | " 3 | \n",
354 | "
\n",
355 | " \n",
356 | " | 11 | \n",
357 | " 4 | \n",
358 | " 6 | \n",
359 | "
\n",
360 | " \n",
361 | " | 11 | \n",
362 | " 7 | \n",
363 | " 9 | \n",
364 | "
\n",
365 | " \n",
366 | "
\n",
367 | "
"
368 | ],
369 | "text/plain": [
370 | " b b\n",
371 | "10 1 3\n",
372 | "11 4 6\n",
373 | "11 7 9"
374 | ]
375 | },
376 | "execution_count": 8,
377 | "metadata": {},
378 | "output_type": "execute_result"
379 | }
380 | ],
381 | "source": [
382 | "# locは、2次元の場合、1次元目がindex値、2次元目がcolumns値で指定\n",
383 | "# 指定する値が1つでも、該当件数が2件以上の場合pd.DataFrameで返される\n",
384 | "df.loc[:, 'b']"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": 17,
390 | "metadata": {},
391 | "outputs": [
392 | {
393 | "data": {
394 | "text/plain": [
395 | "2"
396 | ]
397 | },
398 | "execution_count": 17,
399 | "metadata": {},
400 | "output_type": "execute_result"
401 | }
402 | ],
403 | "source": [
404 | "# locは、2次元の場合、1次元目がindex値、2次元目がcolumns値で指定\n",
405 | "# 返り値が1つだと値の型に応じた返り値になる\n",
406 | "df.loc[10, 'a']"
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": 18,
412 | "metadata": {},
413 | "outputs": [
414 | {
415 | "data": {
416 | "text/plain": [
417 | "numpy.int64"
418 | ]
419 | },
420 | "execution_count": 18,
421 | "metadata": {},
422 | "output_type": "execute_result"
423 | }
424 | ],
425 | "source": [
426 | "type(df.loc[10, 'a'])"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": 58,
432 | "metadata": {},
433 | "outputs": [
434 | {
435 | "data": {
436 | "text/plain": [
437 | "b 1\n",
438 | "a 2\n",
439 | "b 3\n",
440 | "Name: 10, dtype: int64"
441 | ]
442 | },
443 | "execution_count": 58,
444 | "metadata": {},
445 | "output_type": "execute_result"
446 | }
447 | ],
448 | "source": [
449 | "# ilocは、1次元の場合、行番号で指定\n",
450 | "df.iloc[0]"
451 | ]
452 | },
453 | {
454 | "cell_type": "code",
455 | "execution_count": 59,
456 | "metadata": {},
457 | "outputs": [
458 | {
459 | "data": {
460 | "text/html": [
461 | "\n",
462 | "\n",
475 | "
\n",
476 | " \n",
477 | " \n",
478 | " | \n",
479 | " b | \n",
480 | " a | \n",
481 | " b | \n",
482 | "
\n",
483 | " \n",
484 | " \n",
485 | " \n",
486 | " | 10 | \n",
487 | " 1 | \n",
488 | " 2 | \n",
489 | " 3 | \n",
490 | "
\n",
491 | " \n",
492 | "
\n",
493 | "
"
494 | ],
495 | "text/plain": [
496 | " b a b\n",
497 | "10 1 2 3"
498 | ]
499 | },
500 | "execution_count": 59,
501 | "metadata": {},
502 | "output_type": "execute_result"
503 | }
504 | ],
505 | "source": [
506 | "df.iloc[[0]]"
507 | ]
508 | },
509 | {
510 | "cell_type": "code",
511 | "execution_count": 60,
512 | "metadata": {},
513 | "outputs": [
514 | {
515 | "data": {
516 | "text/plain": [
517 | "10 1\n",
518 | "11 4\n",
519 | "11 7\n",
520 | "Name: b, dtype: int64"
521 | ]
522 | },
523 | "execution_count": 60,
524 | "metadata": {},
525 | "output_type": "execute_result"
526 | }
527 | ],
528 | "source": [
529 | "# ilocは、2次元の場合、行番号と列番号で指定\n",
530 | "df.iloc[:, 0]"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": 61,
536 | "metadata": {},
537 | "outputs": [
538 | {
539 | "data": {
540 | "text/html": [
541 | "\n",
542 | "\n",
555 | "
\n",
556 | " \n",
557 | " \n",
558 | " | \n",
559 | " b | \n",
560 | "
\n",
561 | " \n",
562 | " \n",
563 | " \n",
564 | " | 10 | \n",
565 | " 1 | \n",
566 | "
\n",
567 | " \n",
568 | " | 11 | \n",
569 | " 4 | \n",
570 | "
\n",
571 | " \n",
572 | " | 11 | \n",
573 | " 7 | \n",
574 | "
\n",
575 | " \n",
576 | "
\n",
577 | "
"
578 | ],
579 | "text/plain": [
580 | " b\n",
581 | "10 1\n",
582 | "11 4\n",
583 | "11 7"
584 | ]
585 | },
586 | "execution_count": 61,
587 | "metadata": {},
588 | "output_type": "execute_result"
589 | }
590 | ],
591 | "source": [
592 | "df.iloc[:, [0]]"
593 | ]
594 | },
595 | {
596 | "cell_type": "code",
597 | "execution_count": 62,
598 | "metadata": {},
599 | "outputs": [
600 | {
601 | "data": {
602 | "text/plain": [
603 | "0"
604 | ]
605 | },
606 | "execution_count": 62,
607 | "metadata": {},
608 | "output_type": "execute_result"
609 | }
610 | ],
611 | "source": [
612 | "# get_loc関数は、pd.Indexクラスが持っている関数\n",
613 | "# Indexの値を検索して該当したインデックス番号を返す関数\n",
614 | "\n",
615 | "df.index.get_loc(10)"
616 | ]
617 | },
618 | {
619 | "cell_type": "code",
620 | "execution_count": 63,
621 | "metadata": {},
622 | "outputs": [
623 | {
624 | "data": {
625 | "text/plain": [
626 | "slice(1, 3, None)"
627 | ]
628 | },
629 | "execution_count": 63,
630 | "metadata": {},
631 | "output_type": "execute_result"
632 | }
633 | ],
634 | "source": [
635 | "# 重複ラベルがあり、Indexが単調(ソート済み)だとsliceになる\n",
636 | "df.index.get_loc(11)"
637 | ]
638 | },
639 | {
640 | "cell_type": "code",
641 | "execution_count": 64,
642 | "metadata": {},
643 | "outputs": [
644 | {
645 | "data": {
646 | "text/plain": [
647 | "1"
648 | ]
649 | },
650 | "execution_count": 64,
651 | "metadata": {},
652 | "output_type": "execute_result"
653 | }
654 | ],
655 | "source": [
656 | "df.columns.get_loc('a')"
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": 65,
662 | "metadata": {},
663 | "outputs": [
664 | {
665 | "data": {
666 | "text/plain": [
667 | "array([ True, False, True])"
668 | ]
669 | },
670 | "execution_count": 65,
671 | "metadata": {},
672 | "output_type": "execute_result"
673 | }
674 | ],
675 | "source": [
676 | "# 重複ラベルがあり、Indexが単調でないとブール値のarray(マスク)になる\n",
677 | "df.columns.get_loc('b')"
678 | ]
679 | },
680 | {
681 | "cell_type": "code",
682 | "execution_count": null,
683 | "metadata": {},
684 | "outputs": [],
685 | "source": []
686 | }
687 | ],
688 | "metadata": {
689 | "kernelspec": {
690 | "display_name": "Python 3",
691 | "language": "python",
692 | "name": "python3"
693 | },
694 | "language_info": {
695 | "codemirror_mode": {
696 | "name": "ipython",
697 | "version": 3
698 | },
699 | "file_extension": ".py",
700 | "mimetype": "text/x-python",
701 | "name": "python",
702 | "nbconvert_exporter": "python",
703 | "pygments_lexer": "ipython3",
704 | "version": "3.7.0"
705 | }
706 | },
707 | "nbformat": 4,
708 | "nbformat_minor": 2
709 | }
710 |
--------------------------------------------------------------------------------
/7-1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 例7-1. k-meansの適用例を実行するコード"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "data": {
17 | "text/plain": [
18 | "(-0.07086790846959401,\n",
19 | " 1.0713297683670353,\n",
20 | " -0.08072533667047069,\n",
21 | " 1.082584430361733)"
22 | ]
23 | },
24 | "execution_count": 1,
25 | "metadata": {},
26 | "output_type": "execute_result"
27 | },
28 | {
29 | "data": {
30 | "image/png": "\n",
31 | "text/plain": [
32 | ""
33 | ]
34 | },
35 | "metadata": {
36 | "needs_background": "light"
37 | },
38 | "output_type": "display_data"
39 | }
40 | ],
41 | "source": [
42 | "%matplotlib inline\n",
43 | "import numpy as np\n",
44 | "from sklearn.cluster import KMeans\n",
45 | "from sklearn.datasets import make_blobs\n",
46 | "import matplotlib.pyplot as plt\n",
47 | "\n",
48 | "n_data = 1000\n",
49 | "seed = 1\n",
50 | "n_centers = 4\n",
51 | "\n",
52 | "# 4つの二変量正規分布に従うデータを生成し、k-meansを実行する\n",
53 | "blobs, blob_labels = make_blobs(n_samples=n_data, n_features=2, centers=n_centers, random_state=seed)\n",
54 | "clusters_blob = KMeans(n_clusters=n_centers, random_state=seed).fit_predict(blobs)\n",
55 | "\n",
56 | "# 2次元の一様分布に従うデータを生成し、k-meansを実行する\n",
57 | "uniform = np.random.rand(n_data, 2)\n",
58 | "clusters_uniform = KMeans(n_clusters=n_centers, random_state=seed).fit_predict(uniform)\n",
59 | "\n",
60 | "# 結果を可視化するためのMatplotlibのおまじない\n",
61 | "figure = plt.figure()\n",
62 | "plt.subplot(221)\n",
63 | "plt.scatter(blobs[:, 0], blobs[:, 1], c=blob_labels, cmap='gist_rainbow')\n",
64 | "plt.title('(a) Four randomly generated blobs', fontsize=14)\n",
65 | "plt.axis('off')\n",
66 | "\n",
67 | "plt.subplot(222)\n",
68 | "plt.scatter(blobs[:, 0], blobs[:, 1], c=clusters_blob, cmap='gist_rainbow')\n",
69 | "plt.title('(b) Clusters found via K-means', fontsize=14)\n",
70 | "plt.axis('off')\n",
71 | "\n",
72 | "plt.subplot(223)\n",
73 | "plt.scatter(uniform[:, 0], uniform[:, 1])\n",
74 | "plt.title('(c) 1000 randomly generated points', fontsize=14)\n",
75 | "plt.axis('off')\n",
76 | "\n",
77 | "plt.subplot(224)\n",
78 | "plt.scatter(uniform[:, 0], uniform[:, 1], c=clusters_uniform, cmap='gist_rainbow')\n",
79 | "plt.title('(d) Clusters found via K-means', fontsize=14)\n",
80 | "plt.axis('off')"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": []
89 | }
90 | ],
91 | "metadata": {
92 | "kernelspec": {
93 | "display_name": "Python 3",
94 | "language": "python",
95 | "name": "python3"
96 | },
97 | "language_info": {
98 | "codemirror_mode": {
99 | "name": "ipython",
100 | "version": 3
101 | },
102 | "file_extension": ".py",
103 | "mimetype": "text/x-python",
104 | "name": "python",
105 | "nbconvert_exporter": "python",
106 | "pygments_lexer": "ipython3",
107 | "version": "3.7.0"
108 | }
109 | },
110 | "nbformat": 4,
111 | "nbformat_minor": 2
112 | }
113 |
--------------------------------------------------------------------------------
/5-6.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 例5-6. ビンカウンティングの例"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 10,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "data": {
17 | "text/plain": [
18 | "(8208, 24)"
19 | ]
20 | },
21 | "execution_count": 10,
22 | "metadata": {},
23 | "output_type": "execute_result"
24 | }
25 | ],
26 | "source": [
27 | "import pandas as pd\n",
28 | "# train_subsetを読み込み(サンプルコードの対象のデータ件数が、8208件です。)\n",
29 | "df = pd.read_csv('data/avazu/train_subset.csv')\n",
30 | "\n",
31 | "df.shape"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 11,
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "data": {
41 | "text/plain": [
42 | "906"
43 | ]
44 | },
45 | "execution_count": 11,
46 | "metadata": {},
47 | "output_type": "execute_result"
48 | }
49 | ],
50 | "source": [
51 | "\n",
52 | "# device_idが何種類あるか計算\n",
53 | "len(df['device_id'].unique())"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 12,
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "data": {
63 | "text/html": [
64 | "\n",
65 | "\n",
78 | "
\n",
79 | " \n",
80 | " \n",
81 | " | \n",
82 | " clicks | \n",
83 | " no_clicks | \n",
84 | " total_clicks | \n",
85 | " N+ | \n",
86 | " N- | \n",
87 | " log_N+ | \n",
88 | "
\n",
89 | " \n",
90 | " \n",
91 | " \n",
92 | " | a99f214a | \n",
93 | " 1279 | \n",
94 | " 5878 | \n",
95 | " 7157 | \n",
96 | " 0.178706 | \n",
97 | " 0.821294 | \n",
98 | " 0.217591 | \n",
99 | "
\n",
100 | " \n",
101 | " | c357dbff | \n",
102 | " 2 | \n",
103 | " 12 | \n",
104 | " 14 | \n",
105 | " 0.142857 | \n",
106 | " 0.857143 | \n",
107 | " 0.166667 | \n",
108 | "
\n",
109 | " \n",
110 | " | 25635c83 | \n",
111 | " 2 | \n",
112 | " 0 | \n",
113 | " 2 | \n",
114 | " 1.000000 | \n",
115 | " 0.000000 | \n",
116 | " inf | \n",
117 | "
\n",
118 | " \n",
119 | " | e62f1261 | \n",
120 | " 2 | \n",
121 | " 1 | \n",
122 | " 3 | \n",
123 | " 0.666667 | \n",
124 | " 0.333333 | \n",
125 | " 2.000000 | \n",
126 | "
\n",
127 | " \n",
128 | " | 135f7d9a | \n",
129 | " 2 | \n",
130 | " 0 | \n",
131 | " 2 | \n",
132 | " 1.000000 | \n",
133 | " 0.000000 | \n",
134 | " inf | \n",
135 | "
\n",
136 | " \n",
137 | " | 9af87478 | \n",
138 | " 2 | \n",
139 | " 0 | \n",
140 | " 2 | \n",
141 | " 1.000000 | \n",
142 | " 0.000000 | \n",
143 | " inf | \n",
144 | "
\n",
145 | " \n",
146 | " | 77cf1a27 | \n",
147 | " 1 | \n",
148 | " 0 | \n",
149 | " 1 | \n",
150 | " 1.000000 | \n",
151 | " 0.000000 | \n",
152 | " inf | \n",
153 | "
\n",
154 | " \n",
155 | " | d62216cc | \n",
156 | " 1 | \n",
157 | " 0 | \n",
158 | " 1 | \n",
159 | " 1.000000 | \n",
160 | " 0.000000 | \n",
161 | " inf | \n",
162 | "
\n",
163 | " \n",
164 | " | fcc5c7c0 | \n",
165 | " 1 | \n",
166 | " 0 | \n",
167 | " 1 | \n",
168 | " 1.000000 | \n",
169 | " 0.000000 | \n",
170 | " inf | \n",
171 | "
\n",
172 | " \n",
173 | " | 7181509e | \n",
174 | " 1 | \n",
175 | " 0 | \n",
176 | " 1 | \n",
177 | " 1.000000 | \n",
178 | " 0.000000 | \n",
179 | " inf | \n",
180 | "
\n",
181 | " \n",
182 | " | 2a32a3ca | \n",
183 | " 1 | \n",
184 | " 0 | \n",
185 | " 1 | \n",
186 | " 1.000000 | \n",
187 | " 0.000000 | \n",
188 | " inf | \n",
189 | "
\n",
190 | " \n",
191 | " | 2ad16ba3 | \n",
192 | " 1 | \n",
193 | " 0 | \n",
194 | " 1 | \n",
195 | " 1.000000 | \n",
196 | " 0.000000 | \n",
197 | " inf | \n",
198 | "
\n",
199 | " \n",
200 | " | 938f494b | \n",
201 | " 1 | \n",
202 | " 0 | \n",
203 | " 1 | \n",
204 | " 1.000000 | \n",
205 | " 0.000000 | \n",
206 | " inf | \n",
207 | "
\n",
208 | " \n",
209 | " | 24dbae83 | \n",
210 | " 1 | \n",
211 | " 0 | \n",
212 | " 1 | \n",
213 | " 1.000000 | \n",
214 | " 0.000000 | \n",
215 | " inf | \n",
216 | "
\n",
217 | " \n",
218 | " | ca9b95aa | \n",
219 | " 1 | \n",
220 | " 0 | \n",
221 | " 1 | \n",
222 | " 1.000000 | \n",
223 | " 0.000000 | \n",
224 | " inf | \n",
225 | "
\n",
226 | " \n",
227 | " | 7c369899 | \n",
228 | " 1 | \n",
229 | " 0 | \n",
230 | " 1 | \n",
231 | " 1.000000 | \n",
232 | " 0.000000 | \n",
233 | " inf | \n",
234 | "
\n",
235 | " \n",
236 | " | 3bf8c26c | \n",
237 | " 1 | \n",
238 | " 0 | \n",
239 | " 1 | \n",
240 | " 1.000000 | \n",
241 | " 0.000000 | \n",
242 | " inf | \n",
243 | "
\n",
244 | " \n",
245 | " | 45378128 | \n",
246 | " 1 | \n",
247 | " 0 | \n",
248 | " 1 | \n",
249 | " 1.000000 | \n",
250 | " 0.000000 | \n",
251 | " inf | \n",
252 | "
\n",
253 | " \n",
254 | " | 023ca1f9 | \n",
255 | " 1 | \n",
256 | " 0 | \n",
257 | " 1 | \n",
258 | " 1.000000 | \n",
259 | " 0.000000 | \n",
260 | " inf | \n",
261 | "
\n",
262 | " \n",
263 | " | 3b9ab74d | \n",
264 | " 1 | \n",
265 | " 0 | \n",
266 | " 1 | \n",
267 | " 1.000000 | \n",
268 | " 0.000000 | \n",
269 | " inf | \n",
270 | "
\n",
271 | " \n",
272 | " | 9eb9a972 | \n",
273 | " 1 | \n",
274 | " 0 | \n",
275 | " 1 | \n",
276 | " 1.000000 | \n",
277 | " 0.000000 | \n",
278 | " inf | \n",
279 | "
\n",
280 | " \n",
281 | " | 59bcd1ae | \n",
282 | " 1 | \n",
283 | " 0 | \n",
284 | " 1 | \n",
285 | " 1.000000 | \n",
286 | " 0.000000 | \n",
287 | " inf | \n",
288 | "
\n",
289 | " \n",
290 | " | 7f8c00b4 | \n",
291 | " 1 | \n",
292 | " 0 | \n",
293 | " 1 | \n",
294 | " 1.000000 | \n",
295 | " 0.000000 | \n",
296 | " inf | \n",
297 | "
\n",
298 | " \n",
299 | " | 8d61d7eb | \n",
300 | " 1 | \n",
301 | " 0 | \n",
302 | " 1 | \n",
303 | " 1.000000 | \n",
304 | " 0.000000 | \n",
305 | " inf | \n",
306 | "
\n",
307 | " \n",
308 | " | b441c41f | \n",
309 | " 1 | \n",
310 | " 0 | \n",
311 | " 1 | \n",
312 | " 1.000000 | \n",
313 | " 0.000000 | \n",
314 | " inf | \n",
315 | "
\n",
316 | " \n",
317 | " | e317838f | \n",
318 | " 1 | \n",
319 | " 0 | \n",
320 | " 1 | \n",
321 | " 1.000000 | \n",
322 | " 0.000000 | \n",
323 | " inf | \n",
324 | "
\n",
325 | " \n",
326 | " | 2e7d4b65 | \n",
327 | " 1 | \n",
328 | " 0 | \n",
329 | " 1 | \n",
330 | " 1.000000 | \n",
331 | " 0.000000 | \n",
332 | " inf | \n",
333 | "
\n",
334 | " \n",
335 | " | 70d97ece | \n",
336 | " 1 | \n",
337 | " 0 | \n",
338 | " 1 | \n",
339 | " 1.000000 | \n",
340 | " 0.000000 | \n",
341 | " inf | \n",
342 | "
\n",
343 | " \n",
344 | " | 9809e6c9 | \n",
345 | " 1 | \n",
346 | " 0 | \n",
347 | " 1 | \n",
348 | " 1.000000 | \n",
349 | " 0.000000 | \n",
350 | " inf | \n",
351 | "
\n",
352 | " \n",
353 | " | cb73ba55 | \n",
354 | " 1 | \n",
355 | " 0 | \n",
356 | " 1 | \n",
357 | " 1.000000 | \n",
358 | " 0.000000 | \n",
359 | " inf | \n",
360 | "
\n",
361 | " \n",
362 | " | ... | \n",
363 | " ... | \n",
364 | " ... | \n",
365 | " ... | \n",
366 | " ... | \n",
367 | " ... | \n",
368 | " ... | \n",
369 | "
\n",
370 | " \n",
371 | " | 0c3bbac0 | \n",
372 | " 0 | \n",
373 | " 1 | \n",
374 | " 1 | \n",
375 | " 0.000000 | \n",
376 | " 1.000000 | \n",
377 | " 0.000000 | \n",
378 | "
\n",
379 | " \n",
380 | " | 34c9f908 | \n",
381 | " 0 | \n",
382 | " 1 | \n",
383 | " 1 | \n",
384 | " 0.000000 | \n",
385 | " 1.000000 | \n",
386 | " 0.000000 | \n",
387 | "
\n",
388 | " \n",
389 | " | 41a1ae5f | \n",
390 | " 0 | \n",
391 | " 1 | \n",
392 | " 1 | \n",
393 | " 0.000000 | \n",
394 | " 1.000000 | \n",
395 | " 0.000000 | \n",
396 | "
\n",
397 | " \n",
398 | " | 5d03585e | \n",
399 | " 0 | \n",
400 | " 1 | \n",
401 | " 1 | \n",
402 | " 0.000000 | \n",
403 | " 1.000000 | \n",
404 | " 0.000000 | \n",
405 | "
\n",
406 | " \n",
407 | " | 7d242dfd | \n",
408 | " 0 | \n",
409 | " 1 | \n",
410 | " 1 | \n",
411 | " 0.000000 | \n",
412 | " 1.000000 | \n",
413 | " 0.000000 | \n",
414 | "
\n",
415 | " \n",
416 | " | 9ffa0563 | \n",
417 | " 0 | \n",
418 | " 1 | \n",
419 | " 1 | \n",
420 | " 0.000000 | \n",
421 | " 1.000000 | \n",
422 | " 0.000000 | \n",
423 | "
\n",
424 | " \n",
425 | " | 2fd58990 | \n",
426 | " 0 | \n",
427 | " 1 | \n",
428 | " 1 | \n",
429 | " 0.000000 | \n",
430 | " 1.000000 | \n",
431 | " 0.000000 | \n",
432 | "
\n",
433 | " \n",
434 | " | 01e47a3d | \n",
435 | " 0 | \n",
436 | " 1 | \n",
437 | " 1 | \n",
438 | " 0.000000 | \n",
439 | " 1.000000 | \n",
440 | " 0.000000 | \n",
441 | "
\n",
442 | " \n",
443 | " | bcd5195e | \n",
444 | " 0 | \n",
445 | " 1 | \n",
446 | " 1 | \n",
447 | " 0.000000 | \n",
448 | " 1.000000 | \n",
449 | " 0.000000 | \n",
450 | "
\n",
451 | " \n",
452 | " | a318236b | \n",
453 | " 0 | \n",
454 | " 1 | \n",
455 | " 1 | \n",
456 | " 0.000000 | \n",
457 | " 1.000000 | \n",
458 | " 0.000000 | \n",
459 | "
\n",
460 | " \n",
461 | " | 1168ce02 | \n",
462 | " 0 | \n",
463 | " 1 | \n",
464 | " 1 | \n",
465 | " 0.000000 | \n",
466 | " 1.000000 | \n",
467 | " 0.000000 | \n",
468 | "
\n",
469 | " \n",
470 | " | 83c34e93 | \n",
471 | " 0 | \n",
472 | " 1 | \n",
473 | " 1 | \n",
474 | " 0.000000 | \n",
475 | " 1.000000 | \n",
476 | " 0.000000 | \n",
477 | "
\n",
478 | " \n",
479 | " | c8c31032 | \n",
480 | " 0 | \n",
481 | " 1 | \n",
482 | " 1 | \n",
483 | " 0.000000 | \n",
484 | " 1.000000 | \n",
485 | " 0.000000 | \n",
486 | "
\n",
487 | " \n",
488 | " | 004270bf | \n",
489 | " 0 | \n",
490 | " 1 | \n",
491 | " 1 | \n",
492 | " 0.000000 | \n",
493 | " 1.000000 | \n",
494 | " 0.000000 | \n",
495 | "
\n",
496 | " \n",
497 | " | e6d0facc | \n",
498 | " 0 | \n",
499 | " 1 | \n",
500 | " 1 | \n",
501 | " 0.000000 | \n",
502 | " 1.000000 | \n",
503 | " 0.000000 | \n",
504 | "
\n",
505 | " \n",
506 | " | 844524dc | \n",
507 | " 0 | \n",
508 | " 1 | \n",
509 | " 1 | \n",
510 | " 0.000000 | \n",
511 | " 1.000000 | \n",
512 | " 0.000000 | \n",
513 | "
\n",
514 | " \n",
515 | " | c90a30a1 | \n",
516 | " 0 | \n",
517 | " 1 | \n",
518 | " 1 | \n",
519 | " 0.000000 | \n",
520 | " 1.000000 | \n",
521 | " 0.000000 | \n",
522 | "
\n",
523 | " \n",
524 | " | 5015495b | \n",
525 | " 0 | \n",
526 | " 1 | \n",
527 | " 1 | \n",
528 | " 0.000000 | \n",
529 | " 1.000000 | \n",
530 | " 0.000000 | \n",
531 | "
\n",
532 | " \n",
533 | " | a12e8a45 | \n",
534 | " 0 | \n",
535 | " 1 | \n",
536 | " 1 | \n",
537 | " 0.000000 | \n",
538 | " 1.000000 | \n",
539 | " 0.000000 | \n",
540 | "
\n",
541 | " \n",
542 | " | 7b5c2c3b | \n",
543 | " 0 | \n",
544 | " 1 | \n",
545 | " 1 | \n",
546 | " 0.000000 | \n",
547 | " 1.000000 | \n",
548 | " 0.000000 | \n",
549 | "
\n",
550 | " \n",
551 | " | 4a6b5af3 | \n",
552 | " 0 | \n",
553 | " 1 | \n",
554 | " 1 | \n",
555 | " 0.000000 | \n",
556 | " 1.000000 | \n",
557 | " 0.000000 | \n",
558 | "
\n",
559 | " \n",
560 | " | 2c8ae68c | \n",
561 | " 0 | \n",
562 | " 1 | \n",
563 | " 1 | \n",
564 | " 0.000000 | \n",
565 | " 1.000000 | \n",
566 | " 0.000000 | \n",
567 | "
\n",
568 | " \n",
569 | " | 84565c92 | \n",
570 | " 0 | \n",
571 | " 1 | \n",
572 | " 1 | \n",
573 | " 0.000000 | \n",
574 | " 1.000000 | \n",
575 | " 0.000000 | \n",
576 | "
\n",
577 | " \n",
578 | " | 2a6fe2a5 | \n",
579 | " 0 | \n",
580 | " 1 | \n",
581 | " 1 | \n",
582 | " 0.000000 | \n",
583 | " 1.000000 | \n",
584 | " 0.000000 | \n",
585 | "
\n",
586 | " \n",
587 | " | e0f70006 | \n",
588 | " 0 | \n",
589 | " 1 | \n",
590 | " 1 | \n",
591 | " 0.000000 | \n",
592 | " 1.000000 | \n",
593 | " 0.000000 | \n",
594 | "
\n",
595 | " \n",
596 | " | cef4c8cc | \n",
597 | " 0 | \n",
598 | " 1 | \n",
599 | " 1 | \n",
600 | " 0.000000 | \n",
601 | " 1.000000 | \n",
602 | " 0.000000 | \n",
603 | "
\n",
604 | " \n",
605 | " | 7f4b1f1e | \n",
606 | " 0 | \n",
607 | " 1 | \n",
608 | " 1 | \n",
609 | " 0.000000 | \n",
610 | " 1.000000 | \n",
611 | " 0.000000 | \n",
612 | "
\n",
613 | " \n",
614 | " | 7efe14f0 | \n",
615 | " 0 | \n",
616 | " 1 | \n",
617 | " 1 | \n",
618 | " 0.000000 | \n",
619 | " 1.000000 | \n",
620 | " 0.000000 | \n",
621 | "
\n",
622 | " \n",
623 | " | 02b99e77 | \n",
624 | " 0 | \n",
625 | " 1 | \n",
626 | " 1 | \n",
627 | " 0.000000 | \n",
628 | " 1.000000 | \n",
629 | " 0.000000 | \n",
630 | "
\n",
631 | " \n",
632 | " | cbb50c1c | \n",
633 | " 0 | \n",
634 | " 1 | \n",
635 | " 1 | \n",
636 | " 0.000000 | \n",
637 | " 1.000000 | \n",
638 | " 0.000000 | \n",
639 | "
\n",
640 | " \n",
641 | "
\n",
642 | "
906 rows × 6 columns
\n",
643 | "
"
644 | ],
645 | "text/plain": [
646 | " clicks no_clicks total_clicks N+ N- log_N+\n",
647 | "a99f214a 1279 5878 7157 0.178706 0.821294 0.217591\n",
648 | "c357dbff 2 12 14 0.142857 0.857143 0.166667\n",
649 | "25635c83 2 0 2 1.000000 0.000000 inf\n",
650 | "e62f1261 2 1 3 0.666667 0.333333 2.000000\n",
651 | "135f7d9a 2 0 2 1.000000 0.000000 inf\n",
652 | "9af87478 2 0 2 1.000000 0.000000 inf\n",
653 | "77cf1a27 1 0 1 1.000000 0.000000 inf\n",
654 | "d62216cc 1 0 1 1.000000 0.000000 inf\n",
655 | "fcc5c7c0 1 0 1 1.000000 0.000000 inf\n",
656 | "7181509e 1 0 1 1.000000 0.000000 inf\n",
657 | "2a32a3ca 1 0 1 1.000000 0.000000 inf\n",
658 | "2ad16ba3 1 0 1 1.000000 0.000000 inf\n",
659 | "938f494b 1 0 1 1.000000 0.000000 inf\n",
660 | "24dbae83 1 0 1 1.000000 0.000000 inf\n",
661 | "ca9b95aa 1 0 1 1.000000 0.000000 inf\n",
662 | "7c369899 1 0 1 1.000000 0.000000 inf\n",
663 | "3bf8c26c 1 0 1 1.000000 0.000000 inf\n",
664 | "45378128 1 0 1 1.000000 0.000000 inf\n",
665 | "023ca1f9 1 0 1 1.000000 0.000000 inf\n",
666 | "3b9ab74d 1 0 1 1.000000 0.000000 inf\n",
667 | "9eb9a972 1 0 1 1.000000 0.000000 inf\n",
668 | "59bcd1ae 1 0 1 1.000000 0.000000 inf\n",
669 | "7f8c00b4 1 0 1 1.000000 0.000000 inf\n",
670 | "8d61d7eb 1 0 1 1.000000 0.000000 inf\n",
671 | "b441c41f 1 0 1 1.000000 0.000000 inf\n",
672 | "e317838f 1 0 1 1.000000 0.000000 inf\n",
673 | "2e7d4b65 1 0 1 1.000000 0.000000 inf\n",
674 | "70d97ece 1 0 1 1.000000 0.000000 inf\n",
675 | "9809e6c9 1 0 1 1.000000 0.000000 inf\n",
676 | "cb73ba55 1 0 1 1.000000 0.000000 inf\n",
677 | "... ... ... ... ... ... ...\n",
678 | "0c3bbac0 0 1 1 0.000000 1.000000 0.000000\n",
679 | "34c9f908 0 1 1 0.000000 1.000000 0.000000\n",
680 | "41a1ae5f 0 1 1 0.000000 1.000000 0.000000\n",
681 | "5d03585e 0 1 1 0.000000 1.000000 0.000000\n",
682 | "7d242dfd 0 1 1 0.000000 1.000000 0.000000\n",
683 | "9ffa0563 0 1 1 0.000000 1.000000 0.000000\n",
684 | "2fd58990 0 1 1 0.000000 1.000000 0.000000\n",
685 | "01e47a3d 0 1 1 0.000000 1.000000 0.000000\n",
686 | "bcd5195e 0 1 1 0.000000 1.000000 0.000000\n",
687 | "a318236b 0 1 1 0.000000 1.000000 0.000000\n",
688 | "1168ce02 0 1 1 0.000000 1.000000 0.000000\n",
689 | "83c34e93 0 1 1 0.000000 1.000000 0.000000\n",
690 | "c8c31032 0 1 1 0.000000 1.000000 0.000000\n",
691 | "004270bf 0 1 1 0.000000 1.000000 0.000000\n",
692 | "e6d0facc 0 1 1 0.000000 1.000000 0.000000\n",
693 | "844524dc 0 1 1 0.000000 1.000000 0.000000\n",
694 | "c90a30a1 0 1 1 0.000000 1.000000 0.000000\n",
695 | "5015495b 0 1 1 0.000000 1.000000 0.000000\n",
696 | "a12e8a45 0 1 1 0.000000 1.000000 0.000000\n",
697 | "7b5c2c3b 0 1 1 0.000000 1.000000 0.000000\n",
698 | "4a6b5af3 0 1 1 0.000000 1.000000 0.000000\n",
699 | "2c8ae68c 0 1 1 0.000000 1.000000 0.000000\n",
700 | "84565c92 0 1 1 0.000000 1.000000 0.000000\n",
701 | "2a6fe2a5 0 1 1 0.000000 1.000000 0.000000\n",
702 | "e0f70006 0 1 1 0.000000 1.000000 0.000000\n",
703 | "cef4c8cc 0 1 1 0.000000 1.000000 0.000000\n",
704 | "7f4b1f1e 0 1 1 0.000000 1.000000 0.000000\n",
705 | "7efe14f0 0 1 1 0.000000 1.000000 0.000000\n",
706 | "02b99e77 0 1 1 0.000000 1.000000 0.000000\n",
707 | "cbb50c1c 0 1 1 0.000000 1.000000 0.000000\n",
708 | "\n",
709 | "[906 rows x 6 columns]"
710 | ]
711 | },
712 | "execution_count": 12,
713 | "metadata": {},
714 | "output_type": "execute_result"
715 | }
716 | ],
717 | "source": [
718 | "def click_counting(x, bin_column):\n",
719 | " clicks = pd.Series(x[x['click'] > 0][bin_column].value_counts(), name='clicks')\n",
720 | " no_clicks = pd.Series(x[x['click'] < 1][bin_column].value_counts(), name='no_clicks')\n",
721 | "\n",
722 | " counts = pd.DataFrame([clicks,no_clicks]).T.fillna('0')\n",
723 | " counts['total_clicks'] = counts['clicks'].astype('int64') + counts['no_clicks'].astype('int64')\n",
724 | " return counts\n",
725 | "\n",
726 | "def bin_counting(counts):\n",
727 | "    \"\"\"Derive bin-counting statistics from per-category click counts.\n",
728 | "\n",
729 | "    Args:\n",
730 | "        counts: DataFrame with ``clicks``, ``no_clicks`` and ``total_clicks``\n",
731 | "            columns (values castable to int64), as built by click_counting.\n",
732 | "            It is not modified.\n",
733 | "\n",
734 | "    Returns:\n",
735 | "        Tuple ``(counts, bin_counts)``: a copy of the input extended with the\n",
736 | "        columns ``N+`` (clicks / total_clicks), ``N-`` (no_clicks /\n",
737 | "        total_clicks) and ``log_N+``, and a frame holding only those three\n",
738 | "        columns.\n",
739 | "    \"\"\"\n",
740 | "    # Work on a copy so the caller's frame is not silently given new columns.\n",
741 | "    counts = counts.copy()\n",
742 | "    total = counts['total_clicks'].astype('int64')\n",
743 | "    counts['N+'] = counts['clicks'].astype('int64').divide(total)\n",
744 | "    counts['N-'] = counts['no_clicks'].astype('int64').divide(total)\n",
745 | "    # NOTE: despite its name, 'log_N+' stores the plain ratio N+ / N-; no\n",
746 | "    # logarithm is applied, and the value is inf where N- is 0 and N+ is not.\n",
747 | "    counts['log_N+'] = counts['N+'].divide(counts['N-'])\n",
748 | "    # Keep only the derived statistics for callers that just need the features.\n",
749 | "    bin_counts = counts.filter(items=['N+', 'N-', 'log_N+'])\n",
750 | "    return counts, bin_counts\n",
733 | "\n",
734 | "# Bin counting keyed on the device_id column\n",
735 | "bin_column = 'device_id'\n",
736 | "device_clicks = click_counting(df.filter(items=[bin_column, 'click']), bin_column)\n",
737 | "device_all, device_bin_counts = bin_counting(device_clicks)\n",
738 | "\n",
739 | "device_all"
740 | ]
741 | },
742 | {
743 | "cell_type": "code",
744 | "execution_count": 13,
745 | "metadata": {},
746 | "outputs": [
747 | {
748 | "data": {
749 | "text/html": [
750 | "\n",
751 | "\n",
764 | "
\n",
765 | " \n",
766 | " \n",
767 | " | \n",
768 | " N+ | \n",
769 | " N- | \n",
770 | " log_N+ | \n",
771 | "
\n",
772 | " \n",
773 | " \n",
774 | " \n",
775 | " | a99f214a | \n",
776 | " 0.178706 | \n",
777 | " 0.821294 | \n",
778 | " 0.217591 | \n",
779 | "
\n",
780 | " \n",
781 | " | c357dbff | \n",
782 | " 0.142857 | \n",
783 | " 0.857143 | \n",
784 | " 0.166667 | \n",
785 | "
\n",
786 | " \n",
787 | " | 25635c83 | \n",
788 | " 1.000000 | \n",
789 | " 0.000000 | \n",
790 | " inf | \n",
791 | "
\n",
792 | " \n",
793 | " | e62f1261 | \n",
794 | " 0.666667 | \n",
795 | " 0.333333 | \n",
796 | " 2.000000 | \n",
797 | "
\n",
798 | " \n",
799 | " | 135f7d9a | \n",
800 | " 1.000000 | \n",
801 | " 0.000000 | \n",
802 | " inf | \n",
803 | "
\n",
804 | " \n",
805 | " | 9af87478 | \n",
806 | " 1.000000 | \n",
807 | " 0.000000 | \n",
808 | " inf | \n",
809 | "
\n",
810 | " \n",
811 | " | 77cf1a27 | \n",
812 | " 1.000000 | \n",
813 | " 0.000000 | \n",
814 | " inf | \n",
815 | "
\n",
816 | " \n",
817 | " | d62216cc | \n",
818 | " 1.000000 | \n",
819 | " 0.000000 | \n",
820 | " inf | \n",
821 | "
\n",
822 | " \n",
823 | " | fcc5c7c0 | \n",
824 | " 1.000000 | \n",
825 | " 0.000000 | \n",
826 | " inf | \n",
827 | "
\n",
828 | " \n",
829 | " | 7181509e | \n",
830 | " 1.000000 | \n",
831 | " 0.000000 | \n",
832 | " inf | \n",
833 | "
\n",
834 | " \n",
835 | " | 2a32a3ca | \n",
836 | " 1.000000 | \n",
837 | " 0.000000 | \n",
838 | " inf | \n",
839 | "
\n",
840 | " \n",
841 | " | 2ad16ba3 | \n",
842 | " 1.000000 | \n",
843 | " 0.000000 | \n",
844 | " inf | \n",
845 | "
\n",
846 | " \n",
847 | " | 938f494b | \n",
848 | " 1.000000 | \n",
849 | " 0.000000 | \n",
850 | " inf | \n",
851 | "
\n",
852 | " \n",
853 | " | 24dbae83 | \n",
854 | " 1.000000 | \n",
855 | " 0.000000 | \n",
856 | " inf | \n",
857 | "
\n",
858 | " \n",
859 | " | ca9b95aa | \n",
860 | " 1.000000 | \n",
861 | " 0.000000 | \n",
862 | " inf | \n",
863 | "
\n",
864 | " \n",
865 | " | 7c369899 | \n",
866 | " 1.000000 | \n",
867 | " 0.000000 | \n",
868 | " inf | \n",
869 | "
\n",
870 | " \n",
871 | " | 3bf8c26c | \n",
872 | " 1.000000 | \n",
873 | " 0.000000 | \n",
874 | " inf | \n",
875 | "
\n",
876 | " \n",
877 | " | 45378128 | \n",
878 | " 1.000000 | \n",
879 | " 0.000000 | \n",
880 | " inf | \n",
881 | "
\n",
882 | " \n",
883 | " | 023ca1f9 | \n",
884 | " 1.000000 | \n",
885 | " 0.000000 | \n",
886 | " inf | \n",
887 | "
\n",
888 | " \n",
889 | " | 3b9ab74d | \n",
890 | " 1.000000 | \n",
891 | " 0.000000 | \n",
892 | " inf | \n",
893 | "
\n",
894 | " \n",
895 | " | 9eb9a972 | \n",
896 | " 1.000000 | \n",
897 | " 0.000000 | \n",
898 | " inf | \n",
899 | "
\n",
900 | " \n",
901 | " | 59bcd1ae | \n",
902 | " 1.000000 | \n",
903 | " 0.000000 | \n",
904 | " inf | \n",
905 | "
\n",
906 | " \n",
907 | " | 7f8c00b4 | \n",
908 | " 1.000000 | \n",
909 | " 0.000000 | \n",
910 | " inf | \n",
911 | "
\n",
912 | " \n",
913 | " | 8d61d7eb | \n",
914 | " 1.000000 | \n",
915 | " 0.000000 | \n",
916 | " inf | \n",
917 | "
\n",
918 | " \n",
919 | " | b441c41f | \n",
920 | " 1.000000 | \n",
921 | " 0.000000 | \n",
922 | " inf | \n",
923 | "
\n",
924 | " \n",
925 | " | e317838f | \n",
926 | " 1.000000 | \n",
927 | " 0.000000 | \n",
928 | " inf | \n",
929 | "
\n",
930 | " \n",
931 | " | 2e7d4b65 | \n",
932 | " 1.000000 | \n",
933 | " 0.000000 | \n",
934 | " inf | \n",
935 | "
\n",
936 | " \n",
937 | " | 70d97ece | \n",
938 | " 1.000000 | \n",
939 | " 0.000000 | \n",
940 | " inf | \n",
941 | "
\n",
942 | " \n",
943 | " | 9809e6c9 | \n",
944 | " 1.000000 | \n",
945 | " 0.000000 | \n",
946 | " inf | \n",
947 | "
\n",
948 | " \n",
949 | " | cb73ba55 | \n",
950 | " 1.000000 | \n",
951 | " 0.000000 | \n",
952 | " inf | \n",
953 | "
\n",
954 | " \n",
955 | " | ... | \n",
956 | " ... | \n",
957 | " ... | \n",
958 | " ... | \n",
959 | "
\n",
960 | " \n",
961 | " | 0c3bbac0 | \n",
962 | " 0.000000 | \n",
963 | " 1.000000 | \n",
964 | " 0.000000 | \n",
965 | "
\n",
966 | " \n",
967 | " | 34c9f908 | \n",
968 | " 0.000000 | \n",
969 | " 1.000000 | \n",
970 | " 0.000000 | \n",
971 | "
\n",
972 | " \n",
973 | " | 41a1ae5f | \n",
974 | " 0.000000 | \n",
975 | " 1.000000 | \n",
976 | " 0.000000 | \n",
977 | "
\n",
978 | " \n",
979 | " | 5d03585e | \n",
980 | " 0.000000 | \n",
981 | " 1.000000 | \n",
982 | " 0.000000 | \n",
983 | "
\n",
984 | " \n",
985 | " | 7d242dfd | \n",
986 | " 0.000000 | \n",
987 | " 1.000000 | \n",
988 | " 0.000000 | \n",
989 | "
\n",
990 | " \n",
991 | " | 9ffa0563 | \n",
992 | " 0.000000 | \n",
993 | " 1.000000 | \n",
994 | " 0.000000 | \n",
995 | "
\n",
996 | " \n",
997 | " | 2fd58990 | \n",
998 | " 0.000000 | \n",
999 | " 1.000000 | \n",
1000 | " 0.000000 | \n",
1001 | "
\n",
1002 | " \n",
1003 | " | 01e47a3d | \n",
1004 | " 0.000000 | \n",
1005 | " 1.000000 | \n",
1006 | " 0.000000 | \n",
1007 | "
\n",
1008 | " \n",
1009 | " | bcd5195e | \n",
1010 | " 0.000000 | \n",
1011 | " 1.000000 | \n",
1012 | " 0.000000 | \n",
1013 | "
\n",
1014 | " \n",
1015 | " | a318236b | \n",
1016 | " 0.000000 | \n",
1017 | " 1.000000 | \n",
1018 | " 0.000000 | \n",
1019 | "
\n",
1020 | " \n",
1021 | " | 1168ce02 | \n",
1022 | " 0.000000 | \n",
1023 | " 1.000000 | \n",
1024 | " 0.000000 | \n",
1025 | "
\n",
1026 | " \n",
1027 | " | 83c34e93 | \n",
1028 | " 0.000000 | \n",
1029 | " 1.000000 | \n",
1030 | " 0.000000 | \n",
1031 | "
\n",
1032 | " \n",
1033 | " | c8c31032 | \n",
1034 | " 0.000000 | \n",
1035 | " 1.000000 | \n",
1036 | " 0.000000 | \n",
1037 | "
\n",
1038 | " \n",
1039 | " | 004270bf | \n",
1040 | " 0.000000 | \n",
1041 | " 1.000000 | \n",
1042 | " 0.000000 | \n",
1043 | "
\n",
1044 | " \n",
1045 | " | e6d0facc | \n",
1046 | " 0.000000 | \n",
1047 | " 1.000000 | \n",
1048 | " 0.000000 | \n",
1049 | "
\n",
1050 | " \n",
1051 | " | 844524dc | \n",
1052 | " 0.000000 | \n",
1053 | " 1.000000 | \n",
1054 | " 0.000000 | \n",
1055 | "
\n",
1056 | " \n",
1057 | " | c90a30a1 | \n",
1058 | " 0.000000 | \n",
1059 | " 1.000000 | \n",
1060 | " 0.000000 | \n",
1061 | "
\n",
1062 | " \n",
1063 | " | 5015495b | \n",
1064 | " 0.000000 | \n",
1065 | " 1.000000 | \n",
1066 | " 0.000000 | \n",
1067 | "
\n",
1068 | " \n",
1069 | " | a12e8a45 | \n",
1070 | " 0.000000 | \n",
1071 | " 1.000000 | \n",
1072 | " 0.000000 | \n",
1073 | "
\n",
1074 | " \n",
1075 | " | 7b5c2c3b | \n",
1076 | " 0.000000 | \n",
1077 | " 1.000000 | \n",
1078 | " 0.000000 | \n",
1079 | "
\n",
1080 | " \n",
1081 | " | 4a6b5af3 | \n",
1082 | " 0.000000 | \n",
1083 | " 1.000000 | \n",
1084 | " 0.000000 | \n",
1085 | "
\n",
1086 | " \n",
1087 | " | 2c8ae68c | \n",
1088 | " 0.000000 | \n",
1089 | " 1.000000 | \n",
1090 | " 0.000000 | \n",
1091 | "
\n",
1092 | " \n",
1093 | " | 84565c92 | \n",
1094 | " 0.000000 | \n",
1095 | " 1.000000 | \n",
1096 | " 0.000000 | \n",
1097 | "
\n",
1098 | " \n",
1099 | " | 2a6fe2a5 | \n",
1100 | " 0.000000 | \n",
1101 | " 1.000000 | \n",
1102 | " 0.000000 | \n",
1103 | "
\n",
1104 | " \n",
1105 | " | e0f70006 | \n",
1106 | " 0.000000 | \n",
1107 | " 1.000000 | \n",
1108 | " 0.000000 | \n",
1109 | "
\n",
1110 | " \n",
1111 | " | cef4c8cc | \n",
1112 | " 0.000000 | \n",
1113 | " 1.000000 | \n",
1114 | " 0.000000 | \n",
1115 | "
\n",
1116 | " \n",
1117 | " | 7f4b1f1e | \n",
1118 | " 0.000000 | \n",
1119 | " 1.000000 | \n",
1120 | " 0.000000 | \n",
1121 | "
\n",
1122 | " \n",
1123 | " | 7efe14f0 | \n",
1124 | " 0.000000 | \n",
1125 | " 1.000000 | \n",
1126 | " 0.000000 | \n",
1127 | "
\n",
1128 | " \n",
1129 | " | 02b99e77 | \n",
1130 | " 0.000000 | \n",
1131 | " 1.000000 | \n",
1132 | " 0.000000 | \n",
1133 | "
\n",
1134 | " \n",
1135 | " | cbb50c1c | \n",
1136 | " 0.000000 | \n",
1137 | " 1.000000 | \n",
1138 | " 0.000000 | \n",
1139 | "
\n",
1140 | " \n",
1141 | "
\n",
1142 | "
906 rows × 3 columns
\n",
1143 | "
"
1144 | ],
1145 | "text/plain": [
1146 | " N+ N- log_N+\n",
1147 | "a99f214a 0.178706 0.821294 0.217591\n",
1148 | "c357dbff 0.142857 0.857143 0.166667\n",
1149 | "25635c83 1.000000 0.000000 inf\n",
1150 | "e62f1261 0.666667 0.333333 2.000000\n",
1151 | "135f7d9a 1.000000 0.000000 inf\n",
1152 | "9af87478 1.000000 0.000000 inf\n",
1153 | "77cf1a27 1.000000 0.000000 inf\n",
1154 | "d62216cc 1.000000 0.000000 inf\n",
1155 | "fcc5c7c0 1.000000 0.000000 inf\n",
1156 | "7181509e 1.000000 0.000000 inf\n",
1157 | "2a32a3ca 1.000000 0.000000 inf\n",
1158 | "2ad16ba3 1.000000 0.000000 inf\n",
1159 | "938f494b 1.000000 0.000000 inf\n",
1160 | "24dbae83 1.000000 0.000000 inf\n",
1161 | "ca9b95aa 1.000000 0.000000 inf\n",
1162 | "7c369899 1.000000 0.000000 inf\n",
1163 | "3bf8c26c 1.000000 0.000000 inf\n",
1164 | "45378128 1.000000 0.000000 inf\n",
1165 | "023ca1f9 1.000000 0.000000 inf\n",
1166 | "3b9ab74d 1.000000 0.000000 inf\n",
1167 | "9eb9a972 1.000000 0.000000 inf\n",
1168 | "59bcd1ae 1.000000 0.000000 inf\n",
1169 | "7f8c00b4 1.000000 0.000000 inf\n",
1170 | "8d61d7eb 1.000000 0.000000 inf\n",
1171 | "b441c41f 1.000000 0.000000 inf\n",
1172 | "e317838f 1.000000 0.000000 inf\n",
1173 | "2e7d4b65 1.000000 0.000000 inf\n",
1174 | "70d97ece 1.000000 0.000000 inf\n",
1175 | "9809e6c9 1.000000 0.000000 inf\n",
1176 | "cb73ba55 1.000000 0.000000 inf\n",
1177 | "... ... ... ...\n",
1178 | "0c3bbac0 0.000000 1.000000 0.000000\n",
1179 | "34c9f908 0.000000 1.000000 0.000000\n",
1180 | "41a1ae5f 0.000000 1.000000 0.000000\n",
1181 | "5d03585e 0.000000 1.000000 0.000000\n",
1182 | "7d242dfd 0.000000 1.000000 0.000000\n",
1183 | "9ffa0563 0.000000 1.000000 0.000000\n",
1184 | "2fd58990 0.000000 1.000000 0.000000\n",
1185 | "01e47a3d 0.000000 1.000000 0.000000\n",
1186 | "bcd5195e 0.000000 1.000000 0.000000\n",
1187 | "a318236b 0.000000 1.000000 0.000000\n",
1188 | "1168ce02 0.000000 1.000000 0.000000\n",
1189 | "83c34e93 0.000000 1.000000 0.000000\n",
1190 | "c8c31032 0.000000 1.000000 0.000000\n",
1191 | "004270bf 0.000000 1.000000 0.000000\n",
1192 | "e6d0facc 0.000000 1.000000 0.000000\n",
1193 | "844524dc 0.000000 1.000000 0.000000\n",
1194 | "c90a30a1 0.000000 1.000000 0.000000\n",
1195 | "5015495b 0.000000 1.000000 0.000000\n",
1196 | "a12e8a45 0.000000 1.000000 0.000000\n",
1197 | "7b5c2c3b 0.000000 1.000000 0.000000\n",
1198 | "4a6b5af3 0.000000 1.000000 0.000000\n",
1199 | "2c8ae68c 0.000000 1.000000 0.000000\n",
1200 | "84565c92 0.000000 1.000000 0.000000\n",
1201 | "2a6fe2a5 0.000000 1.000000 0.000000\n",
1202 | "e0f70006 0.000000 1.000000 0.000000\n",
1203 | "cef4c8cc 0.000000 1.000000 0.000000\n",
1204 | "7f4b1f1e 0.000000 1.000000 0.000000\n",
1205 | "7efe14f0 0.000000 1.000000 0.000000\n",
1206 | "02b99e77 0.000000 1.000000 0.000000\n",
1207 | "cbb50c1c 0.000000 1.000000 0.000000\n",
1208 | "\n",
1209 | "[906 rows x 3 columns]"
1210 | ]
1211 | },
1212 | "execution_count": 13,
1213 | "metadata": {},
1214 | "output_type": "execute_result"
1215 | }
1216 | ],
1217 | "source": [
1218 | "device_bin_counts"
1219 | ]
1220 | },
1221 | {
1222 | "cell_type": "code",
1223 | "execution_count": 14,
1224 | "metadata": {},
1225 | "outputs": [
1226 | {
1227 | "data": {
1228 | "text/plain": [
1229 | "906"
1230 | ]
1231 | },
1232 | "execution_count": 14,
1233 | "metadata": {},
1234 | "output_type": "execute_result"
1235 | }
1236 | ],
1237 | "source": [
1238 | "len(device_bin_counts)"
1239 | ]
1240 | },
1241 | {
1242 | "cell_type": "code",
1243 | "execution_count": 15,
1244 | "metadata": {},
1245 | "outputs": [
1246 | {
1247 | "data": {
1248 | "text/html": [
1249 | "\n",
1250 | "\n",
1263 | "
\n",
1264 | " \n",
1265 | " \n",
1266 | " | \n",
1267 | " clicks | \n",
1268 | " no_clicks | \n",
1269 | " total_clicks | \n",
1270 | " N+ | \n",
1271 | " N- | \n",
1272 | " log_N+ | \n",
1273 | "
\n",
1274 | " \n",
1275 | " \n",
1276 | " \n",
1277 | " | a99f214a | \n",
1278 | " 1279 | \n",
1279 | " 5878 | \n",
1280 | " 7157 | \n",
1281 | " 0.178706 | \n",
1282 | " 0.821294 | \n",
1283 | " 0.217591 | \n",
1284 | "
\n",
1285 | " \n",
1286 | " | c357dbff | \n",
1287 | " 2 | \n",
1288 | " 12 | \n",
1289 | " 14 | \n",
1290 | " 0.142857 | \n",
1291 | " 0.857143 | \n",
1292 | " 0.166667 | \n",
1293 | "
\n",
1294 | " \n",
1295 | " | a167aa83 | \n",
1296 | " 0 | \n",
1297 | " 7 | \n",
1298 | " 7 | \n",
1299 | " 0.000000 | \n",
1300 | " 1.000000 | \n",
1301 | " 0.000000 | \n",
1302 | "
\n",
1303 | " \n",
1304 | " | d2bbb640 | \n",
1305 | " 0 | \n",
1306 | " 6 | \n",
1307 | " 6 | \n",
1308 | " 0.000000 | \n",
1309 | " 1.000000 | \n",
1310 | " 0.000000 | \n",
1311 | "
\n",
1312 | " \n",
1313 | "
\n",
1314 | "
"
1315 | ],
1316 | "text/plain": [
1317 | " clicks no_clicks total_clicks N+ N- log_N+\n",
1318 | "a99f214a 1279 5878 7157 0.178706 0.821294 0.217591\n",
1319 | "c357dbff 2 12 14 0.142857 0.857143 0.166667\n",
1320 | "a167aa83 0 7 7 0.000000 1.000000 0.000000\n",
1321 | "d2bbb640 0 6 6 0.000000 1.000000 0.000000"
1322 | ]
1323 | },
1324 | "execution_count": 15,
1325 | "metadata": {},
1326 | "output_type": "execute_result"
1327 | }
1328 | ],
1329 | "source": [
1330 | "device_all.sort_values(by = 'total_clicks', ascending=False).head(4)"
1331 | ]
1332 | },
1333 | {
1334 | "cell_type": "code",
1335 | "execution_count": null,
1336 | "metadata": {},
1337 | "outputs": [],
1338 | "source": []
1339 | }
1340 | ],
1341 | "metadata": {
1342 | "kernelspec": {
1343 | "display_name": "Python 3",
1344 | "language": "python",
1345 | "name": "python3"
1346 | },
1347 | "language_info": {
1348 | "codemirror_mode": {
1349 | "name": "ipython",
1350 | "version": 3
1351 | },
1352 | "file_extension": ".py",
1353 | "mimetype": "text/x-python",
1354 | "name": "python",
1355 | "nbconvert_exporter": "python",
1356 | "pygments_lexer": "ipython3",
1357 | "version": "3.7.0"
1358 | }
1359 | },
1360 | "nbformat": 4,
1361 | "nbformat_minor": 2
1362 | }
1363 |
--------------------------------------------------------------------------------
/2-2-2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 例2-2. Yelp データセット内の店舗に対するレビュー件数の可視化"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 7,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "data": {
17 | "text/html": [
18 | "\n",
19 | "\n",
32 | "
\n",
33 | " \n",
34 | " \n",
35 | " | \n",
36 | " business_id | \n",
37 | " categories | \n",
38 | " city | \n",
39 | " full_address | \n",
40 | " latitude | \n",
41 | " longitude | \n",
42 | " name | \n",
43 | " neighborhoods | \n",
44 | " open | \n",
45 | " review_count | \n",
46 | " stars | \n",
47 | " state | \n",
48 | " type | \n",
49 | "
\n",
50 | " \n",
51 | " \n",
52 | " \n",
53 | " | 0 | \n",
54 | " rncjoVoEFUJGCUoC1JgnUA | \n",
55 | " [Accountants, Professional Services, Tax Servi... | \n",
56 | " Peoria | \n",
57 | " 8466 W Peoria Ave\\nSte 6\\nPeoria, AZ 85345 | \n",
58 | " 33.581867 | \n",
59 | " -112.241596 | \n",
60 | " Peoria Income Tax Service | \n",
61 | " [] | \n",
62 | " True | \n",
63 | " 3 | \n",
64 | " 5.0 | \n",
65 | " AZ | \n",
66 | " business | \n",
67 | "
\n",
68 | " \n",
69 | " | 1 | \n",
70 | " 0FNFSzCFP_rGUoJx8W7tJg | \n",
71 | " [Sporting Goods, Bikes, Shopping] | \n",
72 | " Phoenix | \n",
73 | " 2149 W Wood Dr\\nPhoenix, AZ 85029 | \n",
74 | " 33.604054 | \n",
75 | " -112.105933 | \n",
76 | " Bike Doctor | \n",
77 | " [] | \n",
78 | " True | \n",
79 | " 5 | \n",
80 | " 5.0 | \n",
81 | " AZ | \n",
82 | " business | \n",
83 | "
\n",
84 | " \n",
85 | " | 2 | \n",
86 | " 3f_lyB6vFK48ukH6ScvLHg | \n",
87 | " [] | \n",
88 | " Phoenix | \n",
89 | " 1134 N Central Ave\\nPhoenix, AZ 85004 | \n",
90 | " 33.460526 | \n",
91 | " -112.073933 | \n",
92 | " Valley Permaculture Alliance | \n",
93 | " [] | \n",
94 | " True | \n",
95 | " 4 | \n",
96 | " 5.0 | \n",
97 | " AZ | \n",
98 | " business | \n",
99 | "
\n",
100 | " \n",
101 | " | 3 | \n",
102 | " usAsSV36QmUej8--yvN-dg | \n",
103 | " [Food, Grocery] | \n",
104 | " Phoenix | \n",
105 | " 845 W Southern Ave\\nPhoenix, AZ 85041 | \n",
106 | " 33.392210 | \n",
107 | " -112.085377 | \n",
108 | " Food City | \n",
109 | " [] | \n",
110 | " True | \n",
111 | " 5 | \n",
112 | " 3.5 | \n",
113 | " AZ | \n",
114 | " business | \n",
115 | "
\n",
116 | " \n",
117 | " | 4 | \n",
118 | " PzOqRohWw7F7YEPBz6AubA | \n",
119 | " [Food, Bagels, Delis, Restaurants] | \n",
120 | " Glendale Az | \n",
121 | " 6520 W Happy Valley Rd\\nSte 101\\nGlendale Az, ... | \n",
122 | " 33.712797 | \n",
123 | " -112.200264 | \n",
124 | " Hot Bagels & Deli | \n",
125 | " [] | \n",
126 | " True | \n",
127 | " 14 | \n",
128 | " 3.5 | \n",
129 | " AZ | \n",
130 | " business | \n",
131 | "
\n",
132 | " \n",
133 | "
\n",
134 | "
"
135 | ],
136 | "text/plain": [
137 | " business_id categories \\\n",
138 | "0 rncjoVoEFUJGCUoC1JgnUA [Accountants, Professional Services, Tax Servi... \n",
139 | "1 0FNFSzCFP_rGUoJx8W7tJg [Sporting Goods, Bikes, Shopping] \n",
140 | "2 3f_lyB6vFK48ukH6ScvLHg [] \n",
141 | "3 usAsSV36QmUej8--yvN-dg [Food, Grocery] \n",
142 | "4 PzOqRohWw7F7YEPBz6AubA [Food, Bagels, Delis, Restaurants] \n",
143 | "\n",
144 | " city full_address latitude \\\n",
145 | "0 Peoria 8466 W Peoria Ave\\nSte 6\\nPeoria, AZ 85345 33.581867 \n",
146 | "1 Phoenix 2149 W Wood Dr\\nPhoenix, AZ 85029 33.604054 \n",
147 | "2 Phoenix 1134 N Central Ave\\nPhoenix, AZ 85004 33.460526 \n",
148 | "3 Phoenix 845 W Southern Ave\\nPhoenix, AZ 85041 33.392210 \n",
149 | "4 Glendale Az 6520 W Happy Valley Rd\\nSte 101\\nGlendale Az, ... 33.712797 \n",
150 | "\n",
151 | " longitude name neighborhoods open review_count \\\n",
152 | "0 -112.241596 Peoria Income Tax Service [] True 3 \n",
153 | "1 -112.105933 Bike Doctor [] True 5 \n",
154 | "2 -112.073933 Valley Permaculture Alliance [] True 4 \n",
155 | "3 -112.085377 Food City [] True 5 \n",
156 | "4 -112.200264 Hot Bagels & Deli [] True 14 \n",
157 | "\n",
158 | " stars state type \n",
159 | "0 5.0 AZ business \n",
160 | "1 5.0 AZ business \n",
161 | "2 5.0 AZ business \n",
162 | "3 3.5 AZ business \n",
163 | "4 3.5 AZ business "
164 | ]
165 | },
166 | "execution_count": 7,
167 | "metadata": {},
168 | "output_type": "execute_result"
169 | }
170 | ],
171 | "source": [
172 | "import pandas as pd\n",
173 | "import json\n",
174 | "\n",
175 | "# Load the business records (one JSON object per line); open as UTF-8 explicitly rather than the platform default\n",
176 | "with open('data/yelp/yelp_academic_dataset_business.json', encoding='utf-8') as biz_file:\n",
177 | "    biz_df = pd.DataFrame([json.loads(line) for line in biz_file])\n",
178 | "\n",
179 | "biz_df.head()"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 8,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "data": {
189 | "text/plain": [
190 | "Text(0, 0.5, 'Occurrence')"
191 | ]
192 | },
193 | "execution_count": 8,
194 | "metadata": {},
195 | "output_type": "execute_result"
196 | },
197 | {
198 | "data": {
199 | "image/png": "\n",
200 | "text/plain": [
201 | ""
202 | ]
203 | },
204 | "metadata": {},
205 | "output_type": "display_data"
206 | }
207 | ],
208 | "source": [
209 | "%matplotlib inline\n",
210 | "import matplotlib.pyplot as plt\n",
211 | "import seaborn as sns\n",
212 | "\n",
213 | "# Plot a histogram of the review counts (100 bins, log-scaled y-axis)\n",
214 | "sns.set_style('whitegrid')\n",
215 | "fig, ax = plt.subplots()\n",
216 | "biz_df['review_count'].hist(ax=ax, bins=100)\n",
217 | "ax.set_yscale('log')\n",
218 | "ax.tick_params(labelsize=14)\n",
219 | "ax.set_xlabel('Review Count', fontsize=14)\n",
220 | "ax.set_ylabel('Occurrence', fontsize=14)"
221 | ]
222 | },
223 | {
224 | "cell_type": "markdown",
225 | "metadata": {},
226 | "source": [
227 | "## 例 2-3. 固定幅によるカウントの離散化"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 9,
233 | "metadata": {},
234 | "outputs": [
235 | {
236 | "data": {
237 | "text/plain": [
238 | "array([37, 12, 72, 9, 75, 5, 79, 64, 16, 1, 76, 71, 6, 25, 50, 20, 18,\n",
239 | " 84, 11, 28])"
240 | ]
241 | },
242 | "execution_count": 9,
243 | "metadata": {},
244 | "output_type": "execute_result"
245 | }
246 | ],
247 | "source": [
248 | "import numpy as np\n",
249 | "\n",
250 | "np.random.seed(seed=1)\n",
251 | "\n",
252 | "# Draw 20 random integers uniformly from 0 to 99 (randint's upper bound, 100, is exclusive)\n",
253 | "small_counts = np.random.randint(0, 100, 20)\n",
254 | "\n",
255 | "small_counts"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 10,
261 | "metadata": {},
262 | "outputs": [
263 | {
264 | "data": {
265 | "text/plain": [
266 | "array([3, 1, 7, 0, 7, 0, 7, 6, 1, 0, 7, 7, 0, 2, 5, 2, 1, 8, 1, 2])"
267 | ]
268 | },
269 | "execution_count": 10,
270 | "metadata": {},
271 | "output_type": "execute_result"
272 | }
273 | ],
274 | "source": [
275 | "# Map each count to a fixed-width bin 0-9 via integer division by 10\n",
276 | "np.floor_divide(small_counts, 10)"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": 11,
282 | "metadata": {},
283 | "outputs": [
284 | {
285 | "data": {
286 | "text/plain": [
287 | "array([2., 3., 4., 1., 0., 2., 2., 3., 3., 4., 4., 1., 1., 3., 2., 2., 4.])"
288 | ]
289 | },
290 | "execution_count": 11,
291 | "metadata": {},
292 | "output_type": "execute_result"
293 | }
294 | ],
295 | "source": [
296 | "# Array of count data spanning several orders of magnitude\n",
297 | "large_counts = [296, 8286, 64011, 80, 3, 725, 867, 2215, 7689, 11495, 91897, \n",
298 | " 44, 28, 7971, 926, 122, 22222]\n",
299 | "# Assign exponential-width bins via a log transform: floor(log10(count))\n",
300 | "np.floor(np.log10(large_counts))"
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {},
306 | "source": [
307 | "## 例 2-4. Yelp ビジネスレビュー件数の十分位数を計算する"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 12,
313 | "metadata": {},
314 | "outputs": [
315 | {
316 | "data": {
317 | "text/plain": [
318 | "0.1 3.0\n",
319 | "0.2 3.0\n",
320 | "0.3 4.0\n",
321 | "0.4 5.0\n",
322 | "0.5 6.0\n",
323 | "0.6 8.0\n",
324 | "0.7 12.0\n",
325 | "0.8 23.0\n",
326 | "0.9 50.0\n",
327 | "Name: review_count, dtype: float64"
328 | ]
329 | },
330 | "execution_count": 12,
331 | "metadata": {},
332 | "output_type": "execute_result"
333 | }
334 | ],
335 | "source": [
336 | "# Compute the 0.1 through 0.9 quantiles (deciles) of the review_count column\n",
337 | "deciles = biz_df['review_count'].quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])\n",
338 | "deciles"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": 13,
343 | "metadata": {},
344 | "outputs": [
345 | {
346 | "data": {
347 | "text/plain": [
348 | "Text(0, 0.5, 'Occurrence')"
349 | ]
350 | },
351 | "execution_count": 13,
352 | "metadata": {},
353 | "output_type": "execute_result"
354 | },
355 | {
356 | "data": {
357 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZUAAAEWCAYAAACufwpNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAAIABJREFUeJzt3XtclGXeP/APjpzEJOSgQahFMuK8RAdRVBQ2TTPSBNrMFSSlaMmsfKRHRMPKNkUzS7FsbV3ddhGfMkN5pN19XKvVJgVXfFIc/ZknsPFhAFGU08DM/P4wZhmHwz1wzwk+79erlzvXffoyt/LZ67rug5Ner9eDiIhIBH1sXQAREfUcDBUiIhINQ4WIiETDUCEiItEwVIiISDQMFSIiEg1DhYiIRMNQISIi0ThkqNTX1+PRRx/F+vXrbV0KERG10tfWBXTFJ598gtGjR5u1zalTp+Dq6mr+wYqL7/4pl5u/7fnzd/+USsVd39z9irB9Y2Njx99fd2sSkz3V0hE7rbPTc009Smfnu7GxEWPGjBG8P4cLlStXruDSpUt49NFHceHCBcHbubq6IiQkxPwDTpx498+bN83f9qWX7v757bfirm/ufkXYXqlUdvz9dbcmMdlTLR2x0zo7PdfUo3R2vpVKpVn7s+rwV1FREVJTUzFlyhRIpVLs27fPZJ2cnBxMnToVo0aNQnx8PE6cOGG0fP369Vi2bJm1SiYiIjNYNVTq6uoQHByMVatWwc3NzWR5QUEB1q5di9TUVOTl5UEulyMlJQUqlQoAcOjQIQwbNgwPPfSQNcsmIiKBrDr8FR0djejoaABARkaGyfKdO3ciLi4Oc+fOBQBkZmbiyJEjyM3NRVpaGv73f/8XBQUF+Nvf/oba2lo0NzfDw8MDS5Ys6fTYjY2NZnfjACBYpwMA/L8ubDukrg4AUCpwW6Hrm7tfMbZvaGjo8Pvrbk1isqdaOmKvdXZ2rqlnEft8282cikajQUlJCZKTk43aIyMjUfzLZHlaWhrS0tIAAPv27cOFCxcEBQrQjTmVPnc7c13atl8/87YVur65+xVh+07H2btbk5jsqZaO2GmdnFPpXRx6TqUj1dXV0Gq18PHxMWr39vZGRUWFjaoiIiJz2E1PxVzx8fG2LoGIuqGmpgZqtRpNTU22LqXXcnZ2hlarFXWfdhMqXl5ekEgkqKysNGqvqqqCr6+vjaoiIkuoqalBeXk5AgIC4O7uDicnJ1uX1Ovo9XrU19fj8uXLqKmpwYABA0TZr90Mf7m4uEAmk0GhUBi1KxQKyLty4yER2S21Wo2AgAD069ePgWIjTk5O6NevH/z9/aFWq0Xbr1V7KrW1tSgtLQUA6HQ6qFQqKJVKeHp6wt/fH4sWLcLy5csRGhqKsLAw5ObmQq1WY968edYs04QeQG//a9/ZxK1Or0cf/nIggZqamuDu7m7rMgh3L2IScwjSqqFy5swZJCUlGT5nZ2cjOzsbcXFxyMrKQkxMDKqrq7Ft2zao1WoEBwdj+/btCAgIsGaZJpwADFtx0Ozt9lyqAgDME7it0PXN3a/Y27flCgOFzMQein0Q+zxYNVQiIiJwvuV5R+1ISEhAQkKClSoiIiIx2c2cChGRo/rtb3+LFStWiLKv7OxszJo1q93P9s5urv4iIiIgOTkZiYmJti6jyxgqRER2xMPDAx4eHrYuo8s4/EVEZIb6+nqsWLECcrkckyZNwieffGK0XKPR4L333kNUVBRGjx6Np59+GkeOHDFa5+LFi0hNTcXYsWMhl8vx7LPPGuabhQx3ffnll4iJicGoUaPw+OOPY9euXdD98pxCANizZw8ef/xxjBo1ChEREXj++efR3Nws0jfQMfZUiMg+fPYZ8Mc/WveYyclAqytShVi/fj2+//57bNmyBYMGDcLWrVtRVFSEGTNmALj7sNyysjK8//77GDx4ML777ju89NJL2Lt3L0aM
GIHy8nLMnz8fYWFh2LlzJ+677z78+OOPRqHQkc8//xxbtmzBG2+8AZlMhgsXLiAzMxN9+/ZFYmIiTp8+jTVr1iArKwtjx47F7du3cezYMbO/mq5iqBARCVRbW4u9e/di7dq1mDJlCgBg3bp1hqevl5aW4uDBgzh8+DD8/f0BAImJiVAoFNizZw/eeust7N69G/369cPmzZvh4uICAGa9zuPjjz/G66+/jpkzZwIAAgMDUVpait27dyMxMRHXr1+Hu7s7pk6div79+wMARowYIdp30BmGChHZh6Qks3sN1lZWVoampiajp3x4eHggODgYAFBSUgK9Xo8nn3zSaDuNRoMJEyYAAM6ePYuwsDBDoJjjxo0buH79Ot588028/fbbhvbm5mbo9XoAwKRJk+Dv749p06Zh8uTJmDx5MqZPn24IGEtjqBARiUSv18PJyQl79+5F377Gv17bejGhuVqGyN5+++12H1/Vv39/fPXVVygqKoJCocDvf/97bNq0CXv37sWgQYO6XUNnOFFPRCRQYGAgnJ2dcerUKUNbXV0dLly4AODu44z0ej0qKiowdOhQo/9afqGPHDkSJ0+ehEajMfv4Pj4+8PPzQ2lpqcn+hw4dalivb9++mDhxItLS0nDgwAHU19fj22+/7d4PLxB7KkREAnl4eODpp5/Gxo0bMXDgQPj5+eGjjz4yPD7+oYcewuzZs5GRkYH09HTIZDLcvHkThYWFCAwMxIwZMzB//nzs2bMHS5cuRWpqKjw9PXH69GkEBQUJejnaq6++infeeQcDBgxAVFQUmpubcfbsWZSXl+O3v/0tvvnmG5SWlmLcuHHw9PTE8ePHUVtbi6CgIEt/PQAYKkREZklPT0d9fT2WLFkCNzc3JCYmor6+3rB83bp1+OSTT/Dee++hvLwcnp6ehkt7AWDQoEH4y1/+gg0bNuC5554DAEilUqxZs0bQ8Z955hm4u7tjx44deP/99+Hm5oZHHnnEcMPkfffdh0OHDuHjjz9GfX09hgwZgt/97ncIDw8X+ZtoG0OFiMgM/fr1w4YNG9pd7uzsjFdeeQWvvPJKu+sMHz4cn376aZvL7t22rX3NmjWr3XtZwsPD8ec//7mjH8GiOKdCRESiYagQEZFoGCpERCQahgoREYmGoUJENtFyBzjZltjngaFCRFbn7OxsdBku2U5jYyOcnZ1F2x9DhYiszs/PDz///DPq6urYY7ERvV6Puro6qFQq+Pn5ibZf3qdCRFY3YMAAAIBKpUJTU5ONq+m9nJ2dodfrDedDDAwVIrKJAQMGiPrLjLpGqVSKuj8OfxERkWgYKkREJBqGChERiYahQkREomGoEBGRaBgqREQkGoYKERGJhqFCRESiYagQEZFoGCpERCQahgoREYmGoUJERKJhqBARkWgYKkREJBqGChERiYahQkREonGol3TV1NRg4cKF0Gq10Gq1SEpKwty5c21dFhER/cKhQsXDwwM5OTlwd3dHXV0dZs2ahenTp8PLy8vWpRERERxs+EsikcDd3R0AoNFoAAB6vd6WJdEvdDY4Dw1NWqsfk4g6ZtWeSlFREXbs2IGSkhKo1WqsW7cO8fHxRuvk5ORgx44dqKiowPDhw7Fy5UqEh4cbltfU1CAxMRFXr17F8uXLMXDgQGv+CNSOPk5OOHapCvNWHLTaMa9kPWm1YxGRMFbtqdTV1SE4OBirVq2Cm5ubyfKCggKsXbsWqampyMvLg1wuR0pKClQqlWGdAQMG4MCBA/jHP/6B/Px8VFZWWvNHICKiDli1pxIdHY3o6GgAQEZGhsnynTt3Ii4uzjD5npmZiSNHjiA3NxdpaWlG6/r4+GDEiBE4ceIEZs6c2emxGxsboVQqza45WKeDpI9DjRL2Km2d0yF1dQCA0i6cb2uy1zobGhq69G+FHJPY59tuJuo1Gg1KSkqQnJxs1B4ZGYni4mIAQGVlJdzc3NC/f3/cvn0bJ06cwG9+8xtB+3d1dUVISIj5hTFQ
7Fqb57Rfv/aX2RM7rVOpVNpdTWQ5nZ1vcwPHbkKluroaWq0WPj4+Ru3e3t5QKBQAAJVKhczMTOj1euj1eiQmJkIqldqiXCIiaoPdhIoQoaGh2L9/v63LICKidtjN2I6XlxckEonJxHtVVRV8fX1tVBUREZnDbkLFxcUFMpnMMNTVQqFQQC6X26gqIiIyh1WHv2pra1FaWgoA0Ol0UKlUUCqV8PT0hL+/PxYtWoTly5cjNDQUYWFhyM3NhVqtxrx586xZJhERdZFVQ+XMmTNISkoyfM7OzkZ2djbi4uKQlZWFmJgYVFdXY9u2bVCr1QgODsb27dsREBBgzTKJiKiLrBoqEREROH/+fIfrJCQkICEhwUoVERGRmOxmToWIiBwfQ4WIiETDUCEiItEwVIiISDQMFSIiEg1DhRyWmC/p4gu/iMThUM/+ImrNzVmCYW28FGzPpSoAMOuFYXzhF5E42FMhIiLRmBUq58+fx5o1a/DCCy9ArVYDAA4dOoSzZ89apDgiInIsgkPl6NGj+PWvf43y8nIcO3YMjY2NAIDS0lJs3brVYgUSEZHjEBwqmzdvxooVK/DRRx/B2dnZ0D5+/Hj8+OOPFimOyFq6M1HPSX6ifxM8UX/hwgXD++Vb8/T0xK1bt0Qtisja2pv0F4KT/ET/Jrin4unpifLycpP2s2fPYvDgwaIWRUREjklwqMyaNQvvvfce/u///g9OTk5obm5GYWEh1q9fjzlz5liyRiIichCCQ2Xp0qUICAjAo48+irq6Ojz55JN47rnnMHbsWLz00kuWrJGIiByE4DkVZ2dnvP/++1i6dClKSkqg0+kwcuRIDBs2zILlERGRIxEcKhqNBnq9HoGBgQgMDDS0NzY2wsnJCS4uLhYpkIiIHIfg4a/XXnsNu3fvNmnPzc3F0qVLRS2KiIgck+BQOXnyJCIjI03aIyMjUVxcLGpRRETkmASHSkNDAyQSiekO+vRBbW2tqEUREZFjEhwqUqkUBw+a3hyWn5+P4cOHi1oUERE5JsET9S+//DIWL16Mq1evYsKECQCAY8eO4a9//Suf/UVERADMCJXo6Ghs27YN27Ztw7vvvgsACAkJwccff9zm41uIeouGJi3cnE2Hhru7LpEjMuslXVFRUYiKirJULUQOyZznhvE5YdTTdenNjzU1NdDpdEZt999/vygFEfVkQnoq7M2QIxMcKj///DPefPNNFBYWoqmpydCu1+vh5OQEpVJpkQKJepLWvZr2XnvM3gw5MsGhkpGRgdu3b+Pdd9+Fn58fnJycLFkXERE5IMGhcvr0afzXf/0XgoODLVkPERE5MMH3qTz44IPQaDSWrIWIiByc4FBZuXIlNm3ahKtXr1qyHqJer7PXE/P1xWTPBA9/LV68GE1NTZg5cyZcXFxMHtly8uRJ0Ysj6o06u0SZE/lkzwSHyurVqy1ZBxER9QCCQyUuLs6SdRARUQ8geE4FACorK7Fjxw68+eabuHHjBgDgX//6F8rKyixSHBERORbBoXLmzBnMnDkT+fn5+PLLLw2Pu1coFPjwww8tViARETkOwaGyfv16JCUlIS8vD87Ozob2yZMnc5KeiIgAmBEqJSUlbc6r+Pr6orKyUtSiiIjIMQkOFTc3N9y6dcuk/dKlS/D29ha1qI5cv34dCxYsQExMDGbPno2vv/7aascmsnft3cPCe1vIWgRf/TVt2jRs3boVW7ZsMbRdu3YNGzduxIwZMyxSXFskEglWrlyJkJAQVFRUID4+HtHR0ejXr5/VaiCyV+3d48J7W8haBPdU0tPTcevWLUyYMAENDQ2YP38+ZsyYgQEDBmDp0qWWrNGIn58fQkJCANwdevPy8mqzB0XUU7HXQfZMcE+lf//+yM3NxQ8//ICzZ89Cp9NBJpNh0qRJZh2wqKgIO3bsQElJCdRqNdatW4f4+HijdXJycrBjxw5UVFRg+PDhWLlyJcLDw032debMGeh0OjzwwANm1UDkyDq64549ErI1
QT2VpqYmPPPMM7h06RImTpyI559/HikpKWYHCgDU1dUhODgYq1atgpubm8nygoICrF27FqmpqcjLy4NcLkdKSgpUKpXRejdv3kR6ejrWrFljdg1ERGQZgnoqzs7OuHbtmijvUImOjja80z4jI8Nk+c6dOxEXF4e5c+cCADIzM3HkyBHk5uYiLS0NAKDRaPDyyy8jJSUFYWFhgo7b2NjYpReJBet0kPQx6x5RIrsk9O9/Q0MDX7rXi4h9vgUPf8XGxuLzzz9Henq6aAe/l0ajQUlJCZKTk43aIyMjUVxcDODumyZXrFiBCRMmIDY2VvC+XV1dDXMxZmGgUA8h9O+/Uqns2r8VckidnW9zA0dwqNTX1yM/Px8KhQIymczkaqs33njDrAO3pbq6GlqtFj4+Pkbt3t7eUCgUAO4+FqagoABSqRSHDh0CAGzYsAFSqbTbxyciou4RHCoXL17EyJEjAcDkWV/WfLVweHg4zp07Z7XjERGRcIJDZfv27W2+R0VMXl5ekEgkJnfoV1VVwdfX12LHJSIicQiaMNBqtQgPD8fly5ctWoyLiwtkMplhqKuFQqGAXC636LGJiKj7BPVUJBIJ/P390dTU1O0D1tbWorS0FACg0+mgUqmgVCrh6ekJf39/LFq0CMuXL0doaCjCwsKQm5sLtVqNefPmdfvYRL1VQ5MWbs4SQW1E3WHW64Q3btyI9957DwMHDuzyAc+cOYOkpCTD5+zsbGRnZyMuLg5ZWVmIiYlBdXU1tm3bBrVajeDgYGzfvh0BAQFdPiZRb9fWDZNXsp5ss42oOwSHyh//+Edcu3YNUVFRGDx4MNzd3Y2W5+fnC9pPREQEzp8/3+E6CQkJSEhIEFoaERHZCcGh8vjjj1uyDiIi6gEEh8qSJUssWQcREfUAvF2ciAwamrQmd1dz8p7MIbinIpfLO7zJka8UJnJ87U3oEwklOFRWr15t9Lm5uRlnz57F3//+d6SmpopeGBEROR7BodLW++kBYOTIkTh27BgWLFggWlFEROSYuj2nMmHCBBw+fFiMWoiIyMF1O1QOHjwILy8vMWohIiIHJ3j4a/bs2SZtlZWVuHXrFt566y0xayIiO3Lv41zaerwLUYsu3/zo5OSEgQMHYvz48QgKChK9MCKyD/deEcarwagjvPmRiLqldc+FvRgSPKdSWFiIwsLCNtuLiopELYqI7Ne9N0O29GSGrTjIQCHhobJu3TrU1NSYtN+5cwfr1q0TtSgisl+tQ+TeGyWJBIfK5cuX23wP/PDhwy3+8i4icgz39mL4iJfeR/CciqurKyoqKhAYGGjUXl5eDmdnZ9ELIyLHw0l9EtxTmTx5MjZu3Ihbt24Z2m7evIlNmzZh8uTJFimOiIgci+CeSnp6OhITEzF16lTDMNj58+fh7e2NDz74wGIFEhGR4xAcKn5+fti/fz/y8/OhVCoB3H0e2KxZs0zeAklEBPBy495IcKgAgLu7O+bOnWupWoioh2k9x8L5ld5B8JzKBx98gNzcXJP23NxcfPjhh6IWRUREjklwqOzfvx8jR440aZfJZNi/f7+oRRERkWMSHCpVVVUYOHCgSbuXlxcqKytFLYqIiByT4FDx9/fHiRMnTNqLioowePBgUYsiIiLHJHii/tlnn8W6devQ1NSECRMmAAB++OEHbNq0CS+88ILFCiQiIschOFSSk5NRXV2N3/3ud9BoNAAAFxcXJCUlISUlxWIFEhGR4zDrkuIXX3wR0dHRcHFxAQAEBQXBw8PDIoURUc/Ce1Z6B0GholKpsGbNGvzzn/+EXq8HcPclXVFRUVi9ejX8/f0tWiQROT7es9I7dBoq5eXlmDt3Lvr06YNXX30VjzzyCADgwoUL2L17N5599lns3bsXgwYNsnixRERk3zoNla1bt+LBBx/Erl274ObmZmh/7LHHsHDhQiQnJ+Ojjz7CmjVrLFooERHZv04vKf7uu++wbNkyo0Bp4e7ujqVLl+Lbb7+1RG1E1EO1
fs9KZ+9caVnOd7M4hk5D5caNGxgyZEi7y4cOHYobN26IWhQR9WzmvIK4ZV1O7DuGTkPF29sbV69ebXf5lStX4O3tLWpRRNQ7sVfi+DoNlaioKHz44YeGe1Naa2xsxObNmxEdHW2R4oiod2GvxPF1OlG/ZMkSPP3005g+fToSEhLw8MMPAwAuXryI3bt3Q6vV8inFREQEQECoDBo0CHv27MHbb7+NDz74wOg+lcmTJ2P16tW8nJiIiAAIvPnxwQcfxKeffopbt24Z5leGDBmC+++/36LFERGRYzHrMS2enp4IDQ21VC2CvfzyyygsLMTEiROxZcsWW5dDRES/EPzoe3uSlJSE9evX27oMIiK6h0OGSkREBB9kSdSD8ZJix2X1UCkqKkJqaiqmTJkCqVSKffv2mayTk5ODqVOnYtSoUYiPj2/z5WBE1HO1vjmSHIvVQ6Wurg7BwcFYtWpVm49+KSgowNq1a5Gamoq8vDzI5XKkpKRApVJZu1QisgJL90p4Q6V1WT1UoqOjsWzZMsycORN9+pgefufOnYiLi8PcuXMRFBSEzMxM+Pr6Ijc319qlEpEVWLpXwhsqrcusq78sTaPRoKSkBMnJyUbtkZGRKC4u7ta+GxsboVQqzd4uWKeDpI3wIyLr68q/4ZCQkG5t39M1NDSI+r3YVahUV1dDq9XCx8fHqN3b2xsKhcLweeHChTh37hzq6+sRFRWFzZs3Qy6Xd7hvV1dXo79cgjFQiOxGl/4Ni7h9T6RUKjv8XswNHLsKFaF27dpl6xKIyMZaXknMVxPbF7v6v+FeXl6QSCSorKw0aq+qqoKvr6+NqiIie8S5EvtkV6Hi4uICmUxmNNQFAAqFotPhLSIisj2rD3/V1taitLQUAKDT6aBSqaBUKuHp6Ql/f38sWrQIy5cvR2hoKMLCwpCbmwu1Wo158+ZZu1QisiOdDXdxOMw+WL2ncubMGcTGxiI2NhYNDQ3Izs5GbGys4RleMTExyMjIwLZt2zBnzhycPHkS27dvR0BAgLVLJSI70tlwF4fD7IPVeyoRERE4f/58h+skJCQgISHBShURkSNjz8S+2NWcChGRuVp6KGQfGCpERCQahgoREYmGoUJEDoUPhrRvDBUiciicQ7FvDBUiIhINQ4WIiETDUCGiXqGhSWvywq622qh7GCpE1Cu4OUsMN0m2vvued+KLi6FCRESiYagQEZFoGCpERCQahgoR9ShdnXhvPWlvzjIyxlAhoh6l9cS7OTdJtp7IN2cZGWOoEBGRaBgqREQkGoYKERGJhqFCRESiYagQUa9x7xVc7X3mo1u6jqFCRL3GvVeE3XtF172PbOFj9s3HUCEiItEwVIiISDQMFSIiEg1DhYiIRMNQISIi0TBUiIhINAwVIiISTV9bF0BEZG9abnrs6MnEba3T1s2Sve3pxgwVIqJ7CAmCttbpbQHSFg5/ERGRaBgqREQkGoYKERGJhqFCRESiYagQEZFoGCpERCQahgoREYmGoUJERKJxuFD55ptv8Pjjj2PGjBn44osvbF0OERG14lB31Dc3NyMrKwufffYZ+vfvj/j4eDz22GPw8vKydWlERAQH66n8+OOPeOSRRzBo0CB4eHggKioK33//va3LIiKiX1g1VIqKipCamoopU6ZAKpVi3759Juvk5ORg6tSpGDVqFOLj43HixAnDMrVajUGDBhk+Dxo0COXl5VapnYiIOmfVUKmrq0NwcDBWrVoFNzc3k+UFBQVYu3YtUlNTkZeXB7lcjpSUFKhUKmuWSUREXWTVOZXo6GhER0cDADIyMkyW79y5E3FxcZg7dy4AIDMzE0eOHEFubi7S0tLg5+dn1DMpLy9HaGiooGM3NjZCqVSaXXOwTgdJH4caJSQiETQ0aU2eOtz6d0hISAgamrTQ6/WGNneXvibb1dY3oo9EAp3W9LH4pVcuwcfHB5WVlRgy7GH0kUggcQIu/nTBaL0hwx42Wb/FvZ9br9vWcpOfs6GhS78b
22M3E/UajQYlJSVITk42ao+MjERxcTEAIDQ0FBcuXEB5eTn69++Pf/7zn1i8eLGg/bu6uiIkJMT8whgoRL1SW4+xv/d3iJDH33u4u/7yv0x/3bbsz9fXt8PjdLb+vZ/v3Udby1solcoOfzeaGzh2EyrV1dXQarXw8fExavf29oZCoQAA9O3bF+np6UhKSoJOp8MLL7zAK7+IiOyI3YSKUNOmTcO0adNsXQYREbXBbsZ2vLy8IJFITMb+qqqqOuy6ERGR/bCbUHFxcYFMJjMMdbVQKBSQy+U2qoqIiMxh1eGv2tpalJaWAgB0Oh1UKhWUSiU8PT3h7++PRYsWYfny5QgNDUVYWBhyc3OhVqsxb948a5ZJRERdZNVQOXPmDJKSkgyfs7OzkZ2djbi4OGRlZSEmJgbV1dXYtm0b1Go1goODsX37dgQEBFizTCIi6iKrhkpERATOnz/f4ToJCQlISEiwUkVERCQmu5lTISIix+ekb307aA926tQpuLq6dr4iEREZNDY2YsyYMYLX7zWhQkRElsfhLyIiEg1DhYiIRMNQISIi0TBUiIhINAwVIiISDUOFiIhEw1AhIiLRMFSIiEg0DBXqlpdffhnjxo3Dq6++autSyMKuX7+OBQsWICYmBrNnz8bXX39t65LIgmpqahAfH485c+Zg1qxZ+PzzzwVtxzvqqVuOHz+O2tpa5OXlYcuWLbYuhyxIrVajqqoKISEhqKioQHx8PP72t7+hX79+ti6NLECr1UKj0cDd3R11dXWYNWsWvvzyy05f4c6eCnVLREQEPDw8bF0GWYGfnx9CQkIAAL6+vvDy8sKtW7dsXBVZikQigbu7OwBAo9EAAIT0QRgqvVhRURFSU1MxZcoUSKVS7Nu3z2SdnJwcTJ06FaNGjUJ8fDxOnDhhg0pJDGKe7zNnzkCn0+GBBx6wdNnURWKc75qaGjz11FOIjo7G888/j4EDB3Z6XIZKL1ZXV4fg4GCsWrUKbm5uJssLCgqwdu1apKamIi8vD3K5HCkpKVCpVDaolrpLrPN98+ZNpKenY82aNdYqnbpAjPM9YMAAHDhwAP/4xz+Qn5+PysrKzg+sJ9Lr9WPGjNF/+eWXRm2//vWv9atWrTJqmz59un7jxo1Gbcew6sJ+AAAJTUlEQVSOHdO/8sorFq+RxNPV893Y2KifP3++/quvvrJKnSSO7vz7bvHmm2/qv/76606PxZ4KtUmj0aCkpASRkZFG7ZGRkSguLrZRVWQpQs63Xq/HihUrMGHCBMTGxtqiTBKJkPNdWVmJO3fuAABu376NEydO4KGHHup031Z9nTA5jurqami1Wvj4+Bi1e3t7Q6FQGD4vXLgQ586dQ319PaKiorB582bI5XJrl0vdJOR8/+tf/0JBQQGkUikOHToEANiwYQOkUqnV66XuEXK+VSoVMjMzodfrodfrkZiYKOhcM1SoW3bt2mXrEshKwsPDce7cOVuXQVYSGhqK/fv3m70dh7+oTV5eXpBIJCYTc1VVVfD19bVRVWQpPN+9iyXPN0OF2uTi4gKZTGY01AUACoWCw1s9EM9372LJ883hr16strYWpaWlAACdTgeVSgWlUglPT0/4+/tj0aJFWL58OUJDQxEWFobc3Fyo1WrMmzfPxpVTV/B89y62Ot98TEsvdvz4cSQlJZm0x8XFISsrC8Ddm6N27NgBtVqN4OBgZGRkYNy4cdYulUTA89272Op8M1SIiEg0nFMhIiLRMFSIiEg0DBUiIhINQ4WIiETDUCEiItEwVIiISDQMFSIiEg1DhchMUqkUf/3rX21dBpFd4s2P1COsWLECX331FYC779b28/NDdHQ0li1bBk9PT1GPVVFRAU9PT7i4uIi6345oNBp89tln+O///m9cvnwZrq6uGDZsGOLj4xEfH2/VWvbt24d33nmH79WhNvHZX9RjTJo0CRs2bIBWq8VPP/2ElStX4vbt29i0aZOox7H2U3s1Gg2ef/55KJVKvPrqqwgP
D8d9992H06dPY9euXXjooYcQERFh1ZqI2sPhL+oxXFxc4Ovri8GDB2Py5MmIiYnB999/b7TO7du3kZmZiYkTJ0IulyMxMRGnT58GANy5cwehoaE4fPiw0TZHjx6FTCZDVVUVANPhr/LycvzHf/wHxo0bh3HjxuHFF1/ElStXANx9qJ9MJsOpU6cM60dHR2PmzJmGzwqFAmPGjIFGo2nz5/rTn/6EoqIi7Ny5E0lJSRg5ciQCAwMRExODPXv2QCaTAbgbPu+++y4mTZqEUaNGYe7cuThx4oRhP8ePH4dUKsWNGzcMbdeuXYNUKjV8By3r/PDDD3jmmWcwevRoxMfHo6SkxLA8IyMDdXV1kEqlkEqlyM7OFnB2qLdgqFCPVFZWhiNHjqBv3393xvV6PV588UWUl5fj97//PfLy8hAeHo7nnnsOarUa/fv3x6OPPor8/HyjfeXn52PSpEnw9vY2OU59fT2SkpLg6uqKP//5z9izZw98fX2xaNEi1NfXw8PDAzKZDIWFhQCAq1evoqamBiqVChUVFQDu/qIeM2ZMu0NYLccfNWqUybI+ffqgf//+AO6+hfHrr7/G2rVrkZeXh+DgYKSkpECtVpv9/b3//vtIS0vDvn374OXlhddffx16vR5yuRwrV66Eu7s7jh49iqNHjyI5Odns/VPPxVChHuPIkSOQy+UIDQ3FY489hp9++gkpKSmG5ceOHcO5c+ewZcsWhIaGYujQoVi6dCkCAwMNb7h76qmncPjwYcO7uRsaGvA///M/eOqpp9o85sGDB6HX67Fu3TqMGDECQUFBWLNmDerq6vDNN98AAMaPH4/jx48DAAoLCzF27FiMHj3aqG38+PHt/lxXr17Fww8/3OHPXldXhz179uD111/Hr371KwQFBeHtt9+Gt7c3cnJyBH6D//baa69hwoQJCAoKwuLFi3Hp0iWUl5fDxcUF9913H5ycnODr6wtfX194eHiYvX/quTinQj1GeHg43nnnHTQ0NOCLL75AaWkpFixYYFheUlKC+vp6TJw40Wi7xsZGlJWVAQCioqLg5uaGQ4cOITY2FocPH4Zer8djjz3W5jFLSkpw7do1hIWFGbXX19cb9jl+/Hjk5OSgqakJx48fR0REBBoaGlBYWIhp06bh9OnTSEtLa/fnEnItTWlpKZqamozqkEgkGDNmDC5evNjp9vdq/S5yPz8/AHffCjh48GCz90W9C0OFegx3d3cMHToUAPDGG29gwYIF+Pjjj/HKK68AuPuiIh8fnzb/n3vLEJKzszOeeOIJ5OfnIzY2FgcOHMD06dPh7u7e5jF1Oh1GjBiBDz74wGRZy1VnY8eOhUajwenTp1FUVISkpCTU19dj9erVKC4uRt++fREaGtruzzVs2DBcunTJvC+jFScnJwB3h8ru1dzc3OY2rYcNW7bX6XRdroF6Dw5/UY+1ZMkSfPrppygvLwcAyGQyVFZWok+fPhg6dKjRf63nS5566in88MMP+Omnn3D06NF2h75a9llaWgovLy+Tfd5///0AYJhX+eKLL3Dnzh3IZDKMGTMG169fR35+fofzKQAwa9YsKBQKw2R6azqdDnfu3MGQIUPg7OyMkydPGpZptVqcOnUKQUFBAICBAwcCgNEci1KpFPJVGnF2doZWqzV7O+odGCrUY0VEROCRRx7Btm3bANy95DgsLAyLFy/Gd999h7KyMhQXF2PLli1GV0mFhYXB398faWlpuP/++02Gy1qbPXs2vL29sXjxYhQWFqKsrAxFRUXIysoyXAEG3B0CO3DgAMLDwyGRSODq6orRo0fjwIEDHc6nAMDChQsxduxYJCcn47PPPoNSqURZWRn+/ve/Y/78+SgpKUG/fv3wm9/8Bhs3bsR3332Hixcv4q233kJVVRXmz58PABgyZAgeeOABbN26FZcvX8bRo0cN3405AgIC0NjYiO+//x43btxAfX292fugnouhQj3aokWLsHfvXvz8889wcnLC9u3bERER
gczMTDzxxBNYunQpLl++bJg3aDF79mycO3cOTz75JCQSSbv7d3d3R05ODgIDA/Haa6/hiSeeQHp6Om7duoUBAwYY1hs/fjyam5uNAqSttra4uLhg586dSElJwd69e/Hss88iPj4ef/jDHxAbGwu5XA4A+M///E888cQTyMjIwJw5c3D+/Hl8+umnhp/N2dkZmzZtQllZGebMmYPs7GwsW7bM7O80LCwM8+bNw7JlyzBx4kT84Q9/MHsf1HPxjnoiIhINeypERCQahgoREYmGoUJERKJhqBARkWgYKkREJBqGChERiYahQkREomGoEBGRaP4/Ca94dmET6L0AAAAASUVORK5CYII=\n",
358 | "text/plain": [
359 | ""
360 | ]
361 | },
362 | "metadata": {},
363 | "output_type": "display_data"
364 | }
365 | ],
366 | "source": [
367 | "# Overlay the deciles on the review-count histogram.\n",
368 | "sns.set_style('whitegrid')\n",
369 | "fig, ax = plt.subplots()\n",
370 | "biz_df['review_count'].hist(ax=ax, bins=100)\n",
371 | "for pos in deciles:\n",
372 | "    handle = ax.axvline(pos, color='r')  # draw on ax explicitly, not on pyplot's current axes\n",
373 | "ax.legend([handle], ['deciles'], fontsize=14)  # one legend entry stands for every decile line\n",
374 | "ax.set_yscale('log')  # both axes are logarithmic\n",
375 | "ax.set_xscale('log')\n",
376 | "ax.tick_params(labelsize=14)\n",
377 | "ax.set_xlabel('Review Count', fontsize=14)\n",
378 | "ax.set_ylabel('Occurrence', fontsize=14)"
379 | ]
380 | },
381 | {
382 | "cell_type": "markdown",
383 | "metadata": {},
384 | "source": [
385 | "## 例 2-5. 分位数によるカウントの離散化"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": 14,
391 | "metadata": {},
392 | "outputs": [
393 | {
394 | "data": {
395 | "text/plain": [
396 | "array([1, 2, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 2, 1, 0, 3])"
397 | ]
398 | },
399 | "execution_count": 14,
400 | "metadata": {},
401 | "output_type": "execute_result"
402 | }
403 | ],
404 | "source": [
405 | "# Continue with large_counts from Example 2-3.\n",
406 | "import pandas as pd\n",
407 | "\n",
408 | "# Map each count to its quartile index (labels=False yields integer bin ids).\n",
409 | "pd.qcut(large_counts, q=4, labels=False)"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": 15,
415 | "metadata": {},
416 | "outputs": [
417 | {
418 | "data": {
419 | "text/plain": [
420 | "0.25 122.0\n",
421 | "0.50 926.0\n",
422 | "0.75 8286.0\n",
423 | "dtype: float64"
424 | ]
425 | },
426 | "execution_count": 15,
427 | "metadata": {},
428 | "output_type": "execute_result"
429 | }
430 | ],
431 | "source": [
432 | "# Compute the quartile boundaries (25th, 50th and 75th percentiles).\n",
433 | "large_counts_series = pd.Series(large_counts)\n",
434 | "large_counts_series.quantile(q=[0.25, 0.5, 0.75])"
435 | ]
436 | },
437 | {
438 | "cell_type": "markdown",
439 | "metadata": {},
440 | "source": [
441 | "## 例 2-6. 対数変換の前後でレビュー件数のヒストグラムを比較する"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": 16,
447 | "metadata": {},
448 | "outputs": [
449 | {
450 | "data": {
451 | "text/plain": [
452 | "Text(0, 0.5, 'Occurrence')"
453 | ]
454 | },
455 | "execution_count": 16,
456 | "metadata": {},
457 | "output_type": "execute_result"
458 | },
459 | {
460 | "data": {
461 | "image/png": "\n",
462 | "text/plain": [
463 | ""
464 | ]
465 | },
466 | "metadata": {},
467 | "output_type": "display_data"
468 | }
469 | ],
470 | "source": [
471 | "import numpy as np\n",
472 | "\n",
473 | "# Log-transform the review counts in biz_df, the Yelp business\n",
474 | "# DataFrame loaded in Example 2-2.\n",
475 | "# Note that 1 is added to the raw counts before taking the log so that\n",
476 | "# a review count of 0 does not turn into negative infinity.\n",
477 | "biz_df['log_review_count'] = np.log10(biz_df['review_count'] + 1)\n",
478 | "\n",
479 | "fig, (ax1, ax2) = plt.subplots(2,1)\n",
480 | "biz_df['review_count'].hist(ax=ax1, bins=100)\n",
481 | "ax1.tick_params(labelsize=14)\n",
482 | "ax1.set_xlabel('review_count', fontsize=14)\n",
483 | "ax1.set_ylabel('Occurrence', fontsize=14)\n",
484 | "\n",
485 | "biz_df['log_review_count'].hist(ax=ax2, bins=100)\n",
486 | "ax2.tick_params(labelsize=14)\n",
487 | "ax2.set_xlabel('log10(review_count)', fontsize=14)\n",
488 | "ax2.set_ylabel('Occurrence', fontsize=14)"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": null,
494 | "metadata": {},
495 | "outputs": [],
496 | "source": []
497 | }
498 | ],
499 | "metadata": {
500 | "kernelspec": {
501 | "display_name": "Python 3",
502 | "language": "python",
503 | "name": "python3"
504 | },
505 | "language_info": {
506 | "codemirror_mode": {
507 | "name": "ipython",
508 | "version": 3
509 | },
510 | "file_extension": ".py",
511 | "mimetype": "text/x-python",
512 | "name": "python",
513 | "nbconvert_exporter": "python",
514 | "pygments_lexer": "ipython3",
515 | "version": "3.7.0"
516 | }
517 | },
518 | "nbformat": 4,
519 | "nbformat_minor": 2
520 | }
521 |
--------------------------------------------------------------------------------