├── .gitignore
├── README.md
├── luigi
│   ├── data
│   │   └── .gitkeep
│   ├── logging.conf
│   ├── luigi.cfg
│   ├── model
│   │   └── .gitkeep
│   ├── sample_luigi.ipynb
│   └── sample_luigi.py
├── mlflow
│   ├── output.txt
│   └── sample.py
├── nlp
│   ├── swem.py
│   ├── test_nlplot
│   │   ├── 2020-08-08_pyldavis.html
│   │   ├── introduction_nlplot_twitter.ipynb
│   │   ├── unigram #データサイエンティストvs#kaggle.html
│   │   └── words distribution #データサイエンティストvs#kaggle.html
│   └── twitter_analytics_using_nlplot
│       ├── .DS_Store
│       ├── 2020-05-17_Co-occurrence network.html
│       ├── 2020-05-17_Tree of Most Common Words.html
│       ├── 2020-05-17_bi-gram.html
│       ├── 2020-05-17_number of words distribution.html
│       ├── 2020-05-17_pyldavis.html
│       ├── 2020-05-17_sunburst chart.html
│       ├── 2020-05-17_tri-gram.html
│       ├── 2020-05-17_uni-gram.html
│       ├── 2020-05-18_pyldavis.html
│       ├── 2020-05-19_pyldavis.html
│       ├── TwitterScraper.ipynb
│       ├── introduction_nlplot_twitter.html
│       ├── introduction_nlplot_twitter.ipynb
│       ├── merge_data.ipynb
│       ├── sample_twitter.csv
│       ├── unigram #データサイエンティストvs#kaggle.html
│       ├── wordcloud.png
│       └── words distribution #データサイエンティストvs#kaggle.html
├── other
│   └── hatenablog_css
│       └── design.css
├── recommendation
│   ├── graph
│   │   ├── keras_tutorial.ipynb
│   │   └── ml-latest-small
│   │       └── README.txt
│   └── matrix_factorization
│       ├── data
│       │   └── ml-25m
│       │       └── README.txt
│       └── keras_matrix_factorization.ipynb
└── streamlit
    └── sample.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.gitignore.io/api/code,linux,macos,python,pycharm,windows,jupyternotebooks
2 | # Edit at https://www.gitignore.io/?templates=code,linux,macos,python,pycharm,windows,jupyternotebooks
3 |
4 | ### Code ###
5 | .vscode/*
6 | !.vscode/settings.json
7 | !.vscode/tasks.json
8 | !.vscode/launch.json
9 | !.vscode/extensions.json
10 |
11 | ### JupyterNotebooks ###
12 | # gitignore template for Jupyter Notebooks
13 | # website: http://jupyter.org/
14 |
15 | .ipynb_checkpoints
16 | */.ipynb_checkpoints/*
17 |
18 | # IPython
19 | profile_default/
20 | ipython_config.py
21 |
22 | # Remove previous ipynb_checkpoints
23 | # git rm -r .ipynb_checkpoints/
24 |
25 | ### Linux ###
26 | *~
27 |
28 | # temporary files which can be created if a process still has a handle open of a deleted file
29 | .fuse_hidden*
30 |
31 | # KDE directory preferences
32 | .directory
33 |
34 | # Linux trash folder which might appear on any partition or disk
35 | .Trash-*
36 |
37 | # .nfs files are created when an open file is removed but is still being accessed
38 | .nfs*
39 |
40 | ### macOS ###
41 | # General
42 | .DS_Store
43 | .AppleDouble
44 | .LSOverride
45 |
46 | # Icon must end with two \r
47 | Icon
48 |
49 | # Thumbnails
50 | ._*
51 |
52 | # Files that might appear in the root of a volume
53 | .DocumentRevisions-V100
54 | .fseventsd
55 | .Spotlight-V100
56 | .TemporaryItems
57 | .Trashes
58 | .VolumeIcon.icns
59 | .com.apple.timemachine.donotpresent
60 |
61 | # Directories potentially created on remote AFP share
62 | .AppleDB
63 | .AppleDesktop
64 | Network Trash Folder
65 | Temporary Items
66 | .apdisk
67 |
68 | ### PyCharm ###
69 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
70 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
71 |
72 | # User-specific stuff
73 | .idea/**/workspace.xml
74 | .idea/**/tasks.xml
75 | .idea/**/usage.statistics.xml
76 | .idea/**/dictionaries
77 | .idea/**/shelf
78 |
79 | # Generated files
80 | .idea/**/contentModel.xml
81 |
82 | # Sensitive
or high-churn files 83 | .idea/**/dataSources/ 84 | .idea/**/dataSources.ids 85 | .idea/**/dataSources.local.xml 86 | .idea/**/sqlDataSources.xml 87 | .idea/**/dynamic.xml 88 | .idea/**/uiDesigner.xml 89 | .idea/**/dbnavigator.xml 90 | 91 | # Gradle 92 | .idea/**/gradle.xml 93 | .idea/**/libraries 94 | 95 | # Gradle and Maven with auto-import 96 | # When using Gradle or Maven with auto-import, you should exclude module files, 97 | # since they will be recreated, and may cause churn. Uncomment if using 98 | # auto-import. 99 | # .idea/modules.xml 100 | # .idea/*.iml 101 | # .idea/modules 102 | # *.iml 103 | # *.ipr 104 | 105 | # CMake 106 | cmake-build-*/ 107 | 108 | # Mongo Explorer plugin 109 | .idea/**/mongoSettings.xml 110 | 111 | # File-based project format 112 | *.iws 113 | 114 | # IntelliJ 115 | out/ 116 | 117 | # mpeltonen/sbt-idea plugin 118 | .idea_modules/ 119 | 120 | # JIRA plugin 121 | atlassian-ide-plugin.xml 122 | 123 | # Cursive Clojure plugin 124 | .idea/replstate.xml 125 | 126 | # Crashlytics plugin (for Android Studio and IntelliJ) 127 | com_crashlytics_export_strings.xml 128 | crashlytics.properties 129 | crashlytics-build.properties 130 | fabric.properties 131 | 132 | # Editor-based Rest Client 133 | .idea/httpRequests 134 | 135 | # Android studio 3.1+ serialized cache file 136 | .idea/caches/build_file_checksums.ser 137 | 138 | ### PyCharm Patch ### 139 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 140 | 141 | # *.iml 142 | # modules.xml 143 | # .idea/misc.xml 144 | # *.ipr 145 | 146 | # Sonarlint plugin 147 | .idea/**/sonarlint/ 148 | 149 | # SonarQube Plugin 150 | .idea/**/sonarIssues.xml 151 | 152 | # Markdown Navigator plugin 153 | .idea/**/markdown-navigator.xml 154 | .idea/**/markdown-navigator/ 155 | 156 | ### Python ### 157 | # Byte-compiled / optimized / DLL files 158 | __pycache__/ 159 | *.py[cod] 160 | *$py.class 161 | 162 | # C extensions 163 | *.so 164 | 165 | # Distribution / packaging 166 | .Python 167 | build/ 168 | develop-eggs/ 169 | dist/ 170 | downloads/ 171 | eggs/ 172 | .eggs/ 173 | lib/ 174 | lib64/ 175 | parts/ 176 | sdist/ 177 | var/ 178 | wheels/ 179 | pip-wheel-metadata/ 180 | share/python-wheels/ 181 | *.egg-info/ 182 | .installed.cfg 183 | *.egg 184 | MANIFEST 185 | 186 | # PyInstaller 187 | # Usually these files are written by a python script from a template 188 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 189 | *.manifest 190 | *.spec 191 | 192 | # Installer logs 193 | pip-log.txt 194 | pip-delete-this-directory.txt 195 | 196 | # Unit test / coverage reports 197 | htmlcov/ 198 | .tox/ 199 | .nox/ 200 | .coverage 201 | .coverage.* 202 | .cache 203 | nosetests.xml 204 | coverage.xml 205 | *.cover 206 | .hypothesis/ 207 | .pytest_cache/ 208 | 209 | # Translations 210 | *.mo 211 | *.pot 212 | 213 | # Scrapy stuff: 214 | .scrapy 215 | 216 | # Sphinx documentation 217 | docs/_build/ 218 | 219 | # PyBuilder 220 | target/ 221 | 222 | # pyenv 223 | .python-version 224 | 225 | # pipenv 226 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 227 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 228 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 229 | # install all needed dependencies. 
230 | #Pipfile.lock 231 | 232 | # celery beat schedule file 233 | celerybeat-schedule 234 | 235 | # SageMath parsed files 236 | *.sage.py 237 | 238 | # Spyder project settings 239 | .spyderproject 240 | .spyproject 241 | 242 | # Rope project settings 243 | .ropeproject 244 | 245 | # Mr Developer 246 | .mr.developer.cfg 247 | .project 248 | .pydevproject 249 | 250 | # mkdocs documentation 251 | /site 252 | 253 | # mypy 254 | .mypy_cache/ 255 | .dmypy.json 256 | dmypy.json 257 | 258 | # Pyre type checker 259 | .pyre/ 260 | 261 | ### Windows ### 262 | # Windows thumbnail cache files 263 | Thumbs.db 264 | Thumbs.db:encryptable 265 | ehthumbs.db 266 | ehthumbs_vista.db 267 | 268 | # Dump file 269 | *.stackdump 270 | 271 | # Folder config file 272 | [Dd]esktop.ini 273 | 274 | # Recycle Bin used on file shares 275 | $RECYCLE.BIN/ 276 | 277 | # Windows Installer files 278 | *.cab 279 | *.msi 280 | *.msix 281 | *.msm 282 | *.msp 283 | 284 | # Windows shortcuts 285 | *.lnk 286 | 287 | # End of https://www.gitignore.io/api/code,linux,macos,python,pycharm,windows,jupyternotebooks 288 | 289 | .idea 290 | 291 | # add 292 | *.csv 293 | *.tsv 294 | *.bz 295 | *.pkl 296 | *.pyc 297 | *.model 298 | *.png 299 | *.dat 300 | *.zip 301 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # takapy_blog 2 | -------------------------------------------------------------------------------- /luigi/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takapy0210/geek_blog/e36604f01d26f4d14bdacc6bb2995c929d49fdfa/luigi/data/.gitkeep -------------------------------------------------------------------------------- /luigi/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=streamHandler 6 | 7 | [logger_root] 8 | level=INFO 9 | handlers=streamHandler 10 | 11 | [formatters] 12 | keys=simpleFormatter 13 | 14 | [handler_streamHandler] 15 | class=logging.StreamHandler 16 | level=INFO 17 | formatter=simpleFormatter 18 | 19 | [formatter_simpleFormatter] 20 | format=[%(asctime)s] [%(levelname)5s] %(message)s 21 | datefmt=%Y-%m-%d %H:%M:%S 22 | -------------------------------------------------------------------------------- /luigi/luigi.cfg: -------------------------------------------------------------------------------- 1 | [core] 2 | # 不要なログを出力しないための設定 3 | log_level=INFO 4 | logging_conf_file=logging.conf 5 | 6 | [retcode] 7 | # エラーを通知するための設定 8 | already_running=10 9 | missing_data=10 10 | not_run=10 11 | task_failed=10 12 | scheduling_error=10 13 | unhandled_exception=10 14 | -------------------------------------------------------------------------------- /luigi/model/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takapy0210/geek_blog/e36604f01d26f4d14bdacc6bb2995c929d49fdfa/luigi/model/.gitkeep -------------------------------------------------------------------------------- /luigi/sample_luigi.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import warnings 3 | import logging 4 | 5 | import pandas as pd 6 | import luigi 7 | from luigi.util import requires 8 | from sklearn import datasets 9 | from sklearn.preprocessing import OneHotEncoder 10 | from sklearn.model_selection import train_test_split 
11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.metrics import classification_report, accuracy_score 13 | 14 | warnings.filterwarnings("ignore") 15 | logger = logging.getLogger() 16 | 17 | 18 | class LoadDataset(luigi.Task): 19 | """データセットをロードするクラス""" 20 | task_namespace = 'titanic_tasks' 21 | 22 | def output(self): 23 | # return luigi.LocalTarget("data/titanic.csv") # csvで出力する場合 24 | return luigi.LocalTarget("data/titanic.pkl", format=luigi.format.Nop) 25 | 26 | def run(self): 27 | # titanicデータの読み込み 28 | df = datasets.fetch_openml("titanic", version=1, as_frame=True, return_X_y=False).frame 29 | logger.info(f'Data shape: {df.shape}') 30 | 31 | # pklで出力する 32 | with self.output().open('w') as f: 33 | f.write(pickle.dumps(df, protocol=pickle.HIGHEST_PROTOCOL)) 34 | 35 | # csvで出力したい場合は普通にpandasで出力する 36 | # 型が崩れる可能性があるので非推奨ではある 37 | # df.to_csv("data/titanic.csv", index=False) 38 | 39 | 40 | @requires(LoadDataset) 41 | class Processing(luigi.Task): 42 | """データの加工を行う""" 43 | task_namespace = 'titanic_tasks' 44 | 45 | def output(self): 46 | # return luigi.LocalTarget("data/processing_titanic.csv") # csvで出力する場合 47 | return luigi.LocalTarget("data/processing_titanic.pkl", format=luigi.format.Nop) 48 | 49 | def run(self): 50 | # データの読み込み 51 | with self.input().open() as f: 52 | # df = pd.read_csv(f) # pandasで読み込むパターン 53 | df = pickle.load(f) # pickleで読み込むパターン 54 | logger.info(f'Before Data shape: {df.shape}') 55 | 56 | # 欠損値処理 57 | df.loc[:, 'age'] = df['age'].fillna(df['age'].mean()) 58 | df.loc[:, 'fare'] = df['fare'].fillna(df['fare'].mean()) 59 | 60 | # カテゴリエンコード 61 | categorical_cols = ["pclass", "sex", "embarked"] 62 | df = self.sklearn_oh_encoder(df=df, cols=categorical_cols, drop_col=True) 63 | logger.info(f'After Data shape: {df.shape}') 64 | 65 | # 学習に使用するカラムのみを出力 66 | use_cols = [ 67 | 'survived', 68 | 'age', 69 | 'sibsp', 70 | 'parch', 71 | 'fare', 72 | 'pclass_1.0', 73 | 'pclass_2.0', 74 | 'pclass_3.0', 75 | 'sex_female', 76 | 'sex_male', 77 | 'embarked_C', 78 | 'embarked_Q', 79 | 'embarked_S', 80 | 'embarked_nan' 81 | ] 82 | df = df[use_cols] 83 | 84 | # 保存 85 | with self.output().open('w') as f: 86 | f.write(pickle.dumps(df, protocol=pickle.HIGHEST_PROTOCOL)) 87 | 88 | def sklearn_oh_encoder(self, df, cols, drop_col=False): 89 | """カテゴリ変換 90 | sklearnのOneHotEncoderでEncodingを行う 91 | 92 | Args: 93 | df: カテゴリ変換する対象のデータフレーム 94 | cols (list of str): カテゴリ変換する対象のカラムリスト 95 | drop_col (bool): エンコード対象のカラムを削除するか否か 96 | 97 | Returns: 98 | pd.Dataframe: dfにカテゴリ変換したカラムを追加したデータフレーム 99 | """ 100 | output_df = df.copy() 101 | for col in cols: 102 | ohe = OneHotEncoder(sparse=False, handle_unknown='ignore') 103 | ohe_df = pd.DataFrame((ohe.fit_transform(output_df[[col]])), columns=ohe.categories_[0]) 104 | ohe_df = ohe_df.add_prefix(f'{col}_') 105 | # 元のDFに結合 106 | output_df = pd.concat([output_df, ohe_df], axis=1) 107 | if drop_col: 108 | output_df = output_df.drop(col, axis=1) 109 | return output_df 110 | 111 | 112 | @requires(Processing) 113 | class TrainTestSplit(luigi.Task): 114 | """データを学習データと検証データに分割する""" 115 | task_namespace = 'titanic_tasks' 116 | 117 | def output(self): 118 | return [luigi.LocalTarget("data/processing_titanic_train.pkl", format=luigi.format.Nop), 119 | luigi.LocalTarget("data/processing_titanic_test.pkl", format=luigi.format.Nop)] 120 | 121 | def run(self): 122 | # データの読み込み 123 | with self.input().open() as f: 124 | df = pickle.load(f) # pickleで読み込むパターン 125 | 126 | train, test = train_test_split(df, test_size=0.3, shuffle=True, 
stratify=df['survived'], random_state=42) 127 | logger.info(f'Train shape: {train.shape}') 128 | logger.info(f'Test shape: {test.shape}') 129 | 130 | with self.output()[0].open('w') as f: 131 | f.write(pickle.dumps(train, protocol=pickle.HIGHEST_PROTOCOL)) 132 | 133 | with self.output()[1].open('w') as f: 134 | f.write(pickle.dumps(test, protocol=pickle.HIGHEST_PROTOCOL)) 135 | 136 | 137 | @requires(TrainTestSplit) 138 | class Training(luigi.Task): 139 | """学習""" 140 | task_namespace = 'titanic_tasks' 141 | 142 | def output(self): 143 | return luigi.LocalTarget("model/random_forest.model", format=luigi.format.Nop) 144 | 145 | def run(self): 146 | # データの読み込み 147 | with self.input()[0].open() as f: 148 | train = pickle.load(f) 149 | 150 | logger.info(f'Train shape: {train.shape}') 151 | 152 | target_col = 'survived' 153 | X_train = train.drop(target_col, axis=1) 154 | y_train = train[target_col] 155 | 156 | model = RandomForestClassifier(random_state=1) 157 | model.fit(X_train, y_train) 158 | 159 | # 保存 160 | with self.output().open('w') as f: 161 | f.write(pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL)) 162 | 163 | 164 | @requires(TrainTestSplit, Training) 165 | class Predict(luigi.Task): 166 | """推論""" 167 | task_namespace = 'titanic_tasks' 168 | 169 | def output(self): 170 | return luigi.LocalTarget("data/predict_data.csv") 171 | 172 | def run(self): 173 | # データの読み込み 174 | with self.input()[0][1].open() as f: 175 | valid = pickle.load(f) 176 | 177 | # モデルの読み込み 178 | with self.input()[1].open() as f: 179 | model = pickle.load(f) 180 | 181 | logger.info(f'Valid data shape: {valid.shape}') 182 | 183 | target_col = 'survived' 184 | X_valid = valid.drop(target_col, axis=1) 185 | y_valid = valid[target_col] 186 | 187 | # 予測 188 | y_pred = model.predict(X_valid) 189 | logger.info(f'Accuracy Score: {accuracy_score(y_valid, y_pred)}') 190 | logger.info('\n' + classification_report(y_valid, y_pred)) 191 | 192 | # # 保存 193 | valid.loc[:, 'y_pred'] = y_pred 194 | valid.to_csv('data/predict_data.csv', index=False) 195 | 196 | 197 | @requires(Predict) 198 | class MyInvokerTask(luigi.WrapperTask): 199 | task_namespace = 'titanic_tasks' 200 | pass 201 | 202 | 203 | if __name__ == '__main__': 204 | 205 | # 設定ファイルの読み込み 206 | luigi.configuration.LuigiConfigParser.add_config_path('./luigi.cfg') 207 | # 実行 208 | luigi.build([MyInvokerTask()], local_scheduler=True) 209 | # luigi.build([MyInvokerTask()], local_scheduler=False) # ブラウザからチェックしたい場合はこちら 210 | -------------------------------------------------------------------------------- /mlflow/output.txt: -------------------------------------------------------------------------------- 1 | Hello world sample! 
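A note on the Luigi pipeline in luigi/sample_luigi.py above: because every task declares an output() target, Luigi only re-runs a task when the file behind that target is missing, so individual stages can be rebuilt in isolation. The snippet below is only a sketch and is not a file in this repository; it assumes it is executed from the luigi/ directory (so that luigi.cfg, logging.conf and the data/ folder are picked up) and it reuses the Processing task defined above.

import luigi

from sample_luigi import Processing  # task class defined in luigi/sample_luigi.py above

if __name__ == "__main__":
    # Builds Processing; if data/titanic.pkl is missing, LoadDataset runs first.
    # local_scheduler=True mirrors the __main__ block of sample_luigi.py; with
    # local_scheduler=False a separately started luigid scheduler (and its web UI) is used.
    luigi.build([Processing()], local_scheduler=True)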
-------------------------------------------------------------------------------- /mlflow/sample.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | from mlflow import log_metric, log_param, log_artifact, set_tag 3 | 4 | if __name__ == "__main__": 5 | 6 | tracking_uri = '/Users/takapy/python/competition/mlflow/mlruns' 7 | mlflow.set_tracking_uri(tracking_uri) 8 | mlflow.set_experiment("test-experiment") 9 | mlflow.start_run(run_name='run_name001') 10 | 11 | # Log a parameter (key-value pair) 12 | log_param('param1', 42) 13 | 14 | # Log a metric; metrics can be updated throughout the run 15 | log_metric('fold1_score', 9.99) 16 | log_metric('fold2_score', 9.92) 17 | log_metric('fold3_score', 9.78) 18 | 19 | # Log an artifact (output file) 20 | with open("output.txt", "w") as f: 21 | f.write("Hello world sample!") 22 | 23 | log_artifact("output.txt") 24 | 25 | set_tag('tag1', 'this is tag1') 26 | set_tag('tag2', 'this is tag2') 27 | 28 | mlflow.end_run() 29 | -------------------------------------------------------------------------------- /nlp/swem.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from gensim.models import word2vec 4 | 5 | 6 | class SWEM(): 7 | """単語埋め込み (Word Embedding) のみを利用して文章埋め込み (Sentence Embedding) を計算する 8 | 9 | 参考URL:https://arxiv.org/abs/1805.09843v1 10 | 11 | Attributes: 12 | word2vec (word2vec): word2vecの事前学習モデル 13 | dim (int): word2vecの事前学習モデルの次元数 14 | oov_initialize_range (int): word2vecの事前学習モデルに含まれていない単語に割り当てるベクトル 15 | 16 | """ 17 | 18 | def __init__(self, word2vec_model_name): 19 | self.word2vec = word2vec.Word2Vec.load(word2vec_model_name) 20 | self.dim = self.word2vec.trainables.layer1_size 21 | self.oov_initialize_range = (-0.01, 0.01) 22 | 23 | def get_word_embeddings(self, words) -> list: 24 | """word2vecから単語のベクトルを取得 25 | 26 | Args: 27 | words (list of str): 重みを取得したい単語のリスト 28 | 29 | Returns: 30 | list (float): 全単語のベクトルが格納された2次元リスト 31 | 32 | """ 33 | np.random.seed(abs(hash(len(words))) % (10 ** 8)) 34 | vectors = [] 35 | for w in words: 36 | if w in self.word2vec: 37 | vectors.append(self.word2vec[w]) 38 | else: 39 | vectors.append(np.random.uniform(self.oov_initialize_range[0], self.oov_initialize_range[1], self.dim)) 40 | return vectors 41 | 42 | def average_pooling(self, text) -> np.array: 43 | """textに含まれる全単語ベクトルの次元毎の平均を計算する 44 | 45 | Args: 46 | text (str): ベクトルを計算したい文章 47 | 48 | Returns: 49 | np.array: 計算後のベクトル 50 | 51 | """ 52 | emb = [] 53 | for words in text: 54 | word_embeddings = self.get_word_embeddings(words) 55 | emb.append(np.nanmean(word_embeddings, axis=0)) 56 | return np.array(emb) 57 | 58 | def max_pooling(self, text) -> np.array: 59 | """textに含まれる全単語ベクトルの次元毎の最大値を計算する 60 | 61 | Args: 62 | text (str): ベクトルを計算したい文章 63 | 64 | Returns: 65 | np.array: 計算後のベクトル 66 | 67 | """ 68 | emb = [] 69 | for words in text: 70 | word_embeddings = self.get_word_embeddings(words) 71 | emb.append(np.max(word_embeddings, axis=0)) 72 | return np.array(emb) 73 | 74 | def concat_average_max_pooling(self, text) -> np.array: 75 | """textに含まれる全単語ベクトルの次元毎の平均値と最大値を計算した後それぞれを結合したベクトルを計算する 76 | 77 | 平均ベクトル[1, 3, 4, 2, -2]と最大値ベクトル[5, 7, 3, 1, 3]があった場合に 78 | [1, 3, 4, 2, -2, 5, 7, 3, 1, 3]のベクトルを定義しreturnする 79 | 80 | Args: 81 | text (str): ベクトルを計算したい文章 82 | 83 | Returns: 84 | np.array: 計算後のベクトル 85 | 86 | """ 87 | emb = [] 88 | for words in text: 89 | word_embeddings = self.get_word_embeddings(words) 90 | 
emb.append(np.r_[np.nanmean(word_embeddings, axis=0), np.max(word_embeddings, axis=0)]) 91 | return np.array(emb) 92 | 93 | def hier_or_avg_pooling(self, text, window) -> np.array: 94 | """textに含まれる単語に対してn-gramのように固定長のウィンドウでaverage-poolingした結果に対してmax poolingする 95 | 96 | 単語数がwindowに満たない場合は、単純な平均(average_pooling)を計算する 97 | 98 | Args: 99 | text (str): ベクトルを計算したい文章 100 | window (int): n-gramのウィンドウの幅 101 | 102 | Returns: 103 | np.array: 計算後のベクトル 104 | 105 | """ 106 | emb = [] 107 | for words in text: 108 | word_embeddings = self.get_word_embeddings(words) 109 | text_len = len(word_embeddings) 110 | if window > text_len: 111 | emb.append(np.nanmean(word_embeddings, axis=0)) 112 | else: 113 | window_average_pooling_vec = [np.nanmean(word_embeddings[i:i + window], axis=0) 114 | for i in range(text_len - window + 1)] 115 | emb.append(np.max(window_average_pooling_vec, axis=0)) 116 | return np.array(emb) 117 | 118 | def calculate_emb(self, df, col, window, swem_type) -> pd.DataFrame: 119 | """swemを用いて質問の埋め込みを算出する 120 | 121 | Args: 122 | df (pd.Dataframe): 対象のDF 123 | col (str): token化後のテキストが設定されているカラム名 124 | window (int): hierarchical_poolingする際のwindow数 125 | swem_type (int): SWEMをどの計算方法で算出するかを指定 126 | (1:average_pooling, 2:max_pooling, 3:concat_average_max_pooling, 4:hier_or_avg_pooling) 127 | 128 | Returns: 129 | pd.DataFrame: 埋め込み(N次元)のデータフレーム 130 | 131 | """ 132 | 133 | # 質問の埋め込みを計算 134 | # swem_typeによって埋め込みの計算処理を分ける 135 | if swem_type == 1: 136 | swem_emb = self.average_pooling(df[col].values.tolist()) 137 | elif swem_type == 2: 138 | swem_emb = self.max_pooling(df[col].values.tolist()) 139 | elif swem_type == 3: 140 | swem_emb = self.concat_average_max_pooling(df[col].values.tolist()) 141 | else: 142 | swem_emb = self.hier_or_avg_pooling(df[col].values.tolist(), window) 143 | 144 | # データフレームに変換 145 | swem_emb = pd.DataFrame(swem_emb) 146 | swem_emb = swem_emb.add_prefix('d_') 147 | return swem_emb 148 | -------------------------------------------------------------------------------- /nlp/test_nlplot/2020-08-08_pyldavis.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | -------------------------------------------------------------------------------- /nlp/twitter_analytics_using_nlplot/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takapy0210/geek_blog/e36604f01d26f4d14bdacc6bb2995c929d49fdfa/nlp/twitter_analytics_using_nlplot/.DS_Store -------------------------------------------------------------------------------- /nlp/twitter_analytics_using_nlplot/2020-05-17_pyldavis.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | -------------------------------------------------------------------------------- /nlp/twitter_analytics_using_nlplot/2020-05-18_pyldavis.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | -------------------------------------------------------------------------------- /nlp/twitter_analytics_using_nlplot/2020-05-19_pyldavis.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | -------------------------------------------------------------------------------- /nlp/twitter_analytics_using_nlplot/merge_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 13, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "df1 = pd.read_csv('twitter_ai.csv')\n", 19 | "df2 = pd.read_csv('twitter_ds.csv')\n", 20 | "df3 = pd.read_csv('twitter_kaggle.csv')\n", 21 | "df4 = pd.read_csv('twitter_python.csv')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 14, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/html": [ 32 | "
\n", 33 | "\n", 46 | "\n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | "
searched_forpost_urlpostuserpost_timehashtagshashtag_len
0#人工知能https://twitter.com/benrishi1/status/125772625...経営者が知るべき人工知能が生むビジネスチャンスを つかむための戦略とは? \\nhttps:/...人工知能、機械学習、IoTニュース1.588700e+09機械学習 ai iot 人工知能4
1#人工知能https://twitter.com/ciwame_mizuo/status/124367...AIで風景を作る\\niteration/epoch: 11000/2\\n#makeingla...みずおNaNmakeinglandscape deeplearning nowlearning ai g...6
2#人工知能https://twitter.com/sachi_n1020/status/1257695...主婦だけど、何か仕事をしたいと思ってませんか?\\nスマホ片手で在宅でできますよ〜\\n#副業 ...naru@相互フォロー1.588693e+09働き方改革 副業探しています 集客 インターネット 社会問題 環境問題 相互フォロー ai ...11
3#人工知能https://twitter.com/JohnnyFakaHioki/status/124...シンギュラリティ後の世界が明るく楽しいものなら、その日を待つためだけに25年生きてもいいな。...草木の妖精-ジョニー・フローラ -Johnny Flora-NaNシンギュラリティ 人工知能2
4#人工知能https://twitter.com/kawachanaccount/status/125...AIは恋する #小説 #SF #人工知能 #プログラマー\\nhttps://t.co/oQz...KAWAMURA小説アカウント1.588693e+09小説 プログラマー sf 人工知能4
........................
6507#人工知能https://twitter.com/ciwame_mizuo/status/124368...AIで風景を作る\\niteration/epoch: 12000/2\\n#makeingla...みずおNaNmakeinglandscape deeplearning nowlearning ai g...6
6508#人工知能https://twitter.com/KyukyokuCom/status/1243682...人工知能(AI)が米国、数百万人の職を奪う!「やりすぎ都市伝説で言ってた人類選別がはじまる?...究極のまとめ.comNaN都市伝説 やりすぎ都市伝説 ai 人工知能4
6509#人工知能https://twitter.com/InfoAI4/status/12436819956...「なぜ」を理解するAIの構築を、あの人工知能の第一人者は目指している|https://t.c...Info_AINaNai 人工知能2
6510#人工知能https://twitter.com/meza_janken/status/1243679...【2戦目】めざましじゃんけん結果速報:\\nチョキ✌でした。グー✊が勝ちです。\\n今回の相手:...めざましじゃんけん結果速報NaNめざましじゃんけん じゃんけん みきいえ ゆりやんレトリィバァ めざましどようび めざましテ...9
6511#人工知能https://twitter.com/meza_janken/status/1243678...学習中の人工知能なので、とても低い勝率です。\\n次回3月28日 【2戦目】のじゃんけん予想は...めざましじゃんけん結果速報NaNめざましじゃんけん じゃんけん opencv keras raspberrypi rnn め...11
\n", 172 | "

6512 rows × 7 columns

\n", 173 | "
" 174 | ], 175 | "text/plain": [ 176 | " searched_for post_url \\\n", 177 | "0 #人工知能 https://twitter.com/benrishi1/status/125772625... \n", 178 | "1 #人工知能 https://twitter.com/ciwame_mizuo/status/124367... \n", 179 | "2 #人工知能 https://twitter.com/sachi_n1020/status/1257695... \n", 180 | "3 #人工知能 https://twitter.com/JohnnyFakaHioki/status/124... \n", 181 | "4 #人工知能 https://twitter.com/kawachanaccount/status/125... \n", 182 | "... ... ... \n", 183 | "6507 #人工知能 https://twitter.com/ciwame_mizuo/status/124368... \n", 184 | "6508 #人工知能 https://twitter.com/KyukyokuCom/status/1243682... \n", 185 | "6509 #人工知能 https://twitter.com/InfoAI4/status/12436819956... \n", 186 | "6510 #人工知能 https://twitter.com/meza_janken/status/1243679... \n", 187 | "6511 #人工知能 https://twitter.com/meza_janken/status/1243678... \n", 188 | "\n", 189 | " post \\\n", 190 | "0 経営者が知るべき人工知能が生むビジネスチャンスを つかむための戦略とは? \\nhttps:/... \n", 191 | "1 AIで風景を作る\\niteration/epoch: 11000/2\\n#makeingla... \n", 192 | "2 主婦だけど、何か仕事をしたいと思ってませんか?\\nスマホ片手で在宅でできますよ〜\\n#副業 ... \n", 193 | "3 シンギュラリティ後の世界が明るく楽しいものなら、その日を待つためだけに25年生きてもいいな。... \n", 194 | "4 AIは恋する #小説 #SF #人工知能 #プログラマー\\nhttps://t.co/oQz... \n", 195 | "... ... \n", 196 | "6507 AIで風景を作る\\niteration/epoch: 12000/2\\n#makeingla... \n", 197 | "6508 人工知能(AI)が米国、数百万人の職を奪う!「やりすぎ都市伝説で言ってた人類選別がはじまる?... \n", 198 | "6509 「なぜ」を理解するAIの構築を、あの人工知能の第一人者は目指している|https://t.c... \n", 199 | "6510 【2戦目】めざましじゃんけん結果速報:\\nチョキ✌でした。グー✊が勝ちです。\\n今回の相手:... \n", 200 | "6511 学習中の人工知能なので、とても低い勝率です。\\n次回3月28日 【2戦目】のじゃんけん予想は... \n", 201 | "\n", 202 | " user post_time \\\n", 203 | "0 人工知能、機械学習、IoTニュース 1.588700e+09 \n", 204 | "1 みずお NaN \n", 205 | "2 naru@相互フォロー 1.588693e+09 \n", 206 | "3 草木の妖精-ジョニー・フローラ -Johnny Flora- NaN \n", 207 | "4 KAWAMURA小説アカウント 1.588693e+09 \n", 208 | "... ... ... \n", 209 | "6507 みずお NaN \n", 210 | "6508 究極のまとめ.com NaN \n", 211 | "6509 Info_AI NaN \n", 212 | "6510 めざましじゃんけん結果速報 NaN \n", 213 | "6511 めざましじゃんけん結果速報 NaN \n", 214 | "\n", 215 | " hashtags hashtag_len \n", 216 | "0 機械学習 ai iot 人工知能 4 \n", 217 | "1 makeinglandscape deeplearning nowlearning ai g... 6 \n", 218 | "2 働き方改革 副業探しています 集客 インターネット 社会問題 環境問題 相互フォロー ai ... 11 \n", 219 | "3 シンギュラリティ 人工知能 2 \n", 220 | "4 小説 プログラマー sf 人工知能 4 \n", 221 | "... ... ... \n", 222 | "6507 makeinglandscape deeplearning nowlearning ai g... 6 \n", 223 | "6508 都市伝説 やりすぎ都市伝説 ai 人工知能 4 \n", 224 | "6509 ai 人工知能 2 \n", 225 | "6510 めざましじゃんけん じゃんけん みきいえ ゆりやんレトリィバァ めざましどようび めざましテ... 9 \n", 226 | "6511 めざましじゃんけん じゃんけん opencv keras raspberrypi rnn め... 11 \n", 227 | "\n", 228 | "[6512 rows x 7 columns]" 229 | ] 230 | }, 231 | "execution_count": 14, 232 | "metadata": {}, 233 | "output_type": "execute_result" 234 | } 235 | ], 236 | "source": [ 237 | "df1" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 15, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "data": { 247 | "text/html": [ 248 | "
\n", 249 | "\n", 262 | "\n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | "
searched_forpost_urlpostuserpost_timehashtagshashtag_len
0#データサイエンティストhttps://twitter.com/K03475916/status/125781058...#今日の積み上げ\\n・Udemy中級編7.8\\n・DS養成講座30回31回課題\\n・prog...K1.588720e+09筋トレ 今日の積み上げ 駆け出しエンジニアと繋がりたい データサイエンティスト4
1#データサイエンティストhttps://twitter.com/ponpoko_code/status/844690...データ分析って給料高いな~。人材不足らしい。統計学、プログラミングに加えて人工知能も出来ると...ぽんぽこ@プロぽこ-プログラミングNaN統計学 人工知能 ダイヤモンド データサイエンティスト プログラミング5
2#データサイエンティストhttps://twitter.com/sazan_dora/status/79743408...データサイエンティストになりたい人向けまとめ - NAVER まとめ https://t.c...サザンドラNaNデータマイニング データサイエンス データサイエンティスト3
3#データサイエンティストhttps://twitter.com/WXBC_jp/status/11302604362...【#野球 ネタ⚾️】\\n\\n#ヤクルトスワローズ の #山田哲人 選手の成績データを、気温・...気象ビジネス推進コンソーシアム(略称:WXBC,事務局:気象庁)NaN山田哲人 気象 データサイエンティスト ヤクルトスワローズ4
4#データサイエンティストhttps://twitter.com/hamahiroshi164/status/1257...「データサイエンティストを目指す人のzoom飲み会」終わりました💡\\n\\n現場のリアルが知れ...DSになる男ハマヒロシ30歳子煩悩1.588689e+09プログラミング初心者 データサイエンティスト2
........................
1484#データサイエンティストhttps://twitter.com/Iggjj21/status/12162382360...#今日の積み上げ \\n\\n✔️PYQでpandasの演習\\n→主に表の作成方法\\n✔️cou...ゆう|新社会人エンジニアNaN駆け出しエンジニア python 今日の積み上げ 機械学習 データサイエンティスト ai 駆...7
1485#データサイエンティストhttps://twitter.com/fumi_bz/status/10116001000...Agile HR Day にて。そもそもいないデータサイエンティスト人材‥っていう、スライド...fumi@HR Tech商品企画NaN機械学習 hrtech ディープラーニング データサイエンティスト4
1486#データサイエンティストhttps://twitter.com/joe_i/status/1794720198705...#データサイエンティスト 不足の問題は昨日も話題に上がってましたが、本当にそうですよねー。昨...石山 城 / Joe ISHIYAMANaNデータサイエンティスト opencu2
1487#データサイエンティストhttps://twitter.com/intern_aogaku/status/79775...統計学を学ぶ学生必見!\\n\\nIoT企業でデータサイエンティストの有給インターンシップ募集!...青山学院大学 インターン就活おすすめ情報NaNインターンシップ インターン マーケティング データサイエンス データサイエンティスト 統計...7
1488#データサイエンティストhttps://twitter.com/shibaurakogyo_u/status/797...統計学を学ぶ学生必見!\\n\\nIoT企業でデータサイエンティストの有給インターンシップ募集!...芝浦工業大学 インターン就活おすすめ情報NaNインターンシップ インターン マーケティング データサイエンス データサイエンティスト 統計...7
\n", 388 | "

1489 rows × 7 columns

\n", 389 | "
" 390 | ], 391 | "text/plain": [ 392 | " searched_for post_url \\\n", 393 | "0 #データサイエンティスト https://twitter.com/K03475916/status/125781058... \n", 394 | "1 #データサイエンティスト https://twitter.com/ponpoko_code/status/844690... \n", 395 | "2 #データサイエンティスト https://twitter.com/sazan_dora/status/79743408... \n", 396 | "3 #データサイエンティスト https://twitter.com/WXBC_jp/status/11302604362... \n", 397 | "4 #データサイエンティスト https://twitter.com/hamahiroshi164/status/1257... \n", 398 | "... ... ... \n", 399 | "1484 #データサイエンティスト https://twitter.com/Iggjj21/status/12162382360... \n", 400 | "1485 #データサイエンティスト https://twitter.com/fumi_bz/status/10116001000... \n", 401 | "1486 #データサイエンティスト https://twitter.com/joe_i/status/1794720198705... \n", 402 | "1487 #データサイエンティスト https://twitter.com/intern_aogaku/status/79775... \n", 403 | "1488 #データサイエンティスト https://twitter.com/shibaurakogyo_u/status/797... \n", 404 | "\n", 405 | " post \\\n", 406 | "0 #今日の積み上げ\\n・Udemy中級編7.8\\n・DS養成講座30回31回課題\\n・prog... \n", 407 | "1 データ分析って給料高いな~。人材不足らしい。統計学、プログラミングに加えて人工知能も出来ると... \n", 408 | "2 データサイエンティストになりたい人向けまとめ - NAVER まとめ https://t.c... \n", 409 | "3 【#野球 ネタ⚾️】\\n\\n#ヤクルトスワローズ の #山田哲人 選手の成績データを、気温・... \n", 410 | "4 「データサイエンティストを目指す人のzoom飲み会」終わりました💡\\n\\n現場のリアルが知れ... \n", 411 | "... ... \n", 412 | "1484 #今日の積み上げ \\n\\n✔️PYQでpandasの演習\\n→主に表の作成方法\\n✔️cou... \n", 413 | "1485 Agile HR Day にて。そもそもいないデータサイエンティスト人材‥っていう、スライド... \n", 414 | "1486 #データサイエンティスト 不足の問題は昨日も話題に上がってましたが、本当にそうですよねー。昨... \n", 415 | "1487 統計学を学ぶ学生必見!\\n\\nIoT企業でデータサイエンティストの有給インターンシップ募集!... \n", 416 | "1488 統計学を学ぶ学生必見!\\n\\nIoT企業でデータサイエンティストの有給インターンシップ募集!... \n", 417 | "\n", 418 | " user post_time \\\n", 419 | "0 K 1.588720e+09 \n", 420 | "1 ぽんぽこ@プロぽこ-プログラミング NaN \n", 421 | "2 サザンドラ NaN \n", 422 | "3 気象ビジネス推進コンソーシアム(略称:WXBC,事務局:気象庁) NaN \n", 423 | "4 DSになる男ハマヒロシ30歳子煩悩 1.588689e+09 \n", 424 | "... ... ... \n", 425 | "1484 ゆう|新社会人エンジニア NaN \n", 426 | "1485 fumi@HR Tech商品企画 NaN \n", 427 | "1486 石山 城 / Joe ISHIYAMA NaN \n", 428 | "1487 青山学院大学 インターン就活おすすめ情報 NaN \n", 429 | "1488 芝浦工業大学 インターン就活おすすめ情報 NaN \n", 430 | "\n", 431 | " hashtags hashtag_len \n", 432 | "0 筋トレ 今日の積み上げ 駆け出しエンジニアと繋がりたい データサイエンティスト 4 \n", 433 | "1 統計学 人工知能 ダイヤモンド データサイエンティスト プログラミング 5 \n", 434 | "2 データマイニング データサイエンス データサイエンティスト 3 \n", 435 | "3 山田哲人 気象 データサイエンティスト ヤクルトスワローズ 4 \n", 436 | "4 プログラミング初心者 データサイエンティスト 2 \n", 437 | "... ... ... \n", 438 | "1484 駆け出しエンジニア python 今日の積み上げ 機械学習 データサイエンティスト ai 駆... 7 \n", 439 | "1485 機械学習 hrtech ディープラーニング データサイエンティスト 4 \n", 440 | "1486 データサイエンティスト opencu 2 \n", 441 | "1487 インターンシップ インターン マーケティング データサイエンス データサイエンティスト 統計... 7 \n", 442 | "1488 インターンシップ インターン マーケティング データサイエンス データサイエンティスト 統計... 7 \n", 443 | "\n", 444 | "[1489 rows x 7 columns]" 445 | ] 446 | }, 447 | "execution_count": 15, 448 | "metadata": {}, 449 | "output_type": "execute_result" 450 | } 451 | ], 452 | "source": [ 453 | "df2" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 16, 459 | "metadata": {}, 460 | "outputs": [ 461 | { 462 | "data": { 463 | "text/html": [ 464 | "
\n", 465 | "\n", 478 | "\n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | "
searched_forpost_urlpostuserpost_timehashtagshashtag_len
0#kagglehttps://twitter.com/AbeShumpei/status/11002246...はじめて出して見た。8000番と順位は低いけどこれからがんばるぞいDoing my part...abe shumpeiNaNkaggle machinelearning2
1#kagglehttps://twitter.com/soh_1121_/status/125767813...◯ 実施内容\\n・ #Paiza 1h\\n・ #言語処理100本ノック 0.5h\\n・ #K...Soh1.588689e+09言語処理100本ノック apg4b 個人開発 kaggle 今日の積み上げ 駆け出しエンジニ...9
2#kagglehttps://twitter.com/WlWH74jtP0zHxmj/status/125...AIの目標はやっぱりKaggleかな\\nグランドマスターとか憧れすぎる\\nいろいろ挑戦してみ...やますん@駆け出しエンジニア1.588677e+09プログラミング学習 ai kaggle3
3#kagglehttps://twitter.com/bonjinjpn/status/110005131...#SIGNATE - Data Science Competition #ML #DL #A...K2NaNdl signate kaggle ai ml5
4#kagglehttps://twitter.com/tkydub/status/109950639977...届きました🙌 #kaggle #機械学習 #機械学習のための特徴量エンジニアリングtoki⛅NaNkaggle 機械学習のための特徴量エンジニアリング 機械学習3
........................
1219#kagglehttps://twitter.com/Wataoka_Koki/status/110109...kaggle二日目\\n\\nDNNのkernelを自分好みにアレンジしながら実装した。\\n\\n...綿岡 晃輝NaNsantander kaggle2
1220#kagglehttps://twitter.com/codexa_net/status/11010897...【Kaggle初心者入門編】タイタニック号で生き残るのは誰?\\n\\n#Kaggle #入門\\...codexa[コデクサ:人工知能特化WEBスクール]NaN入門 kaggle2
1221#kagglehttps://twitter.com/hntk03/status/110099597289...Feature Engineering #featureengineering #kaggl...hntkNaNfeatureengineering kaggle2
1222#kagglehttps://twitter.com/sa178kla/status/1100890178...もっと頑張ろうー\\nCausing #superintelligence, one subm...yoNaNsuperintelligence kaggle2
1223#kagglehttps://twitter.com/dichika/status/29512284824...適当にgbmにつっこんだら0.76555で924位ダッタヨ #kaggle #titanic棗太郎NaNtitanic kaggle2
\n", 604 | "

1224 rows × 7 columns

\n", 605 | "
" 606 | ], 607 | "text/plain": [ 608 | " searched_for post_url \\\n", 609 | "0 #kaggle https://twitter.com/AbeShumpei/status/11002246... \n", 610 | "1 #kaggle https://twitter.com/soh_1121_/status/125767813... \n", 611 | "2 #kaggle https://twitter.com/WlWH74jtP0zHxmj/status/125... \n", 612 | "3 #kaggle https://twitter.com/bonjinjpn/status/110005131... \n", 613 | "4 #kaggle https://twitter.com/tkydub/status/109950639977... \n", 614 | "... ... ... \n", 615 | "1219 #kaggle https://twitter.com/Wataoka_Koki/status/110109... \n", 616 | "1220 #kaggle https://twitter.com/codexa_net/status/11010897... \n", 617 | "1221 #kaggle https://twitter.com/hntk03/status/110099597289... \n", 618 | "1222 #kaggle https://twitter.com/sa178kla/status/1100890178... \n", 619 | "1223 #kaggle https://twitter.com/dichika/status/29512284824... \n", 620 | "\n", 621 | " post \\\n", 622 | "0 はじめて出して見た。8000番と順位は低いけどこれからがんばるぞいDoing my part... \n", 623 | "1 ◯ 実施内容\\n・ #Paiza 1h\\n・ #言語処理100本ノック 0.5h\\n・ #K... \n", 624 | "2 AIの目標はやっぱりKaggleかな\\nグランドマスターとか憧れすぎる\\nいろいろ挑戦してみ... \n", 625 | "3 #SIGNATE - Data Science Competition #ML #DL #A... \n", 626 | "4 届きました🙌 #kaggle #機械学習 #機械学習のための特徴量エンジニアリング \n", 627 | "... ... \n", 628 | "1219 kaggle二日目\\n\\nDNNのkernelを自分好みにアレンジしながら実装した。\\n\\n... \n", 629 | "1220 【Kaggle初心者入門編】タイタニック号で生き残るのは誰?\\n\\n#Kaggle #入門\\... \n", 630 | "1221 Feature Engineering #featureengineering #kaggl... \n", 631 | "1222 もっと頑張ろうー\\nCausing #superintelligence, one subm... \n", 632 | "1223 適当にgbmにつっこんだら0.76555で924位ダッタヨ #kaggle #titanic \n", 633 | "\n", 634 | " user post_time \\\n", 635 | "0 abe shumpei NaN \n", 636 | "1 Soh 1.588689e+09 \n", 637 | "2 やますん@駆け出しエンジニア 1.588677e+09 \n", 638 | "3 K2 NaN \n", 639 | "4 toki⛅ NaN \n", 640 | "... ... ... \n", 641 | "1219 綿岡 晃輝 NaN \n", 642 | "1220 codexa[コデクサ:人工知能特化WEBスクール] NaN \n", 643 | "1221 hntk NaN \n", 644 | "1222 yo NaN \n", 645 | "1223 棗太郎 NaN \n", 646 | "\n", 647 | " hashtags hashtag_len \n", 648 | "0 kaggle machinelearning 2 \n", 649 | "1 言語処理100本ノック apg4b 個人開発 kaggle 今日の積み上げ 駆け出しエンジニ... 9 \n", 650 | "2 プログラミング学習 ai kaggle 3 \n", 651 | "3 dl signate kaggle ai ml 5 \n", 652 | "4 kaggle 機械学習のための特徴量エンジニアリング 機械学習 3 \n", 653 | "... ... ... \n", 654 | "1219 santander kaggle 2 \n", 655 | "1220 入門 kaggle 2 \n", 656 | "1221 featureengineering kaggle 2 \n", 657 | "1222 superintelligence kaggle 2 \n", 658 | "1223 titanic kaggle 2 \n", 659 | "\n", 660 | "[1224 rows x 7 columns]" 661 | ] 662 | }, 663 | "execution_count": 16, 664 | "metadata": {}, 665 | "output_type": "execute_result" 666 | } 667 | ], 668 | "source": [ 669 | "df3" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": 17, 675 | "metadata": {}, 676 | "outputs": [ 677 | { 678 | "data": { 679 | "text/html": [ 680 | "
\n", 681 | "\n", 694 | "\n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | "
searched_forpost_urlpostuserpost_timehashtagshashtag_len
0#pythonhttps://twitter.com/joekadowaki/status/1245109...「仕事のムダ」を省くための、コミュニケーションに関する2つのポイント\\nhttps://t....じょーNaNpython technology 駆け出しエンジニアと繋がりたい3
1#pythonhttps://twitter.com/maronaru_sale_t/status/125...LINEの半bot、保護botのソースや\\nPythonista3の使い方\\nソースの書き方...U&I iryuSale LINEbot販売1.588720e+09半bot プログラミング pythonista 保護bot linebot python p...8
2#pythonhttps://twitter.com/joekadowaki/status/1245108...「ビジネスメール詐欺の実態調査報告書」を公開 JPCERT/CC\\nhttps://t.co...じょーNaNpython technology 駆け出しエンジニアと繋がりたい3
3#pythonhttps://twitter.com/sheep_0411/status/12578075...日経平均\\n2020年05月01日の終値:19,619.35円\\n(前日比-2.84%)\\n...シープ1.588720e+09heroku bot python 日経平均4
4#pythonhttps://twitter.com/joekadowaki/status/1245108...ライス大学研究チーム、GPUを使わずにディープラーニングを高速化するアルゴリズムを開発\\nh...じょーNaNpython technology 駆け出しエンジニアと繋がりたい3
........................
5270#pythonhttps://twitter.com/Kuno39962216/status/124511...2020年4月1日学習\\n・Udemy 【世界で5万人が受講】実践 Pythonデータサイエ...kuno&pandapiNaNudemy python データサイエンス3
5271#pythonhttps://twitter.com/BASEBALLY15/status/1245109...#今日の積み上げ\\n今日も頑張りましょう😊\\n\\n▶️ #Python ブログ執筆\\n▶️ ...ゆうき@Python+C💻ブログで発信📝NaN今日の積み上げ python 駆け出しエンジニアと繋がりたい3
5272#pythonhttps://twitter.com/joekadowaki/status/1245109...リモートワークで疲弊する人へ|この際、生産性は忘れよう\\nhttps://t.co/mJnU...じょーNaNpython technology 駆け出しエンジニアと繋がりたい3
5273#pythonhttps://twitter.com/joekadowaki/status/1245109...地頭のいい人≠アイデアマン。「問題解決」への最短距離のつくり方\\nhttps://t.co/...じょーNaNpython technology 駆け出しエンジニアと繋がりたい3
5274#pythonhttps://twitter.com/joekadowaki/status/1245109...オンライン会議を生かす、スムーズな進め方と注意点\\nhttps://t.co/w88CTnb...じょーNaNpython technology 駆け出しエンジニアと繋がりたい3
\n", 820 | "

5275 rows × 7 columns

\n", 821 | "
" 822 | ], 823 | "text/plain": [ 824 | " searched_for post_url \\\n", 825 | "0 #python https://twitter.com/joekadowaki/status/1245109... \n", 826 | "1 #python https://twitter.com/maronaru_sale_t/status/125... \n", 827 | "2 #python https://twitter.com/joekadowaki/status/1245108... \n", 828 | "3 #python https://twitter.com/sheep_0411/status/12578075... \n", 829 | "4 #python https://twitter.com/joekadowaki/status/1245108... \n", 830 | "... ... ... \n", 831 | "5270 #python https://twitter.com/Kuno39962216/status/124511... \n", 832 | "5271 #python https://twitter.com/BASEBALLY15/status/1245109... \n", 833 | "5272 #python https://twitter.com/joekadowaki/status/1245109... \n", 834 | "5273 #python https://twitter.com/joekadowaki/status/1245109... \n", 835 | "5274 #python https://twitter.com/joekadowaki/status/1245109... \n", 836 | "\n", 837 | " post \\\n", 838 | "0 「仕事のムダ」を省くための、コミュニケーションに関する2つのポイント\\nhttps://t.... \n", 839 | "1 LINEの半bot、保護botのソースや\\nPythonista3の使い方\\nソースの書き方... \n", 840 | "2 「ビジネスメール詐欺の実態調査報告書」を公開 JPCERT/CC\\nhttps://t.co... \n", 841 | "3 日経平均\\n2020年05月01日の終値:19,619.35円\\n(前日比-2.84%)\\n... \n", 842 | "4 ライス大学研究チーム、GPUを使わずにディープラーニングを高速化するアルゴリズムを開発\\nh... \n", 843 | "... ... \n", 844 | "5270 2020年4月1日学習\\n・Udemy 【世界で5万人が受講】実践 Pythonデータサイエ... \n", 845 | "5271 #今日の積み上げ\\n今日も頑張りましょう😊\\n\\n▶️ #Python ブログ執筆\\n▶️ ... \n", 846 | "5272 リモートワークで疲弊する人へ|この際、生産性は忘れよう\\nhttps://t.co/mJnU... \n", 847 | "5273 地頭のいい人≠アイデアマン。「問題解決」への最短距離のつくり方\\nhttps://t.co/... \n", 848 | "5274 オンライン会議を生かす、スムーズな進め方と注意点\\nhttps://t.co/w88CTnb... \n", 849 | "\n", 850 | " user post_time \\\n", 851 | "0 じょー NaN \n", 852 | "1 U&I iryuSale LINEbot販売 1.588720e+09 \n", 853 | "2 じょー NaN \n", 854 | "3 シープ 1.588720e+09 \n", 855 | "4 じょー NaN \n", 856 | "... ... ... \n", 857 | "5270 kuno&pandapi NaN \n", 858 | "5271 ゆうき@Python+C💻ブログで発信📝 NaN \n", 859 | "5272 じょー NaN \n", 860 | "5273 じょー NaN \n", 861 | "5274 じょー NaN \n", 862 | "\n", 863 | " hashtags hashtag_len \n", 864 | "0 python technology 駆け出しエンジニアと繋がりたい 3 \n", 865 | "1 半bot プログラミング pythonista 保護bot linebot python p... 8 \n", 866 | "2 python technology 駆け出しエンジニアと繋がりたい 3 \n", 867 | "3 heroku bot python 日経平均 4 \n", 868 | "4 python technology 駆け出しエンジニアと繋がりたい 3 \n", 869 | "... ... ... \n", 870 | "5270 udemy python データサイエンス 3 \n", 871 | "5271 今日の積み上げ python 駆け出しエンジニアと繋がりたい 3 \n", 872 | "5272 python technology 駆け出しエンジニアと繋がりたい 3 \n", 873 | "5273 python technology 駆け出しエンジニアと繋がりたい 3 \n", 874 | "5274 python technology 駆け出しエンジニアと繋がりたい 3 \n", 875 | "\n", 876 | "[5275 rows x 7 columns]" 877 | ] 878 | }, 879 | "execution_count": 17, 880 | "metadata": {}, 881 | "output_type": "execute_result" 882 | } 883 | ], 884 | "source": [ 885 | "df4" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": 23, 891 | "metadata": {}, 892 | "outputs": [], 893 | "source": [ 894 | "cols = ['searched_for', 'hashtags', 'hashtag_len']\n", 895 | "df1 = df1[cols]\n", 896 | "df2 = df2[cols]\n", 897 | "df3 = df3[cols]\n", 898 | "df4 = df4[cols]" 899 | ] 900 | }, 901 | { 902 | "cell_type": "code", 903 | "execution_count": 24, 904 | "metadata": {}, 905 | "outputs": [], 906 | "source": [ 907 | "df = pd.concat([df1, df2], axis=0)\n", 908 | "df = pd.concat([df, df3], axis=0)\n", 909 | "df = pd.concat([df, df4], axis=0)" 910 | ] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": 28, 915 | "metadata": {}, 916 | "outputs": [ 917 | { 918 | "data": { 919 | "text/html": [ 920 | "
\n", 921 | "\n", 934 | "\n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | "
searched_forhashtagshashtag_len
5643#人工知能トルコ ドローン 神風ドローン 顔認証 ai drone 人工知能7
2944#python駆け出しエンジニアとつながりたい python プログラミング初心者3
210#kagglekaggle 機械学習 データサイエンティスト3
2115#python駆け出しエンジニアと繋がりたい プログラミング python プログラミング学習 プログラミ...5
1275#データサイエンティスト統計学 統計学専攻 データサイエンティスト 統計 外資系 外資系企業6
............
1666#pythonpython 機械学習 回帰分析3
6165#人工知能makeinglandscape deeplearning nowlearning ai g...6
5210#pythonオホーツク fswebcam python イマソラ 北海道 photo raspberrypi7
873#人工知能ai 人工知能2
3881#人工知能セルフブランディング ai ルーチンワーク 人工知能4
\n", 1012 | "

100 rows × 3 columns

\n", 1013 | "
" 1014 | ], 1015 | "text/plain": [ 1016 | " searched_for hashtags \\\n", 1017 | "5643 #人工知能 トルコ ドローン 神風ドローン 顔認証 ai drone 人工知能 \n", 1018 | "2944 #python 駆け出しエンジニアとつながりたい python プログラミング初心者 \n", 1019 | "210 #kaggle kaggle 機械学習 データサイエンティスト \n", 1020 | "2115 #python 駆け出しエンジニアと繋がりたい プログラミング python プログラミング学習 プログラミ... \n", 1021 | "1275 #データサイエンティスト 統計学 統計学専攻 データサイエンティスト 統計 外資系 外資系企業 \n", 1022 | "... ... ... \n", 1023 | "1666 #python python 機械学習 回帰分析 \n", 1024 | "6165 #人工知能 makeinglandscape deeplearning nowlearning ai g... \n", 1025 | "5210 #python オホーツク fswebcam python イマソラ 北海道 photo raspberrypi \n", 1026 | "873 #人工知能 ai 人工知能 \n", 1027 | "3881 #人工知能 セルフブランディング ai ルーチンワーク 人工知能 \n", 1028 | "\n", 1029 | " hashtag_len \n", 1030 | "5643 7 \n", 1031 | "2944 3 \n", 1032 | "210 3 \n", 1033 | "2115 5 \n", 1034 | "1275 6 \n", 1035 | "... ... \n", 1036 | "1666 3 \n", 1037 | "6165 6 \n", 1038 | "5210 7 \n", 1039 | "873 2 \n", 1040 | "3881 4 \n", 1041 | "\n", 1042 | "[100 rows x 3 columns]" 1043 | ] 1044 | }, 1045 | "execution_count": 28, 1046 | "metadata": {}, 1047 | "output_type": "execute_result" 1048 | } 1049 | ], 1050 | "source": [ 1051 | "df.sample(100)" 1052 | ] 1053 | }, 1054 | { 1055 | "cell_type": "code", 1056 | "execution_count": 26, 1057 | "metadata": {}, 1058 | "outputs": [], 1059 | "source": [ 1060 | "df.to_csv('twitter_post.csv', index=False)" 1061 | ] 1062 | }, 1063 | { 1064 | "cell_type": "code", 1065 | "execution_count": null, 1066 | "metadata": {}, 1067 | "outputs": [], 1068 | "source": [] 1069 | } 1070 | ], 1071 | "metadata": { 1072 | "kernelspec": { 1073 | "display_name": "Python 3", 1074 | "language": "python", 1075 | "name": "python3" 1076 | }, 1077 | "language_info": { 1078 | "codemirror_mode": { 1079 | "name": "ipython", 1080 | "version": 3 1081 | }, 1082 | "file_extension": ".py", 1083 | "mimetype": "text/x-python", 1084 | "name": "python", 1085 | "nbconvert_exporter": "python", 1086 | "pygments_lexer": "ipython3", 1087 | "version": "3.7.2" 1088 | } 1089 | }, 1090 | "nbformat": 4, 1091 | "nbformat_minor": 4 1092 | } 1093 | -------------------------------------------------------------------------------- /nlp/twitter_analytics_using_nlplot/wordcloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takapy0210/geek_blog/e36604f01d26f4d14bdacc6bb2995c929d49fdfa/nlp/twitter_analytics_using_nlplot/wordcloud.png -------------------------------------------------------------------------------- /other/hatenablog_css/design.css: -------------------------------------------------------------------------------- 1 | /* */ 2 | @import "https://blog.hatena.ne.jp/-/theme/8599973812270629022.css"; 3 | 4 | /* ブログタイトル文字サイズ変更 */ 5 | @media screen and (max-width: 640px){ 6 | #blog-title #title { 7 | font-size: 20px !important; 8 | } 9 | } 10 | 11 | /* タイトル下のブログ説明部分 (設定 > 基本設定 > ブログの説明) */ 12 | h2#blog-description { 13 | font-size: 12px !important; /* 文字サイズ変更 */ 14 | margin: 1px; /* 上下左右に余白追加 */ 15 | } 16 | 17 | /* 【タブレット、PC限定】 */ 18 | @media screen and (min-width:641px) { 19 | h2#blog-description { 20 | font-size: 18px !important; /* 文字サイズ変更 */ 21 | margin: 5px; /* 上下左右に余白追加 */ 22 | } 23 | } 24 | 25 | /* */ 26 | 27 | /* ### ヘッダーの背景色 ### */ 28 | #blog-title{ 29 | background: #f8f8ff !important; 30 | } 31 | /* ### ブログタイトルの文字色 ### */ 32 | #title a{ 33 | color: #カラーコード !important; 34 | } 35 | /* ### ブログ説明の文字色 ### */ 36 | #blog-description{ 37 | color: #カラーコード !important; 38 | } 39 | 40 | /* 41 | 枠 (div) の設定 (画面幅、境界線、背景色) 42 | 
-------------------------------------------------------*/ 43 | /* 【タブレット、PC限定】 */ 44 | @media screen and (min-width:641px) { 45 | /* 左右の余白を削除 */ 46 | #container { 47 | width: 80%; /* 本文エリアとサイドバー含めて画面いっぱいに広げる */ 48 | } 49 | 50 | /* 本文エリア全体 */ 51 | #main { 52 | background-color: #ffffff; /* 本文エリアの背景色 */ 53 | /*border: 1px solid #bde6f2;*/ /* 本文エリアの枠線 */ 54 | border-radius: 5px; /* 角を丸くする */ 55 | float: left; /* サイドバーをfloat leftにするため */ 56 | margin: 30px 2% 0 8%; 57 | padding: 30px; /* 本文エリアの文字と枠線の間の余白 */ 58 | width: 50%; /* 画面幅50% */ 59 | } 60 | 61 | /* サイドバー */ 62 | #box2 { 63 | float: left; /* 本文エリアの方に詰める */ 64 | margin: 30px 3% 0 2%; 65 | width: 22%; /* 画面幅22% 画面拡大しても崩れにくいよう余裕持たせてます */ 66 | } 67 | } 68 | 69 | /* 70 | 基本的なフォント設定 (見出しの設定は、「記事エリアの設定」で行う) 71 | -------------------------------------------------------*/ 72 | /* 全体のフォントファミリー指定 */ 73 | /* ★ WEBフォントの追加設定 */ 74 | .entry-content, .entry-title, body { 75 | font-family: 'Noto Sans', 'Noto Sans JP', 'Hiragino Kaku Gothic ProN', メイリオ, Meiryo, sans-serif; 76 | /*letter-spacing: 0.0005em;*/ /* 字間を0.05字分空けて読みやすくする */ 77 | } 78 | 79 | /* 80 | 見出しの設定 81 | -------------------------------------------------------*/ 82 | /* 記事タイトルのフォント */ 83 | h1.entry-title a { 84 | font-size: 15px; 85 | } 86 | 87 | /* 【タブレット、PC限定】 */ 88 | @media screen and (min-width:641px) { 89 | h1.entry-title a { 90 | font-size: 22px; 91 | } 92 | } 93 | 94 | /* 見出し1の設定 */ 95 | .entry-content h1 { 96 | padding: 0.6em 0.6em;/*上下 左右の余白*/ 97 | border-left: solid 2.5px #ffa8a8;/*左線*/ 98 | font-size: 22px; 99 | background: #f9fcff;/*背景色*/ 100 | /*padding: 10px 20px 15px 20px;*/ 101 | } 102 | 103 | /* 見出し2の設定 */ 104 | .entry-content h2 { 105 | padding: 0.2em 0.4em;/*上下 左右の余白*/ 106 | color: #494949;/*文字色*/ 107 | background: transparent;/*背景透明に*/ 108 | border-bottom: solid 1.5px #d3d3d3; 109 | font-size: 20px; 110 | } 111 | 112 | /* 見出し3の設定 */ 113 | .entry-content h3 { 114 | padding: 0.2em 0.2em;/*上下 左右の余白*/ 115 | color: #494949;/*文字色*/ 116 | font-size: 18px; 117 | } 118 | 119 | /* 120 | 強調文字列部分の設定 (マークダウンでいう右記の部分: ** 文字列 **) 121 | -------------------------------------------------------*/ 122 | /* 蛍光ペンの設定 */ 123 | /* (補足) rgbaの4番目の引数は透明度を表す。0で透明、1で完全に塗りつぶす。transparentは一部ブラウザで黒と解釈されるので使わない */ 124 | /* (補足) 60%の数値を両方増やすと蛍光ペンが細くなる。両方減らすと太くなる */ 125 | /* (補足) 前半を20%、後半を80%とすると、上から20%の位置を透明、80%の位置を色付きとし、間はグラデーションになる */ 126 | /* ■蛍光ペンのデザイン:https://naifix.com/strong-css-sample/ */ 127 | .entry-content strong { 128 | background: linear-gradient(rgba(246, 210, 139, 0) 60%, rgba(246, 210, 139, 1) 60%); 129 | border-radius: 2px; /* 角を丸める */ 130 | } 131 | 132 | 133 | /* 行間の設定 */ 134 | /*.entry-content p {margin:0.1}*/ 135 | .entry-content p{ 136 | line-height: 1.8em; /* 行間の幅調整*/ 137 | letter-spacing:0.4pt; /* 文字の間隔調整*/ 138 | /*font-size:15px;*/ 139 | } 140 | 141 | /* 142 | 目次の設定 143 | -------------------------------------------------------*/ 144 | /* 見出し1に対応する目次の余白調整、および自動採番用の変数セット */ 145 | ul.table-of-contents > li { 146 | margin-top: 0.1em; 147 | list-style-type: none; 148 | counter-increment: mokuji-1; /* mokuji-1という変数に1を足す */ 149 | counter-reset: mokuji-2; /* mokuji-2という変数の値を0に戻す */ 150 | line-height:132%; 151 | } 152 | 153 | /* 見出し1に対応する目次の自動採番 */ 154 | ul.table-of-contents > li::before{ 155 | content: counter(mokuji-1) ". "; /* 文字列挿入。"1. 
" のような形式 */ 156 | } 157 | 158 | /* 見出し2に対応する目次の余白調整、および自動採番用の変数セット */ 159 | ul.table-of-contents ul > li { 160 | list-style-type: none; 161 | margin-top: 0; 162 | counter-increment: mokuji-2; 163 | line-height:132%; 164 | } 165 | 166 | /* 見出し2に対応する目次の自動採番 */ 167 | ul.table-of-contents ul > li::before { 168 | content: counter(mokuji-1) "." counter(mokuji-2) ". "; /* 文字列挿入。"1.1. " のような形式 */ 169 | } 170 | 171 | /* 見出し3以降に対応する目次を非表示にする */ 172 | ul.table-of-contents ul ul { 173 | display: none; 174 | line-height:132%; 175 | } 176 | 177 | .entry-content .table-of-contents { 178 | /*margin: 2em 2em;*/ /*目次上下の余白*/ 179 | padding: 3em 0 2em 2em; /*目次下内部余白 上,右,下,左*/ 180 | /*margin: 0 0 0 3.5em;*/ /*左側余白*/ 181 | /*padding: 0.01em 0 0 0 !important;*/ /*行間余白*/ 182 | /*border: 1px solid #ddd;*/ /*枠線のスタイル*/ 183 | /*background-color: #fff;*/ /*目次内背景色*/ 184 | /*font-size: 0.95em;*/ /*文字サイズ*/ 185 | /*font-weight: normal;*/ /*文字太さ*/ 186 | border-radius: 5px; /*角を丸める*/ 187 | } 188 | 189 | ul.table-of-contents > li a:link{ 190 | color:#ad8383; /*色はここを変更*/ 191 |  text-decoration: none; 192 | } 193 | 194 | .entry-content .table-of-contents a:link{ 195 |  color:#ad8383; /*色はここを変更*/ 196 |  text-decoration: none; 197 | } 198 | 199 | ul.table-of-contents > li a:visited{ 200 | color:#8c6a6a; /*色はここを変更*/ 201 | } 202 | 203 | .entry-content .table-of-contents a:visited{ 204 |  color: #8c6a6a; /*色はここを変更*/ 205 | } 206 | 207 | .entry-content .table-of-contents a:hover{ 208 |  text-decoration:underline; /*下線をつける*/ 209 | } 210 | 211 | /* 212 | ソースコードのシンタックスハイライト (Syntax Highlighting) の書式 213 | -------------------------------------------------------*/ 214 | /* 文字サイズ変更 */ 215 | .entry-content pre.code { 216 | font-size:90%; 217 | line-height:150%; 218 | } 219 | 220 | /* コードの背景色変更 */ 221 | .entry-content pre.code { 222 | background-color: #3F3F3F; 223 | color: #DCDCDC; 224 | } 225 | /* 226 | pre.code ol{ 227 | margin-top: 0; 228 | margin-bottom: 0; 229 | } 230 | pre.code .code-list{ 231 | border-left: 1px solid #999999; *縦線* 232 | padding-left:6px; 233 | } 234 | pre.code .code-list:nth-child(2n+1) { 235 | background-color: #424242; *奇数行の背景色* 236 | } 237 | */ 238 | .synSpecial { color: #cc9393; } 239 | .synType { color: #E3CEAB; } 240 | .synComment { color: #7A987A; } 241 | .synPreProc { color: #8c8cb4; } 242 | .synIdentifier { color: #6e96be; } 243 | .synConstant { color: #cc9393; } 244 | .synStatement { color: #efc986; } 245 | 246 | 247 | /* 248 | 追尾する目次 249 | -------------------------------------------------------*/ 250 | #stoc-module { 251 | backface-visibility: hidden; 252 | } 253 | #stoc-module.tracking { 254 | margin-bottom: 0; 255 | } 256 | #stoc-module.fixed { 257 | position: fixed; 258 | } 259 | #stoc-module.absolute { 260 | position: absolute; 261 | } 262 | #stoc-module.sticky { 263 | position: -webkit-sticky; 264 | position: sticky; 265 | } 266 | #stoc-module.fade-in { 267 | animation: fadeIn 300ms; 268 | } 269 | @keyframes fadeIn { 270 | 0% {opacity: 0} 271 | 100% {opacity: 1} 272 | } 273 | 274 | #stoc { 275 | overflow-y: auto; 276 | } 277 | #stoc.shadow { 278 | /* Shadows */ 279 | background: 280 | radial-gradient(farthest-side at top, rgba(0,0,0,.17), transparent) top / 100% 11px, 281 | radial-gradient(farthest-side at bottom, rgba(0,0,0,.17), transparent) bottom / 100% 11px; 282 | background-repeat: no-repeat; 283 | background-attachment: scroll; 284 | } 285 | #stoc ol { 286 | margin: 0; 287 | padding: 0 0 0 1em; 288 | list-style-type: none; 289 | } 290 | #stoc > ol { 291 | padding-left: 0; 292 | } 293 | 
#stoc.shadow > ol { 294 | /* Shadow covers */ 295 | background: 296 | linear-gradient(#fff 30%, transparent) top / 100% 40px, 297 | linear-gradient(transparent, #fff 70%) bottom / 100% 40px; 298 | background-repeat: no-repeat; 299 | background-attachment: local; 300 | } 301 | #stoc a { 302 | padding: 2px 2px 2px 6px; 303 | display: block; 304 | text-decoration: none; 305 | } 306 | #stoc:not(.touch) a:hover { 307 | background-color: rgba(0,0,0,.04); 308 | text-decoration: underline; 309 | } 310 | #stoc .active { 311 | background-color: rgba(0,0,0,.04); 312 | } 313 | 314 | #stoc::-webkit-scrollbar { 315 | width: 8px; 316 | background: #ececec; 317 | } 318 | #stoc::-webkit-scrollbar-button { 319 | display: none; 320 | } 321 | #stoc::-webkit-scrollbar-thumb { 322 | background: #b1b1b1; 323 | } 324 | 325 | 326 | /* 327 | Categoryをタイル表示に変更(Innocent) 328 | うまく動かない 329 | -------------------------------------------------------*/ 330 | /* 331 | .hatena-module-category .hatena-urllist { 332 | margin: 0 0 -6px; 333 | padding: 0; 334 | } 335 | .hatena-module-category .hatena-urllist li::before { 336 | content: none; 337 | } 338 | .hatena-module-category .hatena-urllist li { 339 | border-top: 0; 340 | display: block; 341 | float: left; 342 | margin: 0 6px 6px 0; 343 | padding: 0; 344 | } 345 | .hatena-module-category .hatena-urllist li a { 346 | border: 1px solid #e6e6e6; 347 | border-radius: 2px; 348 | display: block; 349 | font-size: 0.8667em; 350 | line-height: 32px; 351 | padding: 0 12px; 352 | } 353 | .hatena-module-category .hatena-urllist li a:hover { 354 | background-color: #f6f6f6; 355 | color: #333; 356 | } 357 | */ -------------------------------------------------------------------------------- /recommendation/graph/ml-latest-small/README.txt: -------------------------------------------------------------------------------- 1 | Summary 2 | ======= 3 | 4 | This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018. 5 | 6 | Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided. 7 | 8 | The data are contained in the files `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`. More details about the contents and use of all these files follows. 9 | 10 | This is a *development* dataset. As such, it may change over time and is not an appropriate dataset for shared research results. See available *benchmark* datasets if that is your intent. 11 | 12 | This and other GroupLens data sets are publicly available for download at . 13 | 14 | 15 | Usage License 16 | ============= 17 | 18 | Neither the University of Minnesota nor any of the researchers involved can guarantee the correctness of the data, its suitability for any particular purpose, or the validity of results based on the use of the data set. The data set may be used for any research purposes under the following conditions: 19 | 20 | * The user may not state or imply any endorsement from the University of Minnesota or the GroupLens Research Group. 21 | * The user must acknowledge the use of the data set in publications resulting from the use of the data set (see below for citation information). 
22 | * The user may redistribute the data set, including transformations, so long as it is distributed under these same license conditions. 23 | * The user may not use this information for any commercial or revenue-bearing purposes without first obtaining permission from a faculty member of the GroupLens Research Project at the University of Minnesota. 24 | * The executable software scripts are provided "as is" without warranty of any kind, either expressed or implied, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose. The entire risk as to the quality and performance of them is with you. Should the program prove defective, you assume the cost of all necessary servicing, repair or correction. 25 | 26 | In no event shall the University of Minnesota, its affiliates or employees be liable to you for any damages arising out of the use or inability to use these programs (including but not limited to loss of data or data being rendered inaccurate). 27 | 28 | If you have any further questions or comments, please email 29 | 30 | 31 | Citation 32 | ======== 33 | 34 | To acknowledge use of the dataset in publications, please cite the following paper: 35 | 36 | > F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. 37 | 38 | 39 | Further Information About GroupLens 40 | =================================== 41 | 42 | GroupLens is a research group in the Department of Computer Science and Engineering at the University of Minnesota. Since its inception in 1992, GroupLens's research projects have explored a variety of fields including: 43 | 44 | * recommender systems 45 | * online communities 46 | * mobile and ubiquitious technologies 47 | * digital libraries 48 | * local geographic information systems 49 | 50 | GroupLens Research operates a movie recommender based on collaborative filtering, MovieLens, which is the source of these data. We encourage you to visit to try it out! If you have exciting ideas for experimental work to conduct on MovieLens, send us an email at - we are always interested in working with external collaborators. 51 | 52 | 53 | Content and Use of Files 54 | ======================== 55 | 56 | Formatting and Encoding 57 | ----------------------- 58 | 59 | The dataset files are written as [comma-separated values](http://en.wikipedia.org/wiki/Comma-separated_values) files with a single header row. Columns that contain commas (`,`) are escaped using double-quotes (`"`). These files are encoded as UTF-8. If accented characters in movie titles or tag values (e.g. Misérables, Les (1995)) display incorrectly, make sure that any program reading the data, such as a text editor, terminal, or script, is configured for UTF-8. 60 | 61 | 62 | User Ids 63 | -------- 64 | 65 | MovieLens users were selected at random for inclusion. Their ids have been anonymized. User ids are consistent between `ratings.csv` and `tags.csv` (i.e., the same id refers to the same user across the two files). 66 | 67 | 68 | Movie Ids 69 | --------- 70 | 71 | Only movies with at least one rating or tag are included in the dataset. These movie ids are consistent with those used on the MovieLens web site (e.g., id `1` corresponds to the URL ). Movie ids are consistent between `ratings.csv`, `tags.csv`, `movies.csv`, and `links.csv` (i.e., the same id refers to the same movie across these four data files). 
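As an illustration of this id consistency (not part of the dataset distribution), the files can be joined directly on `movieId`. A minimal pandas sketch, assuming pandas is installed and the CSV files sit in an `ml-latest-small/` directory:

    import pandas as pd

    # Paths assume the extracted ml-latest-small/ directory.
    ratings = pd.read_csv("ml-latest-small/ratings.csv")
    movies = pd.read_csv("ml-latest-small/movies.csv")

    # Because the same movieId refers to the same movie in every file,
    # a plain join attaches titles and genres to each rating row.
    rated = ratings.merge(movies, on="movieId", how="left")
    print(rated[["userId", "movieId", "title", "rating"]].head())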
72 | 73 | 74 | Ratings Data File Structure (ratings.csv) 75 | ----------------------------------------- 76 | 77 | All ratings are contained in the file `ratings.csv`. Each line of this file after the header row represents one rating of one movie by one user, and has the following format: 78 | 79 | userId,movieId,rating,timestamp 80 | 81 | The lines within this file are ordered first by userId, then, within user, by movieId. 82 | 83 | Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars). 84 | 85 | Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970. 86 | 87 | 88 | Tags Data File Structure (tags.csv) 89 | ----------------------------------- 90 | 91 | All tags are contained in the file `tags.csv`. Each line of this file after the header row represents one tag applied to one movie by one user, and has the following format: 92 | 93 | userId,movieId,tag,timestamp 94 | 95 | The lines within this file are ordered first by userId, then, within user, by movieId. 96 | 97 | Tags are user-generated metadata about movies. Each tag is typically a single word or short phrase. The meaning, value, and purpose of a particular tag is determined by each user. 98 | 99 | Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970. 100 | 101 | 102 | Movies Data File Structure (movies.csv) 103 | --------------------------------------- 104 | 105 | Movie information is contained in the file `movies.csv`. Each line of this file after the header row represents one movie, and has the following format: 106 | 107 | movieId,title,genres 108 | 109 | Movie titles are entered manually or imported from , and include the year of release in parentheses. Errors and inconsistencies may exist in these titles. 110 | 111 | Genres are a pipe-separated list, and are selected from the following: 112 | 113 | * Action 114 | * Adventure 115 | * Animation 116 | * Children's 117 | * Comedy 118 | * Crime 119 | * Documentary 120 | * Drama 121 | * Fantasy 122 | * Film-Noir 123 | * Horror 124 | * Musical 125 | * Mystery 126 | * Romance 127 | * Sci-Fi 128 | * Thriller 129 | * War 130 | * Western 131 | * (no genres listed) 132 | 133 | 134 | Links Data File Structure (links.csv) 135 | --------------------------------------- 136 | 137 | Identifiers that can be used to link to other sources of movie data are contained in the file `links.csv`. Each line of this file after the header row represents one movie, and has the following format: 138 | 139 | movieId,imdbId,tmdbId 140 | 141 | movieId is an identifier for movies used by . E.g., the movie Toy Story has the link . 142 | 143 | imdbId is an identifier for movies used by . E.g., the movie Toy Story has the link . 144 | 145 | tmdbId is an identifier for movies used by . E.g., the movie Toy Story has the link . 146 | 147 | Use of the resources listed above is subject to the terms of each provider. 148 | 149 | 150 | Cross-Validation 151 | ---------------- 152 | 153 | Prior versions of the MovieLens dataset included either pre-computed cross-folds or scripts to perform this computation. We no longer bundle either of these features with the dataset, since most modern toolkits provide this as a built-in feature. If you wish to learn about standard approaches to cross-fold computation in the context of recommender systems evaluation, see [LensKit](http://lenskit.org) for tools, documentation, and open-source code examples. 
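For a quick experiment, a generic toolkit split is usually sufficient. A small sketch using scikit-learn (an assumption here; it is not bundled with the dataset), holding out 20% of the rating rows at random:

    import pandas as pd
    from sklearn.model_selection import train_test_split

    ratings = pd.read_csv("ml-latest-small/ratings.csv")

    # Random 80/20 hold-out split; the fixed seed keeps the split reproducible.
    train, test = train_test_split(ratings, test_size=0.2, random_state=42)
    print(len(train), "training ratings /", len(test), "test ratings")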
154 | -------------------------------------------------------------------------------- /recommendation/matrix_factorization/data/ml-25m/README.txt: -------------------------------------------------------------------------------- 1 | Summary 2 | ======= 3 | 4 | This dataset (ml-25m) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 25000095 ratings and 1093360 tag applications across 62423 movies. These data were created by 162541 users between January 09, 1995 and November 21, 2019. This dataset was generated on November 21, 2019. 5 | 6 | Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided. 7 | 8 | The data are contained in the files `genome-scores.csv`, `genome-tags.csv`, `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`. More details about the contents and use of all these files follows. 9 | 10 | This and other GroupLens data sets are publicly available for download at . 11 | 12 | 13 | Usage License 14 | ============= 15 | 16 | Neither the University of Minnesota nor any of the researchers involved can guarantee the correctness of the data, its suitability for any particular purpose, or the validity of results based on the use of the data set. The data set may be used for any research purposes under the following conditions: 17 | 18 | * The user may not state or imply any endorsement from the University of Minnesota or the GroupLens Research Group. 19 | * The user must acknowledge the use of the data set in publications resulting from the use of the data set (see below for citation information). 20 | * The user may not redistribute the data without separate permission. 21 | * The user may not use this information for any commercial or revenue-bearing purposes without first obtaining permission from a faculty member of the GroupLens Research Project at the University of Minnesota. 22 | * The executable software scripts are provided "as is" without warranty of any kind, either expressed or implied, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose. The entire risk as to the quality and performance of them is with you. Should the program prove defective, you assume the cost of all necessary servicing, repair or correction. 23 | 24 | In no event shall the University of Minnesota, its affiliates or employees be liable to you for any damages arising out of the use or inability to use these programs (including but not limited to loss of data or data being rendered inaccurate). 25 | 26 | If you have any further questions or comments, please email 27 | 28 | 29 | Citation 30 | ======== 31 | 32 | To acknowledge use of the dataset in publications, please cite the following paper: 33 | 34 | > F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. 35 | 36 | 37 | Further Information About GroupLens 38 | =================================== 39 | 40 | GroupLens is a research group in the Department of Computer Science and Engineering at the University of Minnesota. 
Since its inception in 1992, GroupLens's research projects have explored a variety of fields including: 41 | 42 | * recommender systems 43 | * online communities 44 | * mobile and ubiquitious technologies 45 | * digital libraries 46 | * local geographic information systems 47 | 48 | GroupLens Research operates a movie recommender based on collaborative filtering, MovieLens, which is the source of these data. We encourage you to visit to try it out! If you have exciting ideas for experimental work to conduct on MovieLens, send us an email at - we are always interested in working with external collaborators. 49 | 50 | 51 | Content and Use of Files 52 | ======================== 53 | 54 | Verifying the Dataset Contents 55 | ------------------------------ 56 | 57 | We encourage you to verify that the dataset you have on your computer is identical to the ones hosted at [grouplens.org](http://grouplens.org). This is an important step if you downloaded the dataset from a location other than [grouplens.org](http://grouplens.org), or if you wish to publish research results based on analysis of the MovieLens dataset. 58 | 59 | We provide a [MD5 checksum](http://en.wikipedia.org/wiki/Md5sum) with the same name as the downloadable `.zip` file, but with a `.md5` file extension. To verify the dataset: 60 | 61 | # on linux 62 | md5sum ml-25m.zip; cat ml-25m.zip.md5 63 | 64 | # on OSX 65 | md5 ml-25m.zip; cat ml-25m.zip.md5 66 | 67 | # windows users can download a tool from Microsoft (or elsewhere) that verifies MD5 checksums 68 | 69 | Check that the two lines of output contain the same hash value. 70 | 71 | 72 | Formatting and Encoding 73 | ----------------------- 74 | 75 | The dataset files are written as [comma-separated values](http://en.wikipedia.org/wiki/Comma-separated_values) files with a single header row. Columns that contain commas (`,`) are escaped using double-quotes (`"`). These files are encoded as UTF-8. If accented characters in movie titles or tag values (e.g. Misérables, Les (1995)) display incorrectly, make sure that any program reading the data, such as a text editor, terminal, or script, is configured for UTF-8. 76 | 77 | 78 | User Ids 79 | -------- 80 | 81 | MovieLens users were selected at random for inclusion. Their ids have been anonymized. User ids are consistent between `ratings.csv` and `tags.csv` (i.e., the same id refers to the same user across the two files). 82 | 83 | 84 | Movie Ids 85 | --------- 86 | 87 | Only movies with at least one rating or tag are included in the dataset. These movie ids are consistent with those used on the MovieLens web site (e.g., id `1` corresponds to the URL ). Movie ids are consistent between `ratings.csv`, `tags.csv`, `movies.csv`, and `links.csv` (i.e., the same id refers to the same movie across these four data files). 88 | 89 | 90 | Ratings Data File Structure (ratings.csv) 91 | ----------------------------------------- 92 | 93 | All ratings are contained in the file `ratings.csv`. Each line of this file after the header row represents one rating of one movie by one user, and has the following format: 94 | 95 | userId,movieId,rating,timestamp 96 | 97 | The lines within this file are ordered first by userId, then, within user, by movieId. 98 | 99 | Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars). 100 | 101 | Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970. 
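Since these are plain Unix epoch seconds, they convert to readable datetimes in one call. A minimal pandas sketch (pandas itself is an assumption, as is the `ml-25m/` path):

    import pandas as pd

    # The full file holds roughly 25 million rows; nrows keeps this quick to try.
    ratings = pd.read_csv("ml-25m/ratings.csv", nrows=1_000_000)

    # unit="s" and utc=True interpret the integers as seconds since
    # 1970-01-01 00:00:00 UTC and return timezone-aware datetimes.
    ratings["rated_at"] = pd.to_datetime(ratings["timestamp"], unit="s", utc=True)
    print(ratings[["userId", "movieId", "rating", "rated_at"]].head())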
102 | 103 | 104 | Tags Data File Structure (tags.csv) 105 | ----------------------------------- 106 | 107 | All tags are contained in the file `tags.csv`. Each line of this file after the header row represents one tag applied to one movie by one user, and has the following format: 108 | 109 | userId,movieId,tag,timestamp 110 | 111 | The lines within this file are ordered first by userId, then, within user, by movieId. 112 | 113 | Tags are user-generated metadata about movies. Each tag is typically a single word or short phrase. The meaning, value, and purpose of a particular tag is determined by each user. 114 | 115 | Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970. 116 | 117 | 118 | Movies Data File Structure (movies.csv) 119 | --------------------------------------- 120 | 121 | Movie information is contained in the file `movies.csv`. Each line of this file after the header row represents one movie, and has the following format: 122 | 123 | movieId,title,genres 124 | 125 | Movie titles are entered manually or imported from , and include the year of release in parentheses. Errors and inconsistencies may exist in these titles. 126 | 127 | Genres are a pipe-separated list, and are selected from the following: 128 | 129 | * Action 130 | * Adventure 131 | * Animation 132 | * Children's 133 | * Comedy 134 | * Crime 135 | * Documentary 136 | * Drama 137 | * Fantasy 138 | * Film-Noir 139 | * Horror 140 | * Musical 141 | * Mystery 142 | * Romance 143 | * Sci-Fi 144 | * Thriller 145 | * War 146 | * Western 147 | * (no genres listed) 148 | 149 | 150 | Links Data File Structure (links.csv) 151 | --------------------------------------- 152 | 153 | Identifiers that can be used to link to other sources of movie data are contained in the file `links.csv`. Each line of this file after the header row represents one movie, and has the following format: 154 | 155 | movieId,imdbId,tmdbId 156 | 157 | movieId is an identifier for movies used by . E.g., the movie Toy Story has the link . 158 | 159 | imdbId is an identifier for movies used by . E.g., the movie Toy Story has the link . 160 | 161 | tmdbId is an identifier for movies used by . E.g., the movie Toy Story has the link . 162 | 163 | Use of the resources listed above is subject to the terms of each provider. 164 | 165 | 166 | Tag Genome (genome-scores.csv and genome-tags.csv) 167 | ------------------------------------------------- 168 | 169 | This data set includes a current copy of the Tag Genome. 170 | 171 | [genome-paper]: http://files.grouplens.org/papers/tag_genome.pdf 172 | 173 | The tag genome is a data structure that contains tag relevance scores for movies. The structure is a dense matrix: each movie in the genome has a value for *every* tag in the genome. 174 | 175 | As described in [this article][genome-paper], the tag genome encodes how strongly movies exhibit particular properties represented by tags (atmospheric, thought-provoking, realistic, etc.). The tag genome was computed using a machine learning algorithm on user-contributed content including tags, ratings, and textual reviews. 176 | 177 | The genome is split into two files. 
The file `genome-scores.csv` contains movie-tag relevance data in the following format: 178 | 179 | movieId,tagId,relevance 180 | 181 | The second file, `genome-tags.csv`, provides the tag descriptions for the tag IDs in the genome file, in the following format: 182 | 183 | tagId,tag 184 | 185 | The `tagId` values are generated when the data set is exported, so they may vary from version to version of the MovieLens data sets. 186 | 187 | Please include the following citation if referencing tag genome data: 188 | 189 | > Jesse Vig, Shilad Sen, and John Riedl. 2012. The Tag Genome: Encoding Community Knowledge to Support Novel Interaction. ACM Trans. Interact. Intell. Syst. 2, 3: 13:1–13:44. 190 | 191 | 192 | Cross-Validation 193 | ---------------- 194 | 195 | Prior versions of the MovieLens dataset included either pre-computed cross-folds or scripts to perform this computation. We no longer bundle either of these features with the dataset, since most modern toolkits provide this as a built-in feature. If you wish to learn about standard approaches to cross-fold computation in the context of recommender systems evaluation, see [LensKit](http://lenskit.org) for tools, documentation, and open-source code examples. 196 | -------------------------------------------------------------------------------- /streamlit/sample.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | 5 | st.title('streamlitのサンプルだお') 6 | 7 | DATE_COLUMN = 'date/time' 8 | DATA_URL = ('https://s3-us-west-2.amazonaws.com/' 9 | 'streamlit-demo-data/uber-raw-data-sep14.csv.gz') 10 | 11 | 12 | @st.cache 13 | def load_data(nrows): 14 | data = pd.read_csv(DATA_URL, nrows=nrows) 15 | lowercase = lambda x: str(x).lower() 16 | data.rename(lowercase, axis='columns', inplace=True) 17 | data[DATE_COLUMN] = pd.to_datetime(data[DATE_COLUMN]) 18 | return data 19 | 20 | # Create a text element and let the reader know the data is loading. 21 | data_load_state = st.text('Loading data...') 22 | # Load 10,000 rows of data into the dataframe. 23 | data = load_data(10000) 24 | # Notify the reader that the data was successfully loaded. 25 | data_load_state.text('Loading data...done!') 26 | 27 | if st.checkbox('Show raw data'): 28 | st.subheader('Raw data') 29 | st.write(data) 30 | 31 | 32 | 33 | st.subheader('Number of pickups by hour') 34 | hist_values = np.histogram(data[DATE_COLUMN].dt.hour, bins=24, range=(0,24))[0] 35 | st.bar_chart(hist_values) 36 | 37 | 38 | 39 | # Some number in the range 0-23 40 | hour_to_filter = st.slider('hour', 0, 23, 17) 41 | filtered_data = data[data[DATE_COLUMN].dt.hour == hour_to_filter] 42 | 43 | st.write(hour_to_filter) 44 | 45 | st.text('This is some text.') 46 | 47 | #ボタン処理 48 | if st.button('ボタンの処理も作れる'): 49 | #ボタン押された 50 | st.write('Why hello there') 51 | else: 52 | st.write('Goodbye') 53 | 54 | #テキスト入力欄も作れるよ 55 | st.text_area('labelだお', value="") --------------------------------------------------------------------------------
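One loose end in `streamlit/sample.py` above: `filtered_data` is computed for the selected hour but never displayed. A hedged continuation (these lines are a suggestion, not part of the original file) would plot those pickups with Streamlit's built-in map widget, which picks up the `lat`/`lon` columns present in the Uber sample after the lowercase rename:

    # Suggested lines to append to streamlit/sample.py; they rely on names
    # defined earlier in that file (the st import, hour_to_filter, filtered_data).
    st.subheader(f'Map of all pickups at {hour_to_filter}:00')
    st.map(filtered_data)  # st.map() expects latitude/longitude columns such as lat/lon

The app is then launched from the repository root with `streamlit run streamlit/sample.py`.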