├── README.md
├── notebooks
├── .ipynb_checkpoints
│ ├── 01-数据处理-checkpoint.ipynb
│ ├── 02-特征提取-tfidf-checkpoint.ipynb
│ └── 03-训练模型-tfidf-checkpoint.ipynb
├── 02-特征提取-tfidf.ipynb
├── 03-训练模型-tfidf.ipynb
└── 01-数据处理.ipynb
└── src
├── 02-特征提取-tfidf.py
├── 04-train-fasttext.py
└── 03-训练模型-tfidf.py
/README.md:
--------------------------------------------------------------------------------
1 | # nlp_news_classification
2 | News classification for an NLP task.
3 |
--------------------------------------------------------------------------------
/notebooks/.ipynb_checkpoints/01-数据处理-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 4
6 | }
7 |
--------------------------------------------------------------------------------
/notebooks/.ipynb_checkpoints/02-特征提取-tfidf-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 4
6 | }
7 |
--------------------------------------------------------------------------------
/notebooks/.ipynb_checkpoints/03-训练模型-tfidf-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": []
9 | }
10 | ],
11 | "metadata": {
12 | "kernelspec": {
13 | "display_name": "Python 3",
14 | "language": "python",
15 | "name": "python3"
16 | }
17 | },
18 | "nbformat": 4,
19 | "nbformat_minor": 4
20 | }
21 |
--------------------------------------------------------------------------------
/src/02-特征提取-tfidf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 |
5 | import numpy as np
6 | import pandas as pd
7 |
8 | from sklearn.externals import joblib
9 | from sklearn.feature_extraction.text import TfidfVectorizer
10 |
# Load the tab-separated training set once at import time; get_tfidf_feats()
# below reads its "text" and "label" columns through this module-level name.
trainData = pd.read_csv("../data/train_set.csv", sep="\t")
print("trainData shape={}".format(trainData.shape))
13 |
14 |
def train_tfidf(corpus, min_df, max_features):
    """Fit a word-level TF-IDF vectorizer on ``corpus`` and persist it.

    The fitted vectorizer is dumped with joblib to
    ``../models/tfidf-model-{min_df}-{max_features}.jlb``.

    Args:
        corpus: Iterable of raw text documents to fit on.
        min_df: Forwarded to ``TfidfVectorizer(min_df=...)``.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.

    Returns:
        The fitted ``TfidfVectorizer``.
    """
    tfidfv = TfidfVectorizer(analyzer="word", min_df=min_df, max_features=max_features, use_idf=True)
    tfidfv.fit(corpus)
    # ``vocabulary_`` maps every kept term to its column index, so its size is
    # the feature dimension. It replaces ``get_feature_names()``, which newer
    # scikit-learn releases no longer provide.
    print("feats dim={}".format(len(tfidfv.vocabulary_)))
    with open(f"../models/tfidf-model-{min_df}-{max_features}.jlb", "wb") as f:
        joblib.dump(tfidfv, f)
    return tfidfv
22 |
23 |
def get_tfidf_feats(min_df, max_features, with_test=True):
    """Build dense TF-IDF features for the training set and dump them.

    The vectorizer is fitted on the training texts and, when ``with_test`` is
    true, additionally on the texts of ``../data/test_a.csv``. The training
    labels and the dense training feature matrix are then dumped with joblib
    under ``../data``.

    Args:
        min_df: Forwarded to ``train_tfidf`` and used in the output file names.
        max_features: Forwarded to ``train_tfidf`` and used in the output
            file names.
        with_test: Whether the test texts are added to the fitting corpus.
    """
    train_texts = trainData["text"].values
    corpus = train_texts
    if with_test:
        test_texts = pd.read_csv("../data/test_a.csv", sep="\t")["text"].values
        corpus = np.concatenate([corpus, test_texts], axis=0)
    print("{} lines corpus to be used".format(len(corpus)))

    vectorizer = train_tfidf(corpus, min_df, max_features)

    labelsData = trainData["label"].values
    # Only the training texts are transformed; the matrix is densified here.
    featsData = vectorizer.transform(train_texts).toarray()

    dumps = (
        (f"../data/labelsData-{min_df}-{max_features}.jlb", labelsData),
        (f"../data/featsData-{min_df}-{max_features}.jlb", featsData),
    )
    for path, payload in dumps:
        with open(path, "wb") as f:
            joblib.dump(payload, f)

    print("labelsData shape={}, featsData shape={}".format(labelsData.shape, featsData.shape))
45 |
46 |
# Script entry point: fit the vectorizer and dump features/labels for this
# (min_df, max_features) pair. with_test keeps its default of True.
min_df = 2
max_features = 9000
get_tfidf_feats(min_df, max_features)
50 |
--------------------------------------------------------------------------------
/src/04-train-fasttext.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.metrics import f1_score
4 |
5 | from sklearn.model_selection import train_test_split, StratifiedKFold
6 | import fasttext
7 |
8 |
def gen_train_test_file(data, set):
    """Write the fastText input file(s) for one data set.

    For ``set == "train"`` the labels are prefixed with ``__label__``, the data
    is split 90/10 (stratified by label, ``random_state=42``) and written to
    ``../data/train-fasttext.txt`` and ``../data/dev-fasttext.txt`` as plain
    ``<label> <text>`` lines. Any other value writes ``data`` unchanged to
    ``../data/{set}-fasttext.txt`` via ``DataFrame.to_csv``.

    Args:
        data: DataFrame with a "text" column and, for the train set, a
            "label" column. It is not modified.
        set: Name of the data set ("train" selects the split branch). The
            name shadows the builtin but is kept for interface compatibility.
    """
    if set != "train":
        # Kept as a space-separated CSV with a header: train() reads this file
        # back with pd.read_csv(...)["text"].
        data.to_csv(f"../data/{set}-fasttext.txt", index=False, sep=" ")
        return

    # Work on a copy so the caller's "label" column is left untouched.
    labelled = data.assign(label=data["label"].map(lambda x: f"__label__{str(x)}"))
    traind, devd = train_test_split(labelled, random_state=42, stratify=labelled["label"], test_size=0.1)

    # These files are consumed directly by fastText, so they are written as raw
    # lines. DataFrame.to_csv(sep=" ") would add a "label text" header row and
    # wrap every text in double quotes (the field contains the separator),
    # which pollutes the first and last token of each document.
    for split, path in ((traind, "../data/train-fasttext.txt"), (devd, "../data/dev-fasttext.txt")):
        lines = split["label"].astype(str) + " " + split["text"].astype(str)
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(line + "\n" for line in lines)

    print("train shape={}".format(traind.shape))
    print("dev shape={}".format(devd.shape))
19 |
20 |
def evaluate(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``."""
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1
23 |
24 |
def write_file():
    """Read the train and test CSVs and generate their fastText input files."""
    train_df = pd.read_csv("../data/train_set.csv", sep="\t")
    print("trainData shape={}".format(train_df.shape))

    test_df = pd.read_csv("../data/test_a.csv", sep="\t")
    print("testAData shape={}".format(test_df.shape))

    # Both inputs are loaded before anything is written, train set first.
    for frame, split_name in ((train_df, "train"), (test_df, "test")):
        gen_train_test_file(frame, split_name)
34 |
35 |
def train():
    """Train a supervised fastText model and write test-set predictions.

    The model is trained on ``../data/train-fasttext.txt`` with autotuning
    against ``../data/dev-fasttext.txt`` (``autotuneDuration=1200``), saved to
    ``../models/fasttext-model.bin``, and then used to predict one label per
    row of ``../data/test-fasttext.txt``.
    """
    # NOTE(review): these three values only end up in the prediction file name
    # below; they are not passed to train_supervised — confirm this is intended.
    lr, epoch, wordNgrams = 0.5, 20, 2

    model = fasttext.train_supervised(
        input="../data/train-fasttext.txt",
        autotuneValidationFile='../data/dev-fasttext.txt',
        autotuneDuration=1200,
    )
    print("{} words".format(len(model.words)))
    print("{} labels={}".format(len(model.labels), model.labels))
    model.save_model("../models/fasttext-model.bin")

    texts = pd.read_csv("../data/test-fasttext.txt")["text"]
    prefix_len = 9  # number of characters in "__label__"
    y_pred = []
    for text in texts:
        # predict() output is indexed [0][0] to take the first returned label.
        top_label = model.predict(text)[0][0]
        y_pred.append(top_label[prefix_len:])

    outs = pd.DataFrame({"label": y_pred})
    outs.to_csv(f"../data/test_a_predict-fasttext-{lr}-{epoch}-{wordNgrams}.csv", index=False)
51 |
52 |
def test():
    """Load the saved fastText model and print its result on the dev file."""
    saved_model = fasttext.load_model("../models/fasttext-model.bin")
    print(saved_model.test("../data/dev-fasttext.txt"))
57 |
58 |
59 | write_file()
60 | train()
61 | test()
62 |
--------------------------------------------------------------------------------
/src/03-训练模型-tfidf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | import numpy as np
5 | import pandas as pd
6 |
7 | import time
8 |
9 | from sklearn.externals import joblib
10 | from sklearn.preprocessing import StandardScaler
11 | from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
12 | from sklearn.model_selection import GridSearchCV
13 |
14 | from sklearn.ensemble import RandomForestClassifier
15 | import lightgbm as lgb
16 | import xgboost as xgb
17 |
18 | from sklearn.metrics import f1_score
19 |
# TF-IDF settings; they select which dumped artifacts are loaded below and are
# also used in the prediction file names.
min_df = 2
max_features = 9000
# Training labels and feature matrix dumped for this (min_df, max_features) pair.
with open(f"../data/labelsData-{min_df}-{max_features}.jlb", "rb") as f:
    labelsData = joblib.load(f)
with open(f"../data/featsData-{min_df}-{max_features}.jlb", "rb") as f:
    featsData = joblib.load(f)
print("labelsData shape={}, featsData shape={}".format(labelsData.shape, featsData.shape))

# Fitted TF-IDF model, used by extract_feats() to featurize the test texts.
with open(f"../models/tfidf-model-{min_df}-{max_features}.jlb", "rb") as f:
    tfidfvec = joblib.load(f)

# Tab-separated test set; its "text" column is featurized below.
testAData = pd.read_csv("../data/test_a.csv", sep="\t")
32 |
33 |
def extract_feats(X):
    """Transform raw texts ``X`` with the loaded TF-IDF model into a dense array."""
    transformed = tfidfvec.transform(X)
    return transformed.toarray()
36 |
37 |
# Dense TF-IDF features of the test texts. run_predict() reads this
# module-level name; run_cross_validate() uses its own local X_test instead.
X_test = extract_feats(testAData["text"].values)
39 |
40 |
def evaluate(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``."""
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1
43 |
44 |
def train_test(X_train, y_train, X_test, y_test, model):
    """Fit ``model`` on the training split and score it on the test split.

    Returns:
        The value of ``evaluate(y_test, predictions)`` for the fitted model.
    """
    model.fit(X_train, y_train)
    return evaluate(y_test, model.predict(X_test))
49 |
50 |
def run_cross_validate():
    """Run a manual 5-fold stratified cross-validation of a random forest.

    Uses the module-level ``featsData`` / ``labelsData`` and prints the
    per-fold scores returned by ``train_test`` together with their mean.
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    forest = RandomForestClassifier(n_estimators=300, n_jobs=16)
    fold_scores = []
    for fit_idx, held_idx in splitter.split(featsData, labelsData):
        X_fit, y_fit = featsData[fit_idx], labelsData[fit_idx]
        X_held, y_held = featsData[held_idx], labelsData[held_idx]
        print("X_train shape={}, y_train shape={}".format(X_fit.shape, y_fit.shape))
        print("X_test shape={}, y_test shape={}".format(X_held.shape, y_held.shape))
        # The same estimator object is fitted again on every fold.
        fold_scores.append(train_test(X_fit, y_fit, X_held, y_held, forest))
    print("mean={}, f1_macro={}".format(np.mean(fold_scores), fold_scores))
63 |
64 |
def run_predict(model, name, min_df=3, max_features=5000):
    """Predict labels for the module-level ``X_test`` and write them to CSV.

    Args:
        model: Fitted estimator exposing ``predict``.
        name: Tag used in the output file name and the log line.
        min_df: Only used in the output file name.
        max_features: Only used in the output file name.
    """
    started = time.time()
    predictions = pd.DataFrame({"label": model.predict(X_test)})
    predictions.to_csv(f"../data/test_a_predict-{name}-{min_df}-{max_features}.csv", index=False)
    elapsed = time.time() - started
    print("{}: predict completed, time={}".format(name, elapsed))
71 |
72 |
def re_train(model, name, X_train, y_train):
    """Fit ``model`` on the given data, print the elapsed time and return it.

    Args:
        model: Estimator exposing ``fit(X, y)``.
        name: Tag used in the log line.
        X_train: Training features passed to ``fit``.
        y_train: Training labels passed to ``fit``.

    Returns:
        The same ``model`` object after fitting.
    """
    started = time.time()
    model.fit(X_train, y_train)
    elapsed = time.time() - started
    print("{}: re-train completed, time={}".format(name, elapsed))
    return model
78 |
79 |
def run_cv(model, name, with_cv=True):
    """Optionally cross-validate ``model``, then re-fit it and predict the test set.

    Args:
        model: Estimator to evaluate and train.
        name: Tag used in log lines and in the prediction file name.
        with_cv: When true, run ``cross_val_score`` (macro F1, module-level
            ``skf`` folds) on the full training data first and print the scores.
    """
    if with_cv:
        tStart = time.time()
        cv = cross_val_score(model, featsData, labelsData, cv=skf, scoring="f1_macro", n_jobs=16)
        print("mean={}, f1_macro={}".format(np.mean(cv), cv))
        # "{:.4f}" prints four decimals; the former "{:4f}" only set a minimum
        # field width of 4 and left the precision at its default.
        print("{}: cross-validate completed, time={:.4f}".format(name, time.time() - tStart))

    # Re-fit on the complete training set.
    model = re_train(model, name, featsData, labelsData)
    # Predict on the test set.
    run_predict(model, name, min_df, max_features)
91 |
92 |
def run_gs_cv(model, name, paras, scoring, X_train, y_train):
    """Grid-search ``model`` over ``paras``, then train the winner and predict.

    Args:
        model: Base estimator handed to ``GridSearchCV``.
        name: Tag used in log lines and in the prediction file name.
        paras: Parameter grid for ``GridSearchCV``.
        scoring: Scoring name passed to ``GridSearchCV``.
        X_train: Features used for the grid search.
        y_train: Labels used for the grid search.
    """
    print("{}: grid search cv...".format(name))
    cv = GridSearchCV(estimator=model,
                      param_grid=paras,
                      scoring=scoring,
                      cv=skf, )
    cv.fit(X_train, y_train)
    print("cv best estimator: {}".format(cv.best_estimator_))
    print("cv best paras: {}".format(cv.best_params_))
    print("cv best score({})={}".format(scoring, cv.best_score_))

    # GridSearchCV tunes clones of `model`, so `model` itself still carries its
    # original parameters. Continue with the best estimator so the search
    # result is actually used for the final fit and the predictions.
    # Re-fit on the complete training set.
    best_model = re_train(cv.best_estimator_, name, featsData, labelsData)
    # Predict on the test set.
    run_predict(best_model, name, min_df, max_features)
    return
109 |
110 |
# Shared fold splitter; run_cv() and run_gs_cv() read this module-level name.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# rfcls = RandomForestClassifier(n_estimators=400, n_jobs=16)
# run_cv(rfcls, "rf")

# 'multiclass' is the documented LightGBM objective name; the former
# 'multi-class' spelling is not one of its listed names or aliases.
lgbmcls = lgb.LGBMClassifier(objective='multiclass', random_state=42, n_jobs=16)
# NOTE(review): this grid has 4 * 2 * 3 * 4 * 5 = 480 combinations, each fitted
# once per fold of `skf`. Also confirm that boosting_type "rf" is usable with
# these subsample values and no bagging frequency set.
paras = {
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
    'boosting_type': ["gbdt", "rf"],
    'n_estimators': [100, 300, 500],
    'max_depth': [10, 15, 20, 25],
    'subsample': [0.2, 0.4, 0.6, 0.8, 1.0],
}
run_gs_cv(lgbmcls, "lgbm-gs", paras, "f1_macro", featsData, labelsData)
# run_cv(lgbmcls, "lgbm")
126 |
--------------------------------------------------------------------------------
/notebooks/02-特征提取-tfidf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pickle\n",
10 | "import numpy as np\n",
11 | "import pandas as pd\n",
12 | "from sklearn.feature_extraction.text import TfidfVectorizer"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "name": "stdout",
22 | "output_type": "stream",
23 | "text": [
24 | "200000 train data\n",
25 | "250000 all data\n"
26 | ]
27 | }
28 | ],
29 | "source": [
30 | "corpus=pd.read_csv(\"../data/train_set.csv\", sep=\"\\t\")[\"text\"].values\n",
31 | "print(\"{} train data\".format(len(corpus)))\n",
32 | "corpus=np.concatenate([corpus, pd.read_csv(\"../data/test_a.csv\", sep=\"\\t\")[\"text\"].values], axis=0)\n",
33 | "print(\"{} all data\".format(len(corpus)))"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 3,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "data": {
43 | "text/plain": [
44 | "'2967 6758 339 2021 1854 3731 4109 3792 4149 1519 2058 3912 2465 2410 1219 6654 7539 264 2456 4811 1292 2109 6905 5520 7058 6045 3634 6591 3530 6508 2465 7044 1519 3659 2073 3750 3731 4109 3792 6831 2614 3370 4269 3370 486 5770 4109 4125 3750 5445 2466 6831 6758 3743 3630 1726 2313 5906 826 4516 657 900 1871 7044 3750 2967 3731 1757 1939 648 2828 4704 7039 3706 3750 965 2490 7399 3743 2145 2407 7451 3775 6017 5998 1641 299 4704 2621 7029 3056 6333 433 648 1667 1099 900 2289 1099 648 5780 220 7044 1279 7426 4269 3750 2967 6758 6631 3099 2205 7305 2620 5977 3750 3329 1793 6666 2042 3193 4149 1519 7039 3706 2446 5399 648 4124 2058 3912 248 3193 2252 5649 2212 4939 7239 3310 4525 2400 900 5770 4109 4125 7044 4921 265 1397 4699 1699 669 6407 3750 1271 1271 4741 669 4659 3870 4030 4167 5338 25 3466 6909 4417 1859 3750 1465 7194 648 3938 1571 848 6986 827 2124 3750 1991 7444 7037 2729 908 6308 3750 1889 6810 4190 591 5598 2289 2109 6831 6407 2400 5410 517 900 25 3731 4109 3792 4128 1679 4811 4853 4109 3630 6902 6122 1903 1736 3915 2602 6822 3750 6630 4265 591 729 4448 648 1465 1401 4853 648 5881 6182 4128 1679 4939 2646 652 340 7328 1320 900 1460 619 5505 2376 4853 3272 3750 4853 4109 3630 6902 3362 2810 3750 803 1985 4128 669 19 6508 900 1635 1871 7377 6122 6017 3750 2289 1099 3938 1571 7509 1375 5393 5589 5037 2115 4707 5310 6811 6093 900 7399 2410 1219 6654 3263 6017 3750 5998 4939 5971 4148 3750 803 1985 7194 4780 796 6038 4231 648 1722 6407 3750 1099 6485 1920 1767 5915 6518 6093 5598 5648 4280 900 7326 6242 5328 1214 3870 1985 7194 5998 5741 2115 913 5950 3800 1538 686 6734 6017 3750 1985 3659 1324 5814 4998 5176 535 7399 307 4068 486 1667 1099 2121 6407 3750 7420 3099 6038 4231 4190 1519 3255 7123 4305 3231 1635 4822 1722 3750 2967 3731 1757 1939 648 473 6518 2400 2614 5330 5530 1394 4939 1903 7495 7239 900 4469 5530 4704 299 7467 2121 669 5693 3750 3618 299 5264 4853 1734 316 2828 5445 4190 4939 3484 6043 2376 1031 761 900 5370 3782 2210 669 2210 3099 1363 
6301 3508 1907 2410 7509 5718 541 3750 803 2967 6758 3038 6641 1985 7194 512 4811 6811 5243 2112 3750 1734 2376 2891 1211 648 7257 4148 7159 1667 3750 5816 4202 2400 5864 3915 7399 3414 1667 5977 7327 7256 2935 4936 1667 2151 900 6831 4599 6182 3227 3859 3099 7509 7256 3750 1985 7194 4128 4691 2029 1344 6630 5598 1465 648 3706 7403 543 3038 900 1985 7194 3800 980 6017 980 4124 648 900 1635 3605 5028 3731 4109 3792 1866 3578 3915 648 4939 1335 6666 6560 3750 3618 3508 1907 2410 1913 6656 3750 2828 4704 4998 4939 7039 3915 4167 5338 3750 803 1985 4939 3263 7123 264 2456 5689 2109 648 3750 6093 1699 5589 4411 1866 4750 648 1667 1099 3000 7420 1279 2975 1141 7148 3750 1985 3915 2570 4936 5998 1877 3000 7420 900 1635 5470 2313 5864 641 4333 3750 3915 5659 316 2828 2770 5176 803 2047 7532 606 6980 1635 3750 803 1750 7039 3800 7245 3099 7509 5839 3750 1866 1401 4321 5788 1519 6122 6405 4939 5998 2729 900 1985 7194 5998 2289 2107 1519 1592 316 2828 1679 4811 5461 3324 4525 4052 3750 2212 742 3750 1985 7194 6631 1335 5445 3750 1465 7194 4128 6887 4819 5977 3223 2717 900 5612 5948 3750 1985 7194 2289 913 3800 4811 6122 2614 2047 7532 606 6980 900 1985 2541 4409 3772 6012 1833 5560 4173 6662 414 340 316 4125 4128 3800 669 6575 4819 5977 900 1635 25 1460 619 7044 4921 648 4407 3800 1241 600 3750 5470 2313 641 4333 7539 803 316 4125 648 3530 6637 569 1985 3000 4659 5610 6917 3750 3618 1985 6887 7010 3870 900 3915 4939 7010 3870 5598 1985 1394 3397 5598 900 1635 1460 619 5708 1335 6518 4148 3750 2410 1219 6654 2252 1702 5598 803 4646 2109 6905 5520 1635 2663 885 5491 1465 4822 1722 5011 2376 4149 1903 2662 3750 803 316 2828 1767 5915 6065 2042 1335 5598 3750 2688 5598 3231 5780 7399 3750 4811 5788 1292 1641 1667 1099 4811 5393 6407 5708 6631 1335 6666 900 316 4125 4811 648 4939 6678 3750 2021 1726 340 4469 4842 4128 669 5393 4801 3154 3750 5780 7399 669 3915 544 62 5602 1913 5598 3750 3859 6759 4939 4646 1913 900 1635 1767 5915 6065 4464 5814 648 2410 1219 6654 1815 1699 6038 
4231 5698 1375 62 307 3750 803 299 5264 1460 316 2828 5445 3750 1985 3414 1667 7509 3223 3750 5998 4939 669 2364 2975 648 900 1985 3038 5938 5168 3770 1667 3750 2717 368 5693 7117 3750 1985 2131 6909 2192 1141 6831 6015 900 3864 7194 1375 5393 1815 1699 1985 5780 7399 5681 3099 5176 3870 5598 3750 1985 3038 3771 6630 7159 1667 900 1635 5659 7377 3166 5445 3750 1793 6666 648 2614 5736 5537 5526 4128 6887 4811 495 6386 900 1465 7194 1767 5659 2410 1219 6654 340 1362 1829 2304 3193 6822 3750 5330 5264 4321 3750 4173 5619 4109 6227 648 5915 6515 4893 5957 6043 3750 5949 4411 5410 1991 4128 826 2490 3193 2602 3750 803 1985 7194 4516 5264 1394 3800 5659 3731 4109 3792 5081 2918 3750 5051 1985 5612 19 3750 3731 4109 3792 5718 7239 3193 6822 900 1635 7377 5736 3750 2205 7305 2620 2042 5192 1745 3605 6887 5278 299 648 5651 7440 1656 3630 1702 3300 7539 803 1985 340 3731 4109 3792 4190 4811 4464 1519 5778 3166 3750 1985 3038 6235 7399 5998 2313 900 1635 25 910 619 4939 1613 248 3193 4741 4893 3750 2967 3731 1757 1939 648 7495 5028 5949 4939 7539 803 4811 2255 3915 3750 1394 4741 900 6887 2255 3915 3750 1394 669 4741 900 1635'"
45 | ]
46 | },
47 | "execution_count": 3,
48 | "metadata": {},
49 | "output_type": "execute_result"
50 | }
51 | ],
52 | "source": [
53 | "corpus[0]"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 13,
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "name": "stdout",
63 | "output_type": "stream",
64 | "text": [
65 | "shape=(250000, 5000)\n"
66 | ]
67 | }
68 | ],
69 | "source": [
70 | "tfidfv = TfidfVectorizer(analyzer=\"word\", min_df=3, max_features=5000, use_idf=True)\n",
71 | "def train_tfidf(corpus):\n",
72 | " out=tfidfv.fit_transform(corpus)\n",
73 | " print(\"shape={}\".format(out.toarray().shape))\n",
74 | " with open(\"../models/tfidf-model.pkl\", \"wb\") as f:\n",
75 | " pickle.dump(tfidfv, f)\n",
76 | "\n",
77 | "train_tfidf(corpus)"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 14,
83 | "metadata": {},
84 | "outputs": [
85 | {
86 | "data": {
87 | "text/plain": [
88 | "5000"
89 | ]
90 | },
91 | "execution_count": 14,
92 | "metadata": {},
93 | "output_type": "execute_result"
94 | }
95 | ],
96 | "source": [
97 | "len(tfidfv.get_feature_names())"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": []
106 | }
107 | ],
108 | "metadata": {
109 | "kernelspec": {
110 | "display_name": "Python 3",
111 | "language": "python",
112 | "name": "python3"
113 | },
114 | "language_info": {
115 | "codemirror_mode": {
116 | "name": "ipython",
117 | "version": 3
118 | },
119 | "file_extension": ".py",
120 | "mimetype": "text/x-python",
121 | "name": "python",
122 | "nbconvert_exporter": "python",
123 | "pygments_lexer": "ipython3",
124 | "version": "3.7.6"
125 | }
126 | },
127 | "nbformat": 4,
128 | "nbformat_minor": 4
129 | }
130 |
--------------------------------------------------------------------------------
/notebooks/03-训练模型-tfidf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import pandas as pd\n",
11 | "\n",
12 | "import pickle\n",
13 | "\n",
14 | "from sklearn.preprocessing import StandardScaler\n",
15 | "from sklearn.model_selection import train_test_split, KFold, StratifiedKFold\n",
16 | "\n",
17 | "from sklearn.ensemble import RandomForestClassifier\n",
18 | "\n",
19 | "from sklearn.metrics import f1_score"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {},
26 | "outputs": [
27 | {
28 | "name": "stdout",
29 | "output_type": "stream",
30 | "text": [
31 | "data shape=(200000, 2)\n"
32 | ]
33 | },
34 | {
35 | "data": {
36 | "text/html": [
37 | "
\n",
38 | "\n",
51 | "
\n",
52 | " \n",
53 | " \n",
54 | " | \n",
55 | " label | \n",
56 | " text | \n",
57 | "
\n",
58 | " \n",
59 | " \n",
60 | " \n",
61 | " | 0 | \n",
62 | " 2 | \n",
63 | " 2967 6758 339 2021 1854 3731 4109 3792 4149 15... | \n",
64 | "
\n",
65 | " \n",
66 | " | 1 | \n",
67 | " 11 | \n",
68 | " 4464 486 6352 5619 2465 4802 1452 3137 5778 54... | \n",
69 | "
\n",
70 | " \n",
71 | " | 2 | \n",
72 | " 3 | \n",
73 | " 7346 4068 5074 3747 5681 6093 1777 2226 7354 6... | \n",
74 | "
\n",
75 | " \n",
76 | "
\n",
77 | "
"
78 | ],
79 | "text/plain": [
80 | " label text\n",
81 | "0 2 2967 6758 339 2021 1854 3731 4109 3792 4149 15...\n",
82 | "1 11 4464 486 6352 5619 2465 4802 1452 3137 5778 54...\n",
83 | "2 3 7346 4068 5074 3747 5681 6093 1777 2226 7354 6..."
84 | ]
85 | },
86 | "execution_count": 2,
87 | "metadata": {},
88 | "output_type": "execute_result"
89 | }
90 | ],
91 | "source": [
92 | "trainData=pd.read_csv(\"../data/train_set.csv\", sep=\"\\t\")\n",
93 | "print(\"data shape={}\".format(trainData.shape))\n",
94 | "trainData.head(3)"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 5,
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "name": "stderr",
104 | "output_type": "stream",
105 | "text": [
106 | "C:\\ProgramData\\Anaconda3\\lib\\importlib\\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n",
107 | " return f(*args, **kwds)\n"
108 | ]
109 | }
110 | ],
111 | "source": [
112 | "with open(\"../models/tfidf-model.pkl\", \"rb\") as f:\n",
113 | " tfidfv=pickle.load(f)\n",
114 | "\n",
115 | "def get_feats(X):\n",
116 | " return tfidfv.transform(X).toarray()\n",
117 | "\n",
118 | "def evaluate(y_true, y_pred):\n",
119 | " return f1_score(y_true, y_pred, average='macro')\n",
120 | "\n",
121 | "def train_test(X_train, y_train, X_test, y_test, model):\n",
122 | " model.fit(X_train, y_train)\n",
123 | " y_pred=model.predict(X_test)\n",
124 | " return evaluate(y_test, y_pred)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 6,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "labelsData=trainData[\"label\"]\n",
134 | "featsData=get_feats(trainData[\"text\"].values)"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 7,
140 | "metadata": {},
141 | "outputs": [
142 | {
143 | "name": "stdout",
144 | "output_type": "stream",
145 | "text": [
146 | "labelsData shape=(200000,), featsData shape=(200000, 5000)\n"
147 | ]
148 | }
149 | ],
150 | "source": [
151 | "print(\"labelsData shape={}, featsData shape={}\".format(labelsData.shape, featsData.shape))"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 9,
157 | "metadata": {},
158 | "outputs": [
159 | {
160 | "name": "stdout",
161 | "output_type": "stream",
162 | "text": [
163 | "X shape=(160000, 5000), y shape=(160000,)\n",
164 | "X shape=(160000, 5000), y shape=(160000,)\n",
165 | "X shape=(160000, 5000), y shape=(160000,)\n",
166 | "X shape=(160000, 5000), y shape=(160000,)\n",
167 | "X shape=(160000, 5000), y shape=(160000,)\n"
168 | ]
169 | },
170 | {
171 | "ename": "AttributeError",
172 | "evalue": "'list' object has no attribute 'mean'",
173 | "output_type": "error",
174 | "traceback": [
175 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
176 | "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
177 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[0mresults\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mf1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 10\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 11\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"mean={}, f1={}\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mresults\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
178 | "\u001b[1;31mAttributeError\u001b[0m: 'list' object has no attribute 'mean'"
179 | ]
180 | }
181 | ],
182 | "source": [
183 | "skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n",
184 | "model = RandomForestClassifier(n_estimators=100)\n",
185 | "results=[]\n",
186 | "for train_index, test_index in skf.split(featsData, labelsData):\n",
187 | " X_train, y_train = featsData[train_index], labelsData[train_index]\n",
188 | " print(\"X shape={}, y shape={}\".format(X_train.shape, y_train.shape))\n",
189 | " X_test, y_test = featsData[test_index], labelsData[test_index]\n",
190 | " f1=train_test(X_train, y_train, X_test, y_test, model)\n",
191 | " results.append(f1)"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 10,
197 | "metadata": {},
198 | "outputs": [
199 | {
200 | "name": "stdout",
201 | "output_type": "stream",
202 | "text": [
203 | "mean=0.8370154885117678, f1=[0.8339345059636758, 0.8336630320405153, 0.8363794884804777, 0.8406588993783254, 0.8404415166958447]\n"
204 | ]
205 | }
206 | ],
207 | "source": [
208 | "print(\"mean={}, f1={}\".format(np.mean(results), results))"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": null,
214 | "metadata": {},
215 | "outputs": [],
216 | "source": []
217 | }
218 | ],
219 | "metadata": {
220 | "kernelspec": {
221 | "display_name": "Python 3",
222 | "language": "python",
223 | "name": "python3"
224 | },
225 | "language_info": {
226 | "codemirror_mode": {
227 | "name": "ipython",
228 | "version": 3
229 | },
230 | "file_extension": ".py",
231 | "mimetype": "text/x-python",
232 | "name": "python",
233 | "nbconvert_exporter": "python",
234 | "pygments_lexer": "ipython3",
235 | "version": "3.7.6"
236 | }
237 | },
238 | "nbformat": 4,
239 | "nbformat_minor": 4
240 | }
241 |
--------------------------------------------------------------------------------
/notebooks/01-数据处理.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 18,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "data shape=(200000, 2)\n"
22 | ]
23 | },
24 | {
25 | "data": {
26 | "text/html": [
27 | "\n",
28 | "\n",
41 | "
\n",
42 | " \n",
43 | " \n",
44 | " | \n",
45 | " label | \n",
46 | " text | \n",
47 | "
\n",
48 | " \n",
49 | " \n",
50 | " \n",
51 | " | 0 | \n",
52 | " 2 | \n",
53 | " 2967 6758 339 2021 1854 3731 4109 3792 4149 15... | \n",
54 | "
\n",
55 | " \n",
56 | " | 1 | \n",
57 | " 11 | \n",
58 | " 4464 486 6352 5619 2465 4802 1452 3137 5778 54... | \n",
59 | "
\n",
60 | " \n",
61 | " | 2 | \n",
62 | " 3 | \n",
63 | " 7346 4068 5074 3747 5681 6093 1777 2226 7354 6... | \n",
64 | "
\n",
65 | " \n",
66 | "
\n",
67 | "
"
68 | ],
69 | "text/plain": [
70 | " label text\n",
71 | "0 2 2967 6758 339 2021 1854 3731 4109 3792 4149 15...\n",
72 | "1 11 4464 486 6352 5619 2465 4802 1452 3137 5778 54...\n",
73 | "2 3 7346 4068 5074 3747 5681 6093 1777 2226 7354 6..."
74 | ]
75 | },
76 | "execution_count": 18,
77 | "metadata": {},
78 | "output_type": "execute_result"
79 | }
80 | ],
81 | "source": [
82 | "trainData=pd.read_csv(\"../data/train_set.csv\", sep=\"\\t\")\n",
83 | "print(\"data shape={}\".format(trainData.shape))\n",
84 | "trainData.head(3)"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 21,
90 | "metadata": {},
91 | "outputs": [
92 | {
93 | "name": "stdout",
94 | "output_type": "stream",
95 | "text": [
96 | "data shape=(50000, 1)\n"
97 | ]
98 | },
99 | {
100 | "data": {
101 | "text/html": [
102 | "\n",
103 | "\n",
116 | "
\n",
117 | " \n",
118 | " \n",
119 | " | \n",
120 | " text | \n",
121 | "
\n",
122 | " \n",
123 | " \n",
124 | " \n",
125 | " | 0 | \n",
126 | " 5399 3117 1070 4321 4568 2621 5466 3772 4516 2... | \n",
127 | "
\n",
128 | " \n",
129 | " | 1 | \n",
130 | " 2491 4109 1757 7539 648 3695 3038 4490 23 7019... | \n",
131 | "
\n",
132 | " \n",
133 | " | 2 | \n",
134 | " 2673 5076 6835 2835 5948 5677 3247 4124 2465 5... | \n",
135 | "
\n",
136 | " \n",
137 | "
\n",
138 | "
"
139 | ],
140 | "text/plain": [
141 | " text\n",
142 | "0 5399 3117 1070 4321 4568 2621 5466 3772 4516 2...\n",
143 | "1 2491 4109 1757 7539 648 3695 3038 4490 23 7019...\n",
144 | "2 2673 5076 6835 2835 5948 5677 3247 4124 2465 5..."
145 | ]
146 | },
147 | "execution_count": 21,
148 | "metadata": {},
149 | "output_type": "execute_result"
150 | }
151 | ],
152 | "source": [
153 | "testAData=pd.read_csv(\"../data/test_a.csv\", sep=\"\\t\")\n",
154 | "print(\"data shape={}\".format(testAData.shape))\n",
155 | "testAData.head(3)"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 20,
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "name": "stdout",
165 | "output_type": "stream",
166 | "text": [
167 | "14 labels\n"
168 | ]
169 | }
170 | ],
171 | "source": [
172 | "label2id={'科技': 0, '股票': 1, '体育': 2, '娱乐': 3, '时政': 4, '社会': 5, '教育': 6, '财经': 7, '家居': 8, '游戏': 9, '房产': 10, '时尚': 11, '彩票': 12, '星座': 13}\n",
173 | "print(\"{} labels\".format(len(label2id)))"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "# 数据分析"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 22,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "trainData[\"length\"]=trainData[\"text\"].map(lambda x: len(x.split(\" \")))"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 27,
195 | "metadata": {},
196 | "outputs": [
197 | {
198 | "data": {
199 | "text/plain": [
200 | "0 38918\n",
201 | "1 36945\n",
202 | "2 31425\n",
203 | "3 22133\n",
204 | "4 15016\n",
205 | "5 12232\n",
206 | "6 9985\n",
207 | "7 8841\n",
208 | "8 7847\n",
209 | "9 5878\n",
210 | "10 4920\n",
211 | "11 3131\n",
212 | "12 1821\n",
213 | "13 908\n",
214 | "Name: label, dtype: int64"
215 | ]
216 | },
217 | "execution_count": 27,
218 | "metadata": {},
219 | "output_type": "execute_result"
220 | }
221 | ],
222 | "source": [
223 | "trainData[\"label\"].value_counts()"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 26,
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "data": {
233 | "text/plain": [
234 | ""
235 | ]
236 | },
237 | "execution_count": 26,
238 | "metadata": {},
239 | "output_type": "execute_result"
240 | },
241 | {
242 | "data": {
243 | "image/png": "\n",
244 | "text/plain": [
245 | ""
246 | ]
247 | },
248 | "metadata": {
249 | "needs_background": "light"
250 | },
251 | "output_type": "display_data"
252 | }
253 | ],
254 | "source": [
255 | "trainData[\"label\"].value_counts().plot(kind=\"bar\")"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 28,
261 | "metadata": {},
262 | "outputs": [
263 | {
264 | "data": {
265 | "text/plain": [
266 | "321 220\n",
267 | "500 214\n",
268 | "452 213\n",
269 | "316 211\n",
270 | "252 208\n",
271 | " ... \n",
272 | "5797 1\n",
273 | "5859 1\n",
274 | "5925 1\n",
275 | "6053 1\n",
276 | "6247 1\n",
277 | "Name: length, Length: 5633, dtype: int64"
278 | ]
279 | },
280 | "execution_count": 28,
281 | "metadata": {},
282 | "output_type": "execute_result"
283 | }
284 | ],
285 | "source": [
286 | "trainData[\"length\"].value_counts()"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 29,
292 | "metadata": {},
293 | "outputs": [
294 | {
295 | "data": {
296 | "text/plain": [
297 | "2538 2506 1363 5466 3772 340 922 433 2397 5778 6656 6609 6122 2525 1702 7495 2435 62 3300 7539 5330 3568 5413 619 3648 5370 3530 2446 459 4167 3750 6740 1699 7543 6930 4893 2444 1460 6122 1679 5036 7363 648 1903 4893 541 6104 340 3263 5949 3750 4969 1279 2364 6101 648 7495 5801 3530 6508 3750 7495 2435 669 6333 2538 4822 6833 1722 2112 340 4531 541 2112 3750 6122 3731 4811 910 6759 5036 648 4811 2121 4409 6286 3750 5948 4679 4650 2693 6713 5612 648 151 2446 1699 1952 3750 4017 920 5096 2539 6045 478 7239 1241 3056 900 5028 4151 7539 2397 5778 922 433 2471 5011 7539 2465 2109 619 4464 1324 4741 4958 7055 2465 1324 6252 4464 1324 2380 1854 7055 6644 1324 6252 1324 2400 2112 7055 6644 2799 6252 6644 1324 2984 1854 7055 4063 619 6644 2799 7134 2376 900 2465 4677 3775 2252 1702 922 885 1633 1633 1985 3915 922 885 11\n",
298 | "2538 2506 1363 1913 4148 1903 5036 1866 7495 1327 94 7495 2435 62 3300 7539 5330 3568 5413 619 3648 5370 3530 2446 459 4167 3750 6740 1699 7543 6930 4893 2444 1460 6122 1679 5036 7363 648 1903 4893 541 6104 340 3263 5949 3750 4969 1279 2364 6101 648 7495 5801 3530 6508 3750 7495 2435 669 6333 2538 4822 6833 1722 2112 340 4531 541 2112 3750 6122 3731 4811 910 6759 5036 648 4811 2121 4409 6286 3750 5948 4679 4650 2693 6713 5612 648 151 2446 1699 1952 3750 4017 920 5096 2539 6045 478 7239 1241 3056 900 2538 2506 1363 1913 4148 1903 5036 465 2004 1866 7495 922 433 4677 3775 2252 1702 922 885 1633 1633 1985 3915 922 885 8\n",
299 | "2538 2506 1363 5466 3772 340 922 433 2397 5778 6656 6609 6122 2525 1702 7495 2435 62 3300 7539 5330 3568 5413 619 3648 5370 3530 2446 459 4167 3750 6740 1699 7543 6930 4893 2444 1460 6122 1679 5036 7363 648 1903 4893 541 6104 340 3263 5949 3750 4969 1279 2364 6101 648 7495 5801 3530 6508 3750 7495 2435 669 6333 2538 4822 6833 1722 2112 340 4531 541 2112 3750 6122 3731 4811 910 6759 5036 648 4811 2121 4409 6286 3750 5948 4679 4650 2693 6713 5612 648 151 2446 1699 1952 3750 4017 920 5096 2539 6045 478 7239 1241 3056 900 5028 4151 7539 2397 5778 922 433 2471 5011 7539 2465 2109 619 4464 1324 4741 4958 7055 2465 1324 6252 4464 1324 2380 1854 7055 6644 1324 6252 1324 2400 2112 7055 6644 2799 6252 6644 1324 2984 1854 7055 4063 619 6644 2799 7134 2376 900 2465 6\n",
300 | "2538 2506 1363 5466 3772 340 922 433 2397 5778 6656 6609 6122 2525 1702 5028 4151 7539 2397 5778 922 433 2471 5011 7539 2465 2109 619 4464 1324 4741 4958 7055 2465 1324 6252 4464 1324 2380 1854 7055 6644 1324 6252 1324 2400 2112 7055 6644 2799 6252 6644 1324 2984 1854 7055 4063 619 6644 2799 7134 2376 900 2465 1633 1633 1633 1767 4811 1915 1731 4612 4340 847 6770 1915 3007 922 885 1633 1633 1985 3915 922 885 4\n",
301 | "2538 2506 1363 1913 4148 1903 5036 1866 7495 1327 94 7495 2435 62 3300 7539 5330 3568 5413 619 3648 5370 3530 2446 459 4167 3750 6740 1699 7543 6930 4893 2444 1460 6122 1679 5036 7363 648 1903 4893 541 6104 340 3263 5949 3750 4969 1279 2364 6101 648 7495 5801 3530 6508 3750 7495 2435 669 6333 2538 4822 6833 1722 2112 340 4531 541 2112 3750 6122 3731 4811 910 6759 5036 648 4811 2121 4409 6286 3750 5948 4679 4650 2693 6713 5612 648 151 2446 1699 1952 3750 4017 920 5096 2539 6045 478 7239 1241 3056 900 2538 2506 1363 1913 4148 1903 5036 465 2004 1866 7495 922 433 4\n",
302 | " ..\n",
303 | "7436 7010 2828 2515 3374 3961 5190 1299 4562 736 4499 6065 5724 1302 5330 3530 134 2936 5491 7058 6045 2465 1362 1242 7362 2662 1699 7219 2022 1324 2073 4464 4149 4853 1070 5602 4464 2211 408 671 1407 4409 4853 3750 2400 4411 7436 7010 7309 2549 151 2859 4650 2512 7543 151 2859 5999 4853 2828 2515 4411 1141 5560 1914 3860 2289 2109 1407 4802 5936 2685 5389 2810 3750 5948 3068 1348 1699 5560 2400 4399 5560 1744 4109 5560 6501 4109 5560 5604 3117 5560 2107 5165 5560 5858 2364 1277 1299 913 3750 5176 2770 5598 803 1211 7256 5724 6065 5620 3317 3750 6983 7444 299 4559 5718 1121 1635 6831 761 2471 6798 3317 7010 900 7377 1308 6093 1699 7255 1511 4562 2828 2515 6065 5724 3374 3961 5190 1299 4562 736 4499 6065 5724 3374 3893 648 7495 5777 3607 900 7399 5999 4853 648 5296 5397 1348 7309 2614 803 1211 7256 5724 6065 5620 3317 3750 6983 7444 299 4559 5718 1121 1635 3317 7010 2400 3750 4650 2512 7436 7010 4190 1308 2210 5598 2109 4298 6966 7449 2602 6822 4030 6101 3750 4650 2512 6357 7486 1407 2463 5491 736 6966 7449 4464 3370 3370 3370 462 6770 5271 4409 6286 465 2662 5560 4650 2512 6357 1389 5511 7417 1991 3630 5538 4811 330 151 2859 5491 736 6966 7449 1324 3370 3370 3370 462 5724 6065 299 4559 6227 2662 5560 4650 2512 6357 1334 1351 4559 3630 5538 4811 330 151 2859 5491 736 6966 7449 3659 3370 3370 462 5724 6065 299 4559 6227 2662 1277 5659 4650 2512 7436 7010 2602 5935 5598 5724 6065 3374 3893 1629 6980 6966 7449 5105 2791 900 1985 4411 6038 6831 4744 7495 648 6770 5271 3374 3893 2490 5906 5977 3859 5430 619 4650 2512 1407 6407 2935 4779 5560 1987 1334 4409 6286 1695 5526 1277 3223 1903 1582 3134 900 5051 4205 3750 803 5724 6065 1635 4939 2400 4411 7436 7010 648 6065 5724 3893 4396 5338 1879 900 7436 7010 7377 5677 2471 6798 3317 7010 2490 6831 3915 4499 3097 6770 5271 2461 6101 648 5724 6065 299 4559 5330 5560 299 4559 6227 5560 4409 6286 465 5560 3154 101 2515 465 340 3154 101 2738 3770 1277 900 1\n",
304 | "794 5573 1844 4923 5589 1667 5915 6178 6248 913 7539 4149 3370 1141 2106 4462 4381 6637 5702 5778 3641 1362 2970 7110 2465 5330 3530 134 2465 7399 4464 3659 2073 3659 3370 4853 1460 5736 5915 6178 2024 6713 2109 6248 4411 4181 4562 6248 3067 6831 5498 648 803 5296 6319 1241 3484 2400 648 408 671 5915 6178 1635 885 948 299 6220 794 5573 1844 4923 5589 1667 5915 6178 6248 913 1146 2119 7127 281 4301 2465 6293 6250 2465 3661 3586 2119 1361 3809 1066 4902 2119 3694 1702 3300 4381 6637 1264 619 5264 3272 4516 2990 1693 4166 6220 7154 5426 3780 2984 3374 2522 5948 2429 6996 5589 1667 4462 2674 4462 4381 3578 6909 6980 648 4516 6390 900 1633 1633 1633 1633 2541 910 3530 6508 7539 1633 1633 1633 1633 4411 4181 4381 6637 6405 1620 6065 1324 1141 2106 1633 1633 1633 1633 4381 6637 4516 6656 265 3691 2087 730 339 7212 2465 4381 1407 2120 5778 1334 4298 6630 4659 910 478 1633 1633 1633 1633 2210 3961 4893 2444 3335 7261 4151 1519 2379 6012 4381 6637 5702 4811 7212 5122 6065 6065 7186 1633 1633 1633 1633 4381 6637 4659 751 4190 3800 6920 3223 318 4648 1633 1633 1633 1633 5968 4381 1274 5938 349 5977 4381 6637 669 62 307 1633 1633 1633 1633 4151 1519 4411 4181 4381 6637 3504 6637 6920 6810 5602 1324 1141 2106 1633 1633 1633 1633 5096 2539 6045 5139 5410 347 1592 2465 4411 4181 4381 6637 4981 4063 5445 3484 6920 4646 7186 1633 1633 1633 1633 4381 6637 4516 6656 265 3691 1987 1334 2087 730 339 7212 1\n",
305 | "5505 5165 6682 2515 1141 4411 6333 2410 4409 7546 4630 4646 4230 1141 2106 5036 2151 5096 2539 6835 5296 367 7058 6045 2465 6325 4005 5505 5165 4411 4181 5659 1141 4411 6333 2410 4409 5296 6319 1844 4923 5999 4853 7399 1779 2602 4861 3706 6602 5778 192 4417 3563 5920 3750 314 5858 2490 6093 133 6122 1641 5036 2151 5096 2539 6835 5296 3750 1215 4671 6846 1699 4646 4230 1141 2106 900 5051 4205 3750 6333 2410 4409 340 5505 5165 2490 7543 5328 2376 2539 1324 4230 1141 2106 340 4464 4230 1141 2106 3750 6333 2410 4409 2490 6093 1699 5778 5950 2461 6835 5296 648 4811 330 5778 5950 4893 5491 1146 1647 2662 3750 25 5505 5165 5864 2490 1214 2466 3012 6854 5096 2539 872 1660 648 3117 1363 5778 5950 4893 5491 5724 1647 2662 900 6333 2410 4409 1702 3300 3750 5998 669 6740 4939 151 2859 3495 4220 299 3634 1070 2107 5858 2539 3374 5640 3641 6045 62 7486 648 1866 2109 3440 936 6835 5296 5096 4958 3750 3166 5445 4128 4939 6333 2410 4409 4167 5410 7399 2400 4411 6748 6122 648 6122 936 7377 1605 7467 4958 900 5505 5165 1844 4923 431 5537 5689 7038 7309 2346 5864 4333 3750 314 5858 7377 5677 5778 192 4939 5505 5165 4411 4181 4576 3706 6602 648 7495 2313 7160 3093 3750 5338 5803 7123 5505 5165 2539 3374 5640 3641 3961 5526 1080 4958 7495 1693 4166 900 314 5858 3504 1702 3300 3750 1460 2400 4411 5915 6178 6093 5689 5948 3068 2400 4411 2539 5330 6357 2614 1279 2975 4811 4409 3870 3750 5282 6017 5096 2539 6831 761 3263 4124 2087 730 3893 1582 3134 900 6333 2410 4409 4939 1141 4411 1070 1036 2109 1304 3056 151 2859 3750 5573 4516 5640 3641 2539 3374 2304 4149 1324 3370 3370 4230 1141 2106 3750 5589 1667 1304 3056 6333 3440 4986 2304 3659 6250 3700 4149 6242 4230 1141 2106 900 5505 5165 1844 4923 4939 2400 4411 1866 2109 150 1511 2397 5778 670 3961 5505 5165 4411 4181 4811 330 151 2859 648 1252 151 2859 3750 5095 4811 4114 3198 5560 3523 3686 3374 5176 2252 5560 6615 2328 5560 2164 3961 5560 5936 2685 5788 2109 3374 3961 3068 3706 6602 5096 2539 3961 5526 3750 5573 4516 4811 5505 
5165 4114 3198 5560 5505 3686 5560 4105 6615 5036 4216 1277 3223 913 299 6357 151 2859 900 4677 3775 2252 1702 922 885 1633 1633 1985 3915 922 885 1\n",
306 | "3912 7261 2463 5602 2073 6734 2252 4211 5724 5271 1647 1277 5235 4269 5011 5051 5330 3530 134 2465 5491 7058 6045 2465 6046 2970 659 2662 7399 5736 671 648 383 4040 641 2400 3750 3912 7261 2463 5011 5051 5938 349 965 1080 6122 6050 900 4411 913 3912 7261 2463 5999 4853 1702 3300 3750 5057 3659 3370 4464 4464 1519 5602 2073 4216 6734 3750 1460 5736 151 4211 4411 5619 5620 3374 1215 94 5491 5724 5271 1647 2662 5560 4671 6846 5948 299 2597 3961 2380 23 94 5560 4122 1906 2539 3374 5096 2539 5491 669 6625 3501 2147 2662 5560 2205 3800 2087 730 3893 5936 2685 1215 4986 4321 1605 3912 7261 6832 5338 648 5235 4269 5011 5051 900 1375 606 4124 2402 993 3945 3272 5915 6178 5681 4576 3912 7261 2463 5999 4853 1702 3300 3750 5659 3166 4269 2029 4480 648 1460 4269 6835 3272 4939 299 1519 3166 3272 669 3166 3750 5235 4269 2029 4480 4939 2541 5347 2289 3272 1460 4269 648 450 6929 3750 1375 5948 606 4124 3686 2402 993 3945 3272 5915 6178 5681 4576 3750 4939 4411 4181 299 151 1815 648 1080 2210 3945 3272 5915 6178 3015 4148 7543 6930 648 4811 2121 2597 4326 900 5051 5598 6453 3750 5689 3272 5948 6017 3750 1985 4411 648 2109 2986 7543 2073 4480 340 2621 4480 3912 7261 6832 5338 1679 7261 663 3166 4269 2029 4480 3750 6887 4811 5235 4269 2029 4480 900 3912 7261 2463 2541 910 3012 6854 4893 5999 4853 1344 6630 7058 6045 6966 4053 5445 1702 3300 3750 7327 5640 3166 4269 2029 4480 3824 619 7261 663 3750 3618 2499 648 3129 6104 4939 2289 1903 1460 4269 5445 3272 1920 7051 606 5689 3750 2073 4480 5011 5051 1920 7051 4464 4464 1903 2073 3750 2621 4480 5011 5051 1920 7051 6065 1903 2621 4480 3750 669 3824 619 2402 993 1401 3272 5681 7010 3015 4148 900 4411 913 3912 7261 2463 2252 4211 648 5011 5051 3750 2975 3615 4659 4411 5619 5736 6017 6038 6248 671 1277 1582 3134 648 383 4040 3750 64 3220 1699 6122 2489 3912 7261 5011 5051 648 3129 4355 3750 1816 3099 6248 671 3648 5370 2663 541 3397 4148 5445 2396 7399 669 1952 3695 2112 3750 4822 2400 5235 4269 5011 5051 1394 4939 4417 6630 910 478 
648 5011 5051 4525 6122 900 3912 7261 2463 4128 3223 5677 1702 3300 2490 1460 3912 7261 5858 5949 1080 2210 2192 5168 340 4779 2364 900 3659 3370 4464 3370 1519 6985 3750 3912 7261 2463 1702 3300 3750 2490 2252 4211 512 2402 993 3015 4148 648 5235 4269 5011 5051 3750 4969 1767 5915 3695 1906 2541 3859 648 3912 7261 5858 5949 3750 7377 5977 1080 2210 5598 3659 3370 4464 3370 1519 1699 3272 6122 1519 648 5619 2986 3607 663 900 3912 7261 2463 2541 910 3012 6854 4893 5999 4853 3097 4636 3750 5235 4269 5011 5051 6122 3762 3578 3915 1080 2210 2621 5139 6656 6609 3750 6467 6535 2621 5139 3220 1919 648 4498 2315 7055 64 1363 2975 6040 5955 4516 3750 5235 4269 3912 7261 5011 5051 648 600 7010 868 4480 2109 619 3166 4269 3912 7261 5011 5051 648 600 7010 868 4480 3750 3220 7377 7399 1816 5430 5235 4269 5011 5051 1080 2210 5915 6178 7543 6930 5445 3750 669 3859 6740 3263 6122 1903 2073 4480 5787 2621 4480 648 5235 4269 5011 5051 3750 25 4939 3859 6759 2042 3193 541 3299 6122 4166 5445 1920 648 5235 4269 5011 5051 900 4516 5264 5788 151 4211 6122 2621 4480 5011 5051 7366 7037 4411 913 3912 7261 2463 648 5011 5051 2252 4211 4853 7160 2967 4291 3750 4516 5264 5788 5491 5602 2073 4464 1324 4853 2662 2490 151 4211 6122 2621 4480 5915 6178 5011 5051 3750 3335 7261 2211 5445 2490 7039 5677 2252 4211 5235 4269 5011 5051 900 4167 5410 1985 4411 1767 5915 2252 4211 5598 2087 730 6045 6012 6637 6832 5011 1731 1647 6469 5560 5948 3068 2597 3961 3893 2376 1299 6637 5702 1647 1647 6469 648 2073 4480 3166 4269 340 5235 4269 5011 5051 900 7377 5677 2192 5168 5977 3750 3220 1699 5724 5271 1647 1699 2621 4480 5011 5051 3750 3335 7261 7399 2252 4211 2621 4480 5724 5271 1647 3166 4269 5011 5051 3166 5445 2252 4211 5235 4269 5011 5051 3750 4671 6846 5948 299 2597 3961 2380 23 94 1277 5011 5051 1375 5393 3800 151 4211 2073 4480 5235 4269 340 3166 4269 5011 5051 900 4399 3961 4412 2210 7039 3396 5915 6178 6248 913 7125 4721 7023 5999 4853 1702 3300 3750 5235 4269 5011 5051 648 3019 663 4269 7509 
3223 151 6350 3038 1043 648 3915 2522 900 6122 5858 1567 3750 2400 4411 1375 4269 4117 2519 648 3495 4220 5011 5051 3765 3945 3750 3196 7370 7399 4269 606 648 5011 5051 648 383 6980 340 5011 6980 299 3605 669 2688 3750 4822 5677 3750 2400 4411 648 3501 3495 5139 38 4853 7399 151 3495 2400 2975 2975 2862 7436 3750 4525 5410 6122 2489 669 656 38 648 5139 4853 4269 1871 4780 4151 5139 3750 5780 7399 656 38 5598 3750 3220 7377 3605 3915 6467 6535 5998 2489 2621 5139 2112 648 3220 1919 900 5057 4411 5736 5915 2348 6017 3263 3750 1141 4411 4128 5430 5598 4464 3370 1519 4998 1891 1211 3686 6453 872 5598 6122 2489 2621 5139 6656 6609 648 1031 761 900 6535 5598 5998 2489 3220 1919 3750 5591 541 2252 2770 2400 4411 913 7509 2717 6093 116 3686 1816 5430 5235 4269 6832 5338 3750 3220 7377 2252 2770 2400 4411 913 957 5096 2539 4934 7010 3750 5915 6178 1702 5780 512 1699 600 7010 3750 25 2252 2304 4411 913 957 2087 730 4934 7010 3750 5011 5051 2541 1460 486 1735 3750 3220 7377 3915 1698 5606 3263 1697 5235 4269 6832 5338 1702 2304 648 5915 6178 7255 2210 648 7239 6015 900 646 2465 656 2109 443 4167 5410 3750 2109 3223 5011 2252 2304 4411 913 3504 1460 6831 3915 5915 6178 6832 5338 5176 2770 5235 4269 3912 7261 3750 4969 1080 2210 2621 5139 6656 6609 3750 2252 4211 5235 4269 3912 7261 5011 5051 3750 6832 5338 1250 3068 2621 4480 5724 5271 1647 5560 1394 3961 5560 6637 5702 5560 2597 3961 5560 1080 2376 4117 5560 5096 2539 5560 3523 5289 4630 4270 5560 2693 1363 5560 517 5620 6333 6663 5560 3501 3961 5560 5393 3764 5560 5296 6319 1277 5041 1903 1582 3134 7055 6122 2489 2252 2770 2400 4411 913 4128 2252 4211 5235 4269 5011 5051 3750 4811 648 4411 913 2252 4211 648 6832 5338 4269 606 5589 1567 3750 4811 648 4411 913 6740 2252 4211 2621 4480 5724 5271 1647 5560 2597 3961 5620 3374 6832 5011 1277 1903 5328 1736 3915 5915 6178 6832 5338 648 5235 4269 5011 5051 900 5491 6046 2970 659 2662 1\n",
307 | "1141 5036 922 885 7539 5296 6637 4407 6920 4811 3955 5037 1302 6289 3809 3694 7077 2119 3560 3283 3809 3560 2154 1258 3039 6822 4464 3370 2073 3659 4149 4853 134 2936 1722 1610 1722 5028 3750 6160 5296 6637 5702 7399 5264 6122 648 4516 6920 1722 7399 6887 4811 3373 7370 94 3099 2109 7029 4063 2274 648 900 4326 5510 5028 6017 3750 5998 4939 3220 1699 7399 4490 4392 6521 1903 2073 2212 2400 3750 6160 5296 6357 2614 648 5906 4148 404 192 6045 7194 6122 6405 1270 1854 7123 3744 4040 6831 6015 648 6886 4480 3750 1767 5915 4270 6093 5598 6160 5296 3456 6357 2255 5997 5322 980 648 2910 197 4525 4648 3750 5998 5057 5488 4148 7543 6930 648 4351 4480 5028 6017 6038 742 1394 4939 6122 1903 3263 5122 134 4430 3750 742 25 3750 7399 5330 2073 3937 2489 5445 1323 3750 1465 7194 507 3223 4893 1767 5915 5176 535 6729 3442 4407 5681 5598 133 2614 3750 1866 3461 23 4958 5598 3456 6357 648 1247 1460 900 4811 5504 619 7377 3750 5488 4148 7543 6930 913 1815 1699 3750 7399 6160 5296 6357 2614 2210 6040 2212 2400 3750 6040 7144 1567 3220 1919 1767 5915 669 544 1699 299 5122 62 7486 1913 2109 1641 1854 5598 900 5537 1722 299 3750 7399 4464 3370 2073 4464 1324 4853 384 2693 5935 2147 648 7346 5770 781 5470 5296 6319 6825 3915 3747 1866 7495 6122 3272 2212 2400 3750 1985 7194 1394 137 5915 6832 2376 3750 803 6810 2717 5057 5488 4148 7543 6930 648 4351 4480 3263 4392 3750 5057 6160 5296 5122 4148 2212 2400 3255 25 6713 1308 2109 2252 1151 4568 648 5445 1920 1767 5915 4490 4392 5598 900 1635 512 23 6641 4893 669 656 3870 648 4939 3750 565 742 5296 6637 1401 3272 1767 5915 3615 6630 5598 6122 1906 7160 4480 648 4002 7371 3750 3618 4939 6160 5296 5906 4148 404 192 6045 7194 648 6040 7144 3750 5780 7399 4653 4969 669 4269 4866 2073 2400 7513 5445 6017 3099 623 4819 900 4167 5410 3750 7346 5770 781 5470 5296 6319 6825 3915 3747 5612 2131 667 648 3945 3272 5906 4148 404 192 6160 5296 5096 2539 1363 134 3750 4822 486 3504 4499 4641 5096 2539 3247 7543 7160 4480 1699 1324 6065 6250 2799 7186 
3750 340 4464 3370 2073 4464 1324 4853 5445 6521 2230 7129 6983 7305 5328 900 5659 7377 3166 5445 3750 1985 7194 512 23 3859 6759 3263 4659 3750 7399 4490 4392 2289 1519 1920 3750 6521 5677 6160 5296 6357 2614 648 5122 4148 1866 3461 6630 4002 5445 3750 5998 6122 5011 2007 2109 1395 4128 3605 4939 5780 7399 648 4068 486 900 6788 742 3750 4128 507 5282 6017 648 6122 4166 5445 1920 4525 5977 3750 6357 2614 299 1394 3800 1736 7495 4941 1844 6734 7159 2688 648 3744 4040 6831 6015 6040 7144 3750 1699 5296 6637 5122 4148 648 5505 1914 2138 1906 2699 1722 648 6835 6192 3750 3618 4939 7399 7377 4525 5410 3750 5488 4148 7543 6930 913 3800 2446 2968 1985 7194 3750 6160 5296 6357 2614 648 4516 6920 5555 1500 4190 2490 1854 3193 6122 4166 5445 1920 900 5491 5330 3568 192 6045 7539 6289 3809 3694 7077 2465 1934 343 3586 3477 2119 3694 3560 2662 1\n",
308 | "Name: text, Length: 199903, dtype: int64"
309 | ]
310 | },
311 | "execution_count": 29,
312 | "metadata": {},
313 | "output_type": "execute_result"
314 | }
315 | ],
316 | "source": [
317 | "trainData[\"text\"].value_counts()"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": 23,
323 | "metadata": {},
324 | "outputs": [
325 | {
326 | "data": {
327 | "text/html": [
328 | "\n",
329 | "\n",
342 | "
\n",
343 | " \n",
344 | " \n",
345 | " | \n",
346 | " label | \n",
347 | " length | \n",
348 | "
\n",
349 | " \n",
350 | " \n",
351 | " \n",
352 | " | count | \n",
353 | " 200000.000000 | \n",
354 | " 200000.000000 | \n",
355 | "
\n",
356 | " \n",
357 | " | mean | \n",
358 | " 3.210950 | \n",
359 | " 907.207110 | \n",
360 | "
\n",
361 | " \n",
362 | " | std | \n",
363 | " 3.084955 | \n",
364 | " 996.029036 | \n",
365 | "
\n",
366 | " \n",
367 | " | min | \n",
368 | " 0.000000 | \n",
369 | " 2.000000 | \n",
370 | "
\n",
371 | " \n",
372 | " | 25% | \n",
373 | " 1.000000 | \n",
374 | " 374.000000 | \n",
375 | "
\n",
376 | " \n",
377 | " | 50% | \n",
378 | " 2.000000 | \n",
379 | " 676.000000 | \n",
380 | "
\n",
381 | " \n",
382 | " | 75% | \n",
383 | " 5.000000 | \n",
384 | " 1131.000000 | \n",
385 | "
\n",
386 | " \n",
387 | " | max | \n",
388 | " 13.000000 | \n",
389 | " 57921.000000 | \n",
390 | "
\n",
391 | " \n",
392 | "
\n",
393 | "
"
394 | ],
395 | "text/plain": [
396 | " label length\n",
397 | "count 200000.000000 200000.000000\n",
398 | "mean 3.210950 907.207110\n",
399 | "std 3.084955 996.029036\n",
400 | "min 0.000000 2.000000\n",
401 | "25% 1.000000 374.000000\n",
402 | "50% 2.000000 676.000000\n",
403 | "75% 5.000000 1131.000000\n",
404 | "max 13.000000 57921.000000"
405 | ]
406 | },
407 | "execution_count": 23,
408 | "metadata": {},
409 | "output_type": "execute_result"
410 | }
411 | ],
412 | "source": [
413 | "trainData.describe()"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 32,
419 | "metadata": {},
420 | "outputs": [
421 | {
422 | "data": {
423 | "text/plain": [
424 | "'2967 6758 339 2021 1854 3731 4109 3792 4149 1519 2058 3912 2465 2410 1219 6654 7539 264 2456 4811 1292 2109 6905 5520 7058 6045 3634 6591 3530 6508 2465 7044 1519 3659 2073 3750 3731 4109 3792 6831 2614 3370 4269 3370 486 5770 4109 4125 3750 5445 2466 6831 6758 3743 3630 1726 2313 5906 826 4516 657 900 1871 7044 3750 2967 3731 1757 1939 648 2828 4704 7039 3706 3750 965 2490 7399 3743 2145 2407 7451 3775 6017 5998 1641 299 4704 2621 7029 3056 6333 433 648 1667 1099 900 2289 1099 648 5780 220 7044 1279 7426 4269 3750 2967 6758 6631 3099 2205 7305 2620 5977 3750 3329 1793 6666 2042 3193 4149 1519 7039 3706 2446 5399 648 4124 2058 3912 248 3193 2252 5649 2212 4939 7239 3310 4525 2400 900 5770 4109 4125 7044 4921 265 1397 4699 1699 669 6407 3750 1271 1271 4741 669 4659 3870 4030 4167 5338 25 3466 6909 4417 1859 3750 1465 7194 648 3938 1571 848 6986 827 2124 3750 1991 7444 7037 2729 908 6308 3750 1889 6810 4190 591 5598 2289 2109 6831 6407 2400 5410 517 900 25 3731 4109 3792 4128 1679 4811 4853 4109 3630 6902 6122 1903 1736 3915 2602 6822 3750 6630 4265 591 729 4448 648 1465 1401 4853 648 5881 6182 4128 1679 4939 2646 652 340 7328 1320 900 1460 619 5505 2376 4853 3272 3750 4853 4109 3630 6902 3362 2810 3750 803 1985 4128 669 19 6508 900 1635 1871 7377 6122 6017 3750 2289 1099 3938 1571 7509 1375 5393 5589 5037 2115 4707 5310 6811 6093 900 7399 2410 1219 6654 3263 6017 3750 5998 4939 5971 4148 3750 803 1985 7194 4780 796 6038 4231 648 1722 6407 3750 1099 6485 1920 1767 5915 6518 6093 5598 5648 4280 900 7326 6242 5328 1214 3870 1985 7194 5998 5741 2115 913 5950 3800 1538 686 6734 6017 3750 1985 3659 1324 5814 4998 5176 535 7399 307 4068 486 1667 1099 2121 6407 3750 7420 3099 6038 4231 4190 1519 3255 7123 4305 3231 1635 4822 1722 3750 2967 3731 1757 1939 648 473 6518 2400 2614 5330 5530 1394 4939 1903 7495 7239 900 4469 5530 4704 299 7467 2121 669 5693 3750 3618 299 5264 4853 1734 316 2828 5445 4190 4939 3484 6043 2376 1031 761 900 5370 3782 2210 669 2210 3099 1363 
6301 3508 1907 2410 7509 5718 541 3750 803 2967 6758 3038 6641 1985 7194 512 4811 6811 5243 2112 3750 1734 2376 2891 1211 648 7257 4148 7159 1667 3750 5816 4202 2400 5864 3915 7399 3414 1667 5977 7327 7256 2935 4936 1667 2151 900 6831 4599 6182 3227 3859 3099 7509 7256 3750 1985 7194 4128 4691 2029 1344 6630 5598 1465 648 3706 7403 543 3038 900 1985 7194 3800 980 6017 980 4124 648 900 1635 3605 5028 3731 4109 3792 1866 3578 3915 648 4939 1335 6666 6560 3750 3618 3508 1907 2410 1913 6656 3750 2828 4704 4998 4939 7039 3915 4167 5338 3750 803 1985 4939 3263 7123 264 2456 5689 2109 648 3750 6093 1699 5589 4411 1866 4750 648 1667 1099 3000 7420 1279 2975 1141 7148 3750 1985 3915 2570 4936 5998 1877 3000 7420 900 1635 5470 2313 5864 641 4333 3750 3915 5659 316 2828 2770 5176 803 2047 7532 606 6980 1635 3750 803 1750 7039 3800 7245 3099 7509 5839 3750 1866 1401 4321 5788 1519 6122 6405 4939 5998 2729 900 1985 7194 5998 2289 2107 1519 1592 316 2828 1679 4811 5461 3324 4525 4052 3750 2212 742 3750 1985 7194 6631 1335 5445 3750 1465 7194 4128 6887 4819 5977 3223 2717 900 5612 5948 3750 1985 7194 2289 913 3800 4811 6122 2614 2047 7532 606 6980 900 1985 2541 4409 3772 6012 1833 5560 4173 6662 414 340 316 4125 4128 3800 669 6575 4819 5977 900 1635 25 1460 619 7044 4921 648 4407 3800 1241 600 3750 5470 2313 641 4333 7539 803 316 4125 648 3530 6637 569 1985 3000 4659 5610 6917 3750 3618 1985 6887 7010 3870 900 3915 4939 7010 3870 5598 1985 1394 3397 5598 900 1635 1460 619 5708 1335 6518 4148 3750 2410 1219 6654 2252 1702 5598 803 4646 2109 6905 5520 1635 2663 885 5491 1465 4822 1722 5011 2376 4149 1903 2662 3750 803 316 2828 1767 5915 6065 2042 1335 5598 3750 2688 5598 3231 5780 7399 3750 4811 5788 1292 1641 1667 1099 4811 5393 6407 5708 6631 1335 6666 900 316 4125 4811 648 4939 6678 3750 2021 1726 340 4469 4842 4128 669 5393 4801 3154 3750 5780 7399 669 3915 544 62 5602 1913 5598 3750 3859 6759 4939 4646 1913 900 1635 1767 5915 6065 4464 5814 648 2410 1219 6654 1815 1699 6038 
4231 5698 1375 62 307 3750 803 299 5264 1460 316 2828 5445 3750 1985 3414 1667 7509 3223 3750 5998 4939 669 2364 2975 648 900 1985 3038 5938 5168 3770 1667 3750 2717 368 5693 7117 3750 1985 2131 6909 2192 1141 6831 6015 900 3864 7194 1375 5393 1815 1699 1985 5780 7399 5681 3099 5176 3870 5598 3750 1985 3038 3771 6630 7159 1667 900 1635 5659 7377 3166 5445 3750 1793 6666 648 2614 5736 5537 5526 4128 6887 4811 495 6386 900 1465 7194 1767 5659 2410 1219 6654 340 1362 1829 2304 3193 6822 3750 5330 5264 4321 3750 4173 5619 4109 6227 648 5915 6515 4893 5957 6043 3750 5949 4411 5410 1991 4128 826 2490 3193 2602 3750 803 1985 7194 4516 5264 1394 3800 5659 3731 4109 3792 5081 2918 3750 5051 1985 5612 19 3750 3731 4109 3792 5718 7239 3193 6822 900 1635 7377 5736 3750 2205 7305 2620 2042 5192 1745 3605 6887 5278 299 648 5651 7440 1656 3630 1702 3300 7539 803 1985 340 3731 4109 3792 4190 4811 4464 1519 5778 3166 3750 1985 3038 6235 7399 5998 2313 900 1635 25 910 619 4939 1613 248 3193 4741 4893 3750 2967 3731 1757 1939 648 7495 5028 5949 4939 7539 803 4811 2255 3915 3750 1394 4741 900 6887 2255 3915 3750 1394 669 4741 900 1635'"
425 | ]
426 | },
427 | "execution_count": 32,
428 | "metadata": {},
429 | "output_type": "execute_result"
430 | }
431 | ],
432 | "source": [
433 | "trainData.loc[0, \"text\"]"
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": 36,
439 | "metadata": {},
440 | "outputs": [],
441 | "source": [
442 | "# 构建字频表\n",
443 | "char2tf={}\n",
444 | "for i in range(trainData.shape[0]):\n",
445 | " t=trainData.iloc[i][\"text\"].split()\n",
446 | " for k in t:\n",
447 | " if k not in char2tf.keys():\n",
448 | " char2tf[k] = 1\n",
449 | " else:\n",
450 | " char2tf[k] += 1"
451 | ]
452 | },
453 | {
454 | "cell_type": "code",
455 | "execution_count": 37,
456 | "metadata": {},
457 | "outputs": [
458 | {
459 | "data": {
460 | "text/plain": [
461 | "7482224"
462 | ]
463 | },
464 | "execution_count": 37,
465 | "metadata": {},
466 | "output_type": "execute_result"
467 | }
468 | ],
469 | "source": [
470 | "max(char2tf.values())"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": 39,
476 | "metadata": {},
477 | "outputs": [
478 | {
479 | "data": {
480 | "text/plain": [
481 | "1"
482 | ]
483 | },
484 | "execution_count": 39,
485 | "metadata": {},
486 | "output_type": "execute_result"
487 | }
488 | ],
489 | "source": [
490 | "min(char2tf.values())"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": 38,
496 | "metadata": {},
497 | "outputs": [
498 | {
499 | "data": {
500 | "text/plain": [
501 | "6869"
502 | ]
503 | },
504 | "execution_count": 38,
505 | "metadata": {},
506 | "output_type": "execute_result"
507 | }
508 | ],
509 | "source": [
510 | "len(char2tf)"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": 41,
516 | "metadata": {},
517 | "outputs": [
518 | {
519 | "name": "stdout",
520 | "output_type": "stream",
521 | "text": [
522 | "108 OOV chars in test data\n"
523 | ]
524 | }
525 | ],
526 | "source": [
527 | "# 测试集\n",
528 | "testchar2tf={}\n",
529 | "for i in range(testAData.shape[0]):\n",
530 | " t=testAData.iloc[i][\"text\"].split()\n",
531 | " for k in t:\n",
532 | " if k not in char2tf.keys():\n",
533 | " if k not in testchar2tf.keys():\n",
534 | " testchar2tf[k] = 1\n",
535 | " else:\n",
536 | " testchar2tf[k] += 1\n",
537 | "print(\"{} OOV chars in test data\".format(len(testchar2tf)))"
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": 42,
543 | "metadata": {},
544 | "outputs": [
545 | {
546 | "name": "stdout",
547 | "output_type": "stream",
548 | "text": [
549 | "11 1\n"
550 | ]
551 | }
552 | ],
553 | "source": [
554 | "print(max(testchar2tf.values()), min(testchar2tf.values()))"
555 | ]
556 | },
557 | {
558 | "cell_type": "code",
559 | "execution_count": 45,
560 | "metadata": {},
561 | "outputs": [
562 | {
563 | "data": {
564 | "text/plain": [
565 | "dict_keys(['2967', '6758', '339', '2021', '1854', '3731', '4109', '3792', '4149', '1519', '2058', '3912', '2465', '2410', '1219', '6654', '7539', '264', '2456', '4811', '1292', '2109', '6905', '5520', '7058', '6045', '3634', '6591', '3530', '6508', '7044', '3659', '2073', '3750', '6831', '2614', '3370', '4269', '486', '5770', '4125', '5445', '2466', '3743', '3630', '1726', '2313', '5906', '826', '4516', '657', '900', '1871', '1757', '1939', '648', '2828', '4704', '7039', '3706', '965', '2490', '7399', '2145', '2407', '7451', '3775', '6017', '5998', '1641', '299', '2621', '7029', '3056', '6333', '433', '1667', '1099', '2289', '5780', '220', '1279', '7426', '6631', '3099', '2205', '7305', '2620', '5977', '3329', '1793', '6666', '2042', '3193', '2446', '5399', '4124', '248', '2252', '5649', '2212', '4939', '7239', '3310', '4525', '2400', '4921', '265', '1397', '4699', '1699', '669', '6407', '1271', '4741', '4659', '3870', '4030', '4167', '5338', '25', '3466', '6909', '4417', '1859', '1465', '7194', '3938', '1571', '848', '6986', '827', '2124', '1991', '7444', '7037', '2729', '908', '6308', '1889', '6810', '4190', '591', '5598', '5410', '517', '4128', '1679', '4853', '6902', '6122', '1903', '1736', '3915', '2602', '6822', '6630', '4265', '729', '4448', '1401', '5881', '6182', '2646', '652', '340', '7328', '1320', '1460', '619', '5505', '2376', '3272', '3362', '2810', '803', '1985', '19', '1635', '7377', '7509', '1375', '5393', '5589', '5037', '2115', '4707', '5310', '6811', '6093', '3263', '5971', '4148', '4780', '796', '6038', '4231', '1722', '6485', '1920', '1767', '5915', '6518', '5648', '4280', '7326', '6242', '5328', '1214', '5741', '913', '5950', '3800', '1538', '686', '6734', '1324', '5814', '4998', '5176', '535', '307', '4068', '2121', '7420', '3255', '7123', '4305', '3231', '4822', '473', '5330', '5530', '1394', '7495', '4469', '7467', '5693', '3618', '5264', '1734', '316', '3484', '6043', '1031', '761', '5370', '3782', '2210', '1363', '6301', '3508', 
'1907', '5718', '541', '3038', '6641', '512', '5243', '2112', '2891', '1211', '7257', '7159', '5816', '4202', '5864', '3414', '7327', '7256', '2935', '4936', '2151', '4599', '3227', '3859', '4691', '2029', '1344', '7403', '543', '980', '3605', '5028', '1866', '3578', '1335', '6560', '1913', '6656', '5689', '4411', '4750', '3000', '2975', '1141', '7148', '2570', '1877', '5470', '641', '4333', '5659', '2770', '2047', '7532', '606', '6980', '1750', '7245', '5839', '4321', '5788', '6405', '2107', '1592', '5461', '3324', '4052', '742', '6887', '4819', '3223', '2717', '5612', '5948', '2541', '4409', '3772', '6012', '1833', '5560', '4173', '6662', '414', '6575', '4407', '1241', '600', '6637', '569', '5610', '6917', '7010', '3397', '5708', '1702', '4646', '2663', '885', '5491', '5011', '2662', '6065', '2688', '6678', '4842', '4801', '3154', '544', '62', '5602', '6759', '4464', '1815', '5698', '2364', '5938', '5168', '3770', '368', '7117', '2131', '2192', '6015', '3864', '5681', '3771', '3166', '5736', '5537', '5526', '495', '6386', '1362', '1829', '2304', '5619', '6227', '6515', '4893', '5957', '5949', '5081', '2918', '5051', '5192', '1745', '5278', '5651', '7440', '1656', '3300', '5778', '6235', '910', '1613', '2255', '6352', '4802', '1452', '3137', '26', '6663', '2986', '1746', '3002', '4603', '3220', '1567', '1407', '4403', '2540', '2329', '3762', '6713', '3247', '7543', '656', '1289', '5769', '2556', '5472', '6453', '4350', '7469', '816', '4914', '418', '5542', '1386', '3686', '2741', '6733', '2515', '4796', '314', '3007', '1769', '2415', '5810', '5928', '1308', '4480', '6770', '6469', '1647', '6293', '450', '4558', '2693', '216', '4655', '7255', '6395', '2422', '350', '1057', '5482', '45', '1158', '5105', '6919', '4462', '53', '7045', '6050', '2402', '192', '5677', '5041', '478', '868', '3765', '4628', '1924', '23', '5787', '5139', '4758', '1154', '3945', '6928', '6875', '2522', '1816', '6929', '1060', '5858', '5695', '289', '1737', '7059', '4969', '4440', '6104', 
'5510', '1277', '4063', '3700', '5106', '3481', '7147', '5153', '5397', '2842', '3257', '4985', '7346', '5074', '3747', '1777', '2226', '7354', '6088', '290', '6350', '1580', '134', '1245', '3952', '2049', '5620', '6569', '2233', '1260', '3189', '2667', '7349', '141', '5430', '1514', '5658', '4106', '6768', '3167', '2282', '69', '1587', '6046', '2549', '4909', '7421', '5525', '722', '7219', '5501', '4547', '659', '5605', '893', '3472', '4643', '7381', '2827', '1564', '4036', '5999', '138', '6722', '5511', '7523', '4498', '5801', '4211', '7477', '4203', '4490', '4351', '4261', '2434', '133', '6543', '4559', '383', '4040', '751', '5099', '1205', '2595', '2221', '4718', '5296', '752', '2862', '4458', '137', '920', '465', '6600', '1036', '4430', '3049', '5604', '3545', '3695', '3691', '1698', '2791', '926', '3088', '1999', '6714', '5822', '4392', '3342', '4232', '4967', '6651', '1252', '1078', '642', '7025', '1325', '5889', '6962', '4386', '5273', '2505', '5380', '2331', '2274', '2612', '5594', '144', '4677', '6940', '7212', '607', '7125', '1805', '4650', '6325', '3226', '3966', '4097', '4329', '6521', '2230', '7366', '4298', '5640', '7370', '1952', '6583', '4080', '5791', '3107', '531', '7492', '1629', '7055', '6846', '805', '794', '2732', '5165', '4510', '978', '5151', '3196', '6833', '6966', '4053', '5984', '2749', '2461', '7313', '4866', '4287', '1395', '2396', '4289', '1906', '6040', '5955', '5450', '3103', '1283', '2489', '5566', '1247', '5538', '5917', '2495', '1267', '2435', '1168', '3725', '4046', '445', '57', '948', '211', '3956', '5492', '7006', '6983', '6781', '7394', '296', '3764', '2899', '922', '3651', '2923', '5728', '6798', '4291', '6037', '1951', '1070', '2265', '3433', '5678', '2983', '355', '2799', '7154', '872', '4355', '7373', '5292', '6644', '3203', '5617', '6417', '212', '3500', '1744', '5581', '1582', '2592', '734', '3780', '7033', '6567', '3949', '203', '88', '3117', '5702', '4220', '3495', '408', '4214', '571', '5929', '6740', '5920', '7047', 
'4981', '7507', '5886', '4130', '3568', '6587', '1264', '3613', '671', '5282', '3506', '5850', '3819', '7311', '1080', '5019', '1103', '1936', '4744', '3585', '2211', '536', '1715', '58', '4413', '932', '1914', '974', '5333', '6996', '6551', '2795', '3062', '2023', '1970', '4636', '902', '2444', '7465', '2218', '3212', '5623', '7078', '1215', '6977', '3971', '3070', '2334', '5026', '3860', '1963', '5284', '4450', '3961', '5793', '2127', '5056', '3606', '7251', '5823', '5177', '2267', '4653', '5775', '4505', '3943', '979', '1885', '2491', '6981', '4105', '3233', '6507', '790', '116', '5883', '502', '64', '6193', '1379', '5269', '2354', '4183', '6549', '532', '4993', '304', '3155', '3418', '1855', '623', '2483', '6609', '4486', '4626', '1610', '4482', '955', '578', '3469', '5005', '1695', '3440', '5536', '281', '810', '343', '3809', '1066', '3646', '3055', '4876', '1043', '3994', '7393', '7378', '2674', '4180', '4301', '5776', '3694', '2119', '4216', '3285', '6314', '4553', '462', '4399', '6101', '1633', '5057', '13', '7330', '1351', '1779', '7261', '7408', '7157', '6286', '4166', '4442', '5909', '7361', '2768', '5274', '4582', '1697', '1457', '5396', '1323', '4778', '2586', '3987', '1129', '6725', '1006', '6542', '1844', '2786', '5271', '1018', '847', '3560', '219', '150', '5122', '1899', '6003', '869', '4958', '628', '993', '3317', '6389', '1302', '2936', '4380', '2348', '6250', '1388', '4986', '7186', '3456', '6881', '2990', '4542', '2380', '2847', '1146', '5385', '3661', '6444', '3106', '1132', '2147', '2136', '5205', '100', '7520', '6751', '1605', '5190', '1176', '2626', '5235', '1220', '6832', '3128', '5660', '3327', '1693', '2848', '580', '3034', '2857', '6163', '6107', '2421', '930', '180', '2477', '4731', '6501', '6640', '7111', '6267', '5298', '7191', '43', '6681', '945', '94', '2727', '2087', '5402', '4760', '2506', '3080', '7449', '1946', '1295', '2683', '4017', '2022', '5547', '1916', '736', '4583', '7137', '5486', '3740', '5389', '4928', '6393', '3643', 
'1588', '4223', '6790', '1258', '3283', '3586', '4902', '6289', '2154', '7127', '1731', '6284', '7091', '1684', '5226', '4089', '4412', '2970', '3396', '2648', '7207', '3845', '6160', '5308', '4270', '3523', '4139', '341', '3242', '4499', '3627', '3766', '6973', '4612', '6220', '4576', '4933', '2647', '3335', '2685', '1227', '2538', '4151', '3374', '3893', '2378', '4568', '3366', '2539', '1569', '3209', '6357', '1152', '2610', '6176', '843', '550', '3129', '764', '1960', '5860', '4941', '3479', '6968', '6899', '5474', '6596', '2439', '5095', '2508', '7309', '3745', '4262', '852', '760', '1348', '1334', '7160', '3090', '3811', '6087', '6355', '4114', '3067', '4326', '6248', '5178', '1814', '6004', '5179', '4181', '4924', '2328', '4293', '2597', '3068', '426', '452', '3438', '4144', '3018', '2873', '2367', '3158', '2974', '2638', '4302', '2028', '4562', '5096', '6892', '4590', '1061', '4369', '4396', '485', '4117', '5724', '1934', '2471', '3529', '5882', '4374', '5063', '3641', '1116', '6078', '5166', '1511', '743', '938', '5854', '507', '2708', '886', '3019', '5918', '405', '3130', '4641', '7486', '7363', '6930', '1688', '3654', '29', '1474', '7541', '7387', '2716', '5416', '440', '3309', '3703', '4031', '6143', '4917', '2730', '5803', '3515', '1623', '5436', '4237', '7528', '903', '5899', '4021', '151', '293', '4120', '318', '1121', '3744', '5003', '4504', '470', '5744', '3644', '5217', '4233', '6729', '2762', '3937', '4751', '2555', '197', '983', '4630', '2448', '750', '4118', '4372', '349', '740', '6254', '2196', '4078', '6206', '6063', '7371', '7169', '2776', '3531', '2316', '1973', '2004', '1735', '415', '3824', '2076', '6535', '2835', '5964', '330', '2197', '6283', '6695', '1294', '6888', '2499', '3012', '3724', '6760', '5562', '2315', '36', '4971', '1953', '6835', '6192', '4145', '5723', '3461', '3813', '4544', '2379', '3886', '420', '7142', '7122', '5289', '5305', '7453', '3595', '6675', '3901', '2393', '1327', '4230', '2484', '6602', '5694', '127', '4381', 
'5963', '2035', '4531', '6565', '498', '1706', '442', '2397', '4163', '225', '3170', '6149', '5413', '6375', '2859', '2702', '2429', '1842', '6165', '2169', '3533', '1778', '5573', '5705', '5935', '6985', '4721', '3289', '5036', '4543', '3017', '7008', '936', '2106', '3976', '6014', '3598', '5229', '6890', '4366', '2838', '1919', '3477', '3392', '7206', '6908', '565', '632', '2399', '3577', '3407', '6639', '1255', '986', '7075', '404', '1965', '443', '5497', '4340', '3692', '3620', '1640', '101', '1014', '5381', '4976', '7475', '4205', '2480', '5926', '1590', '2604', '1940', '5988', '5644', '6992', '247', '6319', '2851', '631', '1299', '5936', '3442', '2642', '419', '1987', '2356', '3120', '3501', '4003', '4587', '4923', '6390', '7436', '6689', '5519', '1730', '670', '5466', '6482', '5968', '6920', '5288', '1515', '4683', '2351', '5574', '5647', '6956', '2910', '7452', '7447', '3665', '5247', '7180', '3955', '5006', '5297', '998', '1042', '654', '2728', '245', '3039', '1402', '1389', '6497', '7013', '4961', '3778', '746', '2189', '3608', '6588', '2214', '5663', '18', '5901', '677', '1419', '7490', '5203', '2463', '730', '7395', '768', '425', '4671', '3270', '1620', '5335', '4611', '5184', '244', '3512', '6441', '2420', '2056', '781', '79', '4987', '2998', '3607', '7458', '3134', '3032', '5493', '167', '136', '1536', '7400', '3364', '2229', '5251', '5089', '1550', '6414', '6272', '499', '5521', '6861', '239', '5239', '3818', '6027', '1508', '232', '5799', '14', '6886', '3036', '3030', '584', '7250', '2099', '4690', '5391', '1315', '6178', '960', '4042', '895', '3104', '16', '5970', '6094', '4454', '83', '4709', '2722', '5498', '3248', '5442', '4274', '2699', '957', '3065', '3373', '4509', '3172', '6924', '5208', '566', '3020', '3132', '7104', '5017', '6854', '6834', '6360', '2893', '4934', '2519', '5411', '1976', '7110', '3051', '7531', '1270', '2680', '2558', '3415', '6111', '394', '1879', '889', '7023', '5258', '693', '6324', '2054', '505', '5117', '4786', '5746', 
'6972', '4719', '3504', '4560', '6007', '2279', '710', '1832', '4680', '3198', '3528', '2260', '1110', '1964', '4275', '6221', '811', '4315', '1743', '6894', '5811', '7344', '5459', '3168', '2787', '4766', '1819', '7335', '1622', '6447', '6119', '4094', '6753', '2120', '6423', '4627', '2988', '2273', '5144', '6487', '2581', '6424', '182', '5221', '2453', '5316', '3066', '54', '1615', '149', '4567', '3318', '6223', '3482', '5426', '6864', '5368', '6842', '2437', '6346', '4858', '4686', '6318', '1724', '3868', '6127', '2524', '7272', '3650', '4218', '2695', '3343', '5202', '3464', '856', '3855', '3420', '38', '5395', '1504', '3294', '2993', '1616', '4768', '1008', '6079', '950', '5625', '6613', '2070', '5638', '5555', '5000', '4831', '5361', '337', '7042', '1995', '3689', '354', '7144', '2867', '2655', '4732', '634', '4022', '2511', '78', '3360', '1248', '5085', '2167', '2530', '5688', '6471', '5504', '2024', '4779', '4402', '5480', '6615', '3873', '4906', '6625', '5109', '7513', '4861', '5683', '1459', '1405', '7019', '2582', '3648', '4536', '3615', '6552', '1274', '2756', '1370', '5869', '3377', '4679', '4595', '1148', '4199', '3182', '1718', '4728', '1670', '5322', '366', '5513', '5532', '4375', '4429', '5739', '6720', '4188', '3729', '2005', '482', '5367', '6115', '4614', '2691', '6976', '1902', '6398', '1164', '2596', '1865', '6738', '1425', '3443', '560', '431', '1160', '5422', '832', '4491', '1206', '5684', '4461', '5628', '6799', '763', '6066', '1390', '663', '3299', '4065', '3110', '5129', '2866', '2931', '4784', '2832', '5467', '2098', '1306', '3899', '1314', '3159', '2080', '873', '2576', '5169', '5064', '4720', '4002', '1827', '2248', '2487', '4695', '2425', '7015', '7417', '6898', '7032', '1727', '157', '4829', '6965', '496', '2769', '3015', '2734', '3436', '3266', '6862', '7077', '2528', '7533', '6674', '2987', '1846', '1972', '1890', '5686', '2007', '7329', '1170', '4018', '854', '5234', '3850', '1868', '169', '6404', '4377', '4555', '1166', '6158', 
'4460', '6500', '3503', '317', '6011', '7317', '3312', '5200', '1782', '583', '84', '859', '3368', '2181', '3610', '3563', '3914', '4354', '5484', '3267', '159', '292', '6971', '1480', '7098', '4285', '518', '3073', '2656', '7512', '4775', '7336', '7423', '2968', '3720', '5974', '3054', '1122', '2525', '489', '1242', '5111', '7540', '4122', '7292', '5033', '2011', '2785', '4836', '6256', '2318', '5960', '2669', '4693', '128', '56', '7038', '6621', '6431', '5228', '3093', '98', '4150', '5940', '5004', '6298', '7238', '2411', '2984', '2745', '5277', '660', '2283', '1352', '5369', '817', '2105', '5495', '6236', '4894', '3354', '5997', '3336', '3084', '2738', '1660', '2713', '1184', '3489', '5870', '1100', '1657', '6362', '5631', '7375', '894', '6658', '2818', '6234', '2504', '1753', '4506', '5873', '6994', '1151', '777', '1055', '2771', '1618', '5468', '5194', '2473', '4851', '3910', '5986', '7051', '463', '5621', '554', '5603', '1076', '2198', '3147', '5600', '6959', '3112', '3988', '6748', '6287', '5125', '1171', '1758', '6764', '1510', '1500', '4949', '2594', '1287', '3784', '1891', '4157', '5295', '812', '6252', '3681', '4743', '2942', '6265', '4832', '1690', '5636', '3793', '3215', '2321', '699', '5514', '3148', '456', '4177', '4473', '2919', '4245', '7134', '2369', '3749', '5762', '1073', '5214', '7254', '1361', '6296', '2938', '4988', '5931', '7129', '7479', '4176', '7234', '1304', '3151', '1087', '5872', '1391', '6338', '4974', '5423', '3181', '3517', '853', '1941', '7360', '5753', '5614', '788', '7150', '5374', '6450', '3897', '7548', '5673', '3894', '674', '2976', '881', '2766', '1002', '2140', '7192', '4436', '1081', '4951', '1266', '6504', '2924', '176', '3273', '399', '5922', '2739', '2438', '5540', '6562', '3742', '6167', '384', '1518', '6133', '4597', '645', '530', '3021', '5137', '1107', '6690', '5082', '2345', '6047', '4742', '1725', '4260', '4006', '7310', '1022', '4624', '592', '4334', '1839', '3161', '5847', '2164', '802', '7265', '4250', '3097', 
'3326', '3795', '1232', '5340', '3083', '2307', '6054', '6243', '5512', '5160', '6356', '6337', '1086', '6574', '7028', '1853', '5535', '2814', '3186', '204', '715', '6141', '4323', '3603', '297', '3319', '7203', '1719', '851', '3111', '3890', '1606', '778', '2234', '7525', '5804', '7156', '4070', '3297', '6203', '2914', '5477', '2458', '2566', '6682', '6922', '5690', '5127', '1028', '1174', '1858', '1396', '3905', '4534', '3554', '7501', '6174', '4895', '1956', '6730', '6420', '3042', '4837', '6878', '472', '572', '3576', '5862', '3383', '2709', '1775', '4781', '651', '5517', '1931', '672', '6702', '5941', '4927', '7294', '4378', '1023', '6055', '1892', '7197', '177', '6906', '3287', '4153', '6914', '4715', '7121', '4662', '2247', '4835', '5156', '2999', '3400', '6857', '1870', '1112', '1250', '7546', '5052', '4276', '4191', '2138', '7445', '6142', '4648', '2603', '4874', '7214', '3349', '2030', '3192', '3746', '4313', '2350', '1109', '6830', '1200', '6780', '338', '119', '1818', '4873', '7496', '7515', '7491', '1050', '515', '1579', '266', '2358', '3521', '4431', '6139', '5792', '6474', '6522', '226', '2130', '6754', '7380', '7295', '7362', '1545', '5371', '4512', '5315', '5254', '523', '637', '3271', '7419', '5347', '3705', '3351', '4007', '891', '1268', '6311', '3717', '3096', '7404', '6694', '3951', '2572', '6819', '2879', '6327', '4082', '4225', '3328', '4337', '753', '1516', '6632', '2561', '1565', '732', '3732', '6396', '5933', '2170', '1038', '5076', '6750', '7016', '4623', '1243', '5821', '4989', '6680', '6788', '7213', '3359', '667', '5133', '7448', '413', '5634', '2423', '5149', '2775', '2172', '688', '5824', '2363', '1068', '5785', '1532', '382', '2243', '5136', '2370', '5731', '6366', '4038', '3783', '469', '4478', '4912', '5438', '1760', '2046', '5829', '1631', '4930', '4152', '4119', '879', '2091', '3878', '5773', '7152', '6829', '6493', '2841', '3157', '5529', '4841', '2032', '6671', '7498', '1126', '6511', '1471', '7542', '5118', '3990', '2209', 
'3290', '1958', '4090', '6164', '3474', '5287', '4675', '3113', '3345', '1812', '5475', '60', '6427', '4996', '1901', '4061', '4324', '7483', '4872', '3430', '2394', '4123', '6683', '3040', '3308', '5518', '1411', '3836', '2064', '7225', '6870', '7196', '1556', '1599', '1780', '156', '4723', '1905', '5488', '3053', '6704', '2682', '4108', '6189', '4155', '2069', '6796', '1835', '4267', '2017', '5572', '844', '6326', '4663', '1900', '3152', '6989', '285', '5121', '6472', '2568', '6057', '2502', '970', '4929', '3977', '5876', '2068', '2820', '7000', '1981', '6016', '7069', '5502', '4310', '3679', '1732', '6217', '240', '227', '5613', '6889', '6946', '5979', '2391', '81', '1938', '3941', '2901', '2012', '7544', '5656', '3213', '1127', '2639', '3108', '5047', '2406', '5087', '951', '4745', '5206', '1966', '1542', '7545', '2897', '1783', '6467', '5709', '5406', '5777', '7164', '3246', '3004', '3095', '2190', '3121', '2019', '5034', '2431', '5050', '3238', '6633', '519', '7510', '2567', '5494', '6369', '4353', '5711', '5088', '2563', '378', '17', '4975', '7224', '7062', '1010', '4592', '6490', '2512', '5624', '6303', '7046', '5591', '6151', '234', '2223', '4500', '1915', '3455', '3998', '3626', '252', '3173', '3580', '3923', '4757', '5344', '208', '1792', '1917', '5383', '1840', '4439', '770', '6749', '3934', '1638', '194', '4848', '943', '7096', '6108', '7299', '4584', '1235', '6557', '4970', '2401', '2275', '5293', '6307', '5691', '2330', '5134', '1509', '7493', '1478', '1604', '5281', '3207', '3385', '2067', '4335', '2474', '5242', '4249', '4937', '7410', '1721', '107', '3451', '3144', '782', '1341', '7088', '3982', '4077', '1950', '457', '1191', '5317', '3844', '2060', '7012', '3483', '4995', '1883', '5379', '2673', '733', '6949', '6812', '4700', '6058', '1424', '6391', '2055', '6538', '4356', '3444', '158', '1453', '2374', '6445', '3986', '5861', '5215', '4748', '2514', '6855', '4578', '5233', '4514', '1195', '2972', '112', '1045', '1090', '4736', '4234', '3243', 
'3190', '253', '1595', '1399', '4005', '6897', '6059', '1307', '3127', '6947', '3562', '2863', '6852', '6664', '3044', '758', '1373', '6916', '7188', '4712', '7223', '1603', '2290', '7131', '6238', '6649', '2908', '347', '610', '459', '6510', '6295', '2521', '5715', '2526', '1343', '5420', '5868', '2772', '7183', '5311', '6825', '1111', '1787', '7332', '3978', '332', '3871', '4034', '7422', '1398', '2632', '1888', '757', '3320', '5714', '4782', '7459', '5956', '5884', '6815', '7175', '7241', '679', '2337', '611', '2178', '6800', '6818', '2185', '4777', '3253', '3219', '5894', '361', '2156', '5533', '6817', '5606', '3150', '6229', '6083', '6874', '2231', '4278', '2428', '6262', '3848', '1065', '1350', '2327', '4983', '4169', '8', '5552', '3046', '5107', '2721', '7527', '5902', '676', '7106', '3617', '312', '3079', '6134', '300', '3544', '1749', '711', '327', '3662', '6144', '6582', '7001', '1196', '697', '5213', '449', '4814', '3727', '1671', '850', '6570', '557', '3718', '956', '3716', '6762', '6635', '830', '1502', '6175', '1678', '7391', '646', '564', '3781', '7086', '6610', '6868', '4968', '6491', '6479', '513', '5716', '6802', '3206', '6529', '7341', '6084', '2876', '3918', '3960', '3924', '115', '6882', '7383', '2272', '1367', '7300', '1440', '6430', '165', '5189', '1449', '1977', '7124', '3842', '7097', '6364', '6387', '4767', '2874', '2898', '1431', '5969', '2564', '6895', '4776', '5959', '1733', '4762', '3058', '7499', '3262', '218', '6845', '5687', '2644', '2965', '791', '2562', '152', '6064', '2472', '5092', '181', '4476', '5805', '4304', '5796', '5668', '6067', '5142', '6380', '1838', '5348', '250', '6840', '5360', '4908', '5578', '238', '4444', '411', '277', '3023', '3177', '1072', '5641', '1959', '612', '3059', '3153', '1089', '1492', '4870', '4408', '4539', '4497', '4194', '1297', '2710', '3999', '397', '4545', '5954', '1572', '3052', '3228', '6091', '5496', '7050', '5341', '558', '6512', '4363', '5952', '4258', '4479', '4960', '4878', '4871', '5834', 
'7529', '6135', '228', '6893', '1910', '1326', '3829', '7333', '6636', '5635', '5398', '7163', '4475', '7339', '3917', '230', '4922', '6402', '5632', '3925', '5874', '3711', '4657', '187', '7353', '5903', '3325', '3225', '2985', '1435', '2008', '577', '3657', '3487', '6347', '1189', '3825', '2479', '6718', '429', '254', '5123', '5182', '373', '3416', '1544', '1423', '3631', '5186', '2529', '3005', '2335', '1978', '6197', '2714', '1986', '400', '1339', '533', '3268', '3758', '7222', '7350', '4652', '4556', '3447', '6020', '698', '3851', '939', '7017', '403', '5892', '6233', '6804', '3668', '1619', '5607', '6571', '3126', '500', '2346', '7519', '4575', '6161', '1669', '4466', '2945', '2811', '2079', '2761', '7462', '2266', '1364', '6032', '2202', '1251', '5237', '185', '5697', '739', '326', '7067', '7036', '2496', '6611', '6757', '1128', '1849', '3141', '2188', '6304', '967', '4015', '561', '7210', '3929', '907', '5431', '7103', '2840', '946', '7208', '5815', '1092', '4862', '1645', '5961', '6847', '5907', '4081', '1265', '4257', '878', '4794', '4066', '268', '7414', '367', '372', '3344', '4729', '2220', '4357', '4887', '4651', '7514', '5042', '6900', '4846', '2831', '490', '2812', '2341', '6517', '2310', '3536', '4911', '5887', '2139', '1347', '2979', '7168', '529', '1497', '4059', '2174', '6477', '789', '4012', '4790', '4956', '7084', '3280', '3856', '6166', '3146', '5951', '4640', '4658', '6807', '483', '2531', '3313', '5937', '1696', '7031', '5306', '315', '4722', '195', '6000', '581', '6257', '6513', '901', '5732', '806', '5071', '1273', '1291', '3921', '6171', '125', '2523', '5255', '320', '5382', '1365', '644', '7464', '2366', '1639', '5595', '5469', '5972', '5900', '4405', '2249', '1791', '1328', '5819', '1278', '5650', '4806', '4515', '4423', '3622', '3542', '2852', '3323', '4182', '5212', '962', '1609', '377', '4175', '1529', '2686', '5001', '6660', '4793', '3946', '3425', '6199', '6195', '1811', '6191', '7061', '438', '5674', '981', '6181', '6821', '1755', 
'4761', '1199', '5618', '430', '6036', '838', '1083', '1843', '6454', '7190', '6941', '1540', '5162', '1173', '1360', '927', '6769', '2796', '2299', '1700', '5719', '3805', '964', '171', '590', '1178', '1764', '3957', '4889', '4358', '3614', '4834', '6194', '162', '2144', '3304', '2616', '5720', '61', '2640', '2510', '4864', '2705', '5575', '785', '5120', '2778', '604', '3967', '5286', '5596', '1741', '1044', '7505', '2293', '7534', '447', '1165', '1400', '3457', '5352', '6128', '2937', '3907', '3807', '2740', '2236', '6473', '5417', '2804', '3125', '5429', '198', '5424', '745', '3565', '988', '3391', '5257', '7454', '6109', '2044', '3763', '3690', '6553', '2724', '510', '3232', '2911', '2995', '2631', '2925', '455', '3983', '3200', '5020', '1880', '1124', '4852', '1566', '3428', '4763', '4314', '1221', '7537', '5261', '4717', '3821', '1140', '5797', '5392', '5779', '3435', '4390', '958', '5059', '7181', '35', '4788', '1598', '4299', '2182', '2447', '2470', '7411', '7536', '5806', '1071', '358', '3558', '1822', '3403', '5481', '7074', '7368', '2964', '2513', '5487', '3258', '2615', '1651', '3696', '888', '4035', '6998', '700', '3969', '3693', '959', '5433', '6002', '2689', '1272', '6974', '2288', '3131', '5054', '260', '5506', '22', '2359', '6377', '1332', '2237', '2904', '1428', '4168', '6528', '2000', '2419', '1728', '4982', '4799', '5114', '6099', '940', '52', '3459', '4004', '6809', '5579', '3649', '5752', '7282', '2078', '4856', '1751', '2947', '2227', '2', '7226', '1208', '241', '728', '713', '562', '7406', '323', '1040', '5942', '2707', '3507', '3494', '4266', '4239', '5405', '3216', '6345', '4940', '4596', '246', '4557', '2102', '6335', '3592', '2765', '7312', '4942', '24', '6126', '5210', '4342', '2652', '123', '6506', '2591', '5079', '3677', '5637', '178', '5835', '6429', '1676', '823', '5990', '6211', '2096', '1573', '4254', '1547', '5010', '579', '2930', '7442', '6085', '5592', '2992', '3375', '301', '7109', '2665', '4273', '7216', '804', '468', '7424', 
'6226', '6408', '2382', '3184', '5193', '214', '4437', '3647', '3371', '1182', '283', '6316', '4973', '1527', '99', '2875', '2672', '5223', '1230', '7289', '6074', '4092', '1096', '7043', '416', '3779', '2100', '5174', '6765', '4410', '3682', '3254', '2953', '2917', '4252', '2690', '4979', '5580', '6533', '3653', '5113', '6237', '2257', '2939', '4606', '6279', '4660', '4083', '2956', '3827', '1013', '539', '2278', '6937', '1594', '40', '6436', '3086', '694', '7093', '3047', '3522', '1802', '720', '4055', '233', '67', '4676', '5485', '1856', '864', '1662', '1142', '2132', '196', '1034', '5507', '7470', '6475', '4048', '1691', '6110', '4206', '3411', '3814', '4255', '1256', '5093', '2535', '1384', '5921', '4574', '4565', '1309', '6697', '503', '1410', '5758', '793', '3057', '1694', '145', '3', '4420', '602', '1275', '6232', '994', '6805', '4774', '3194', '2485', '3698', '1212', '5865', '5312', '3553', '7441', '887', '2457', '605', '1482', '2045', '5152', '1188', '3664', '755', '2559', '3869', '2653', '555', '6198', '2894', '1762', '574', '5890', '4093', '1630', '2141', '6240', '1417', '6988', '7478', '1961', '6031', '975', '5471', '370', '5147', '2634', '6418', '1648', '476', '3251', '5680', '102', '2902', '6848', '4843', '1685', '3338', '3493', '7248', '5353', '122', '6838', '295', '6785', '1047', '1067', '2033', '2455', '1794', '6147', '3485', '1522', '4174', '731', '5895', '2950', '3549', '4', '6784', '421', '741', '6752', '7273', '553', '3535', '6162', '1796', '4085', '4235', '3964', '2571', '7139', '3854', '5207', '2955', '3406', '691', '7141', '762', '2809', '4607', '774', '5798', '1928', '6400', '3286', '2031', '5738', '3518', '2103', '2026', '2781', '6096', '4991', '4681', '2240', '1627', '6156', '1409', '5978', '6130', '4773', '2027', '3940', '7285', '2478', '104', '6', '4549', '4963', '1882', '6439', '1093', '3265', '2239', '2250', '5571', '7005', '7535', '2057', '6292', '5561', '6483', '6481', '3965', '4295', '5939', '5856', '453', '2389', '1485', '1909', 
'5995', '2110', '6056', '4992', '6328', '6460', '7100', '3140', '6225', '1357', '7301', '336', '3087', '4229', '4103', '570', '5672', '3468', '2111', '1593', '2163', '1716', '324', '3801', '2339', '417', '7094', '1674', '5362', '7342', '89', '7303', '7202', '7243', '5761', '3847', '3567', '1366', '1406', '5073', '573', '1030', '5366', '3538', '6266', '1056', '912', '4771', '2179', '5840', '4950', '2118', '1990', '1455', '3587', '714', '1801', '3499', '3217', '3625', '6041', '4896', '7116', '2966', '76', '3992', '5039', '2973', '4548', '2958', '193', '1873', '6185', '1943', '5549', '3601', '4096', '622', '5456', '5', '7389', '4279', '727', '6212', '2583', '5692', '4288', '1851', '6264', '1033', '684', '7446', '2527', '6348', '6140', '6268', '6539', '6275', '1763', '395', '7187', '2048', '7258', '1082', '2943', '4382', '4897', '724', '6618', '3011', '5751', '6455', '2636', '3340', '1444', '2929', '3115', '1535', '3426', '5546', '4880', '5376', '3788', '4493', '5259', '7236', '5993', '643', '2575', '1826', '540', '2503', '4638', '2601', '3721', '5548', '1433', '2940', '7118', '841', '5040', '2817', '5400', '2552', '7227', '896', '6622', '1416', '4826', '5664', '6700', '282', '545', '1175', '72', '4359', '2834', '1180', '7494', '4875', '1893', '5567', '2149', '2395', '4716', '6679', '1005', '1666', '4481', '7162', '2892', '4817', '4494', '4642', '563', '4610', '2915', '3022', '682', '5372', '6329', '4903', '2679', '2155', '6379', '4300', '779', '1179', '1848', '3759', '5545', '1441', '6259', '1486', '1984', '6035', '5703', '780', '4025', '4696', '2232', '749', '4615', '2659', '3936', '4240', '2426', '4667', '2168', '6696', '707', '2468', '5325', '944', '5981', '4185', '6116', '4689', '3984', '7115', '4899', '7306', '3541', '6023', '6782', '3419', '2927', '222', '681', '3346', '673', '5154', '5115', '267', '235', '5947', '1475', '106', '3277', '360', '4419', '3537', '4668', '3680', '1234', '797', '3900', '835', '528', '4416', '4746', '1161', '3776', '6152', '2294', 
'795', '3078', '5094', '2037', '6244', '6486', '647', '708', '7', '709', '1102', '4572', '2166', '6406', '4172', '7176', '3583', '2869', '7275', '5476', '110', '3866', '1383', '375', '31', '4639', '7119', '2469', '968', '6849', '7522', '6358', '5932', '4649', '6462', '2297', '4824', '3081', '302', '704', '2413', '3804', '4521', '2946', '3045', '1163', '2752', '5836', '5813', '5067', '5992', '5102', '5831', '2952', '3974', '168', '2285', '7185', '147', '2254', '2905', '963', '59', '1186', '4759', '3728', '5754', '3756', '7291', '2228', '949', '2083', '692', '7288', '261', '7149', '6461', '2117', '2676', '2798', '1105', '4395', '385', '201', '3833', '5772', '3570', '5097', '7277', '6385', '1878', '5246', '3048', '3109', '6932', '4789', '3903', '6918', '386', '4821', '3014', '504', '1894', '3612', '4164', '428', '3412', '1575', '4850', '243', '5256', '2113', '3754', '2864', '2598', '4918', '6213', '3519', '5018', '2194', '7179', '6957', '5418', '1948', '3669', '875', '3075', '6334', '7365', '2162', '1563', '6843', '3449', '6584', '5685', '6113', '4451', '6214', '7215', '5825', '5404', '5696', '6563', '7170', '1636', '174', '3710', '3959', '4050', '4916', '1285', '5556', '7318', '6435', '5452', '3794', '1358', '2507', '82', '4797', '4739', '1157', '4702', '2408', '1020', '706', '1001', '815', '1654', '7022', '1713', '7177', '6593', '6667', '143', '3670', '3557', '2006', '3547', '3708', '589', '1614', '217', '4215', '4297', '3390', '3314', '5727', '6931', '7189', '5262', '4362', '4383', '2606', '5563', '5885', '1923', '5733', '2268', '1236', '3249', '4263', '6850', '6684', '3667', '5008', '5722', '6545', '5451', '2994', '257', '1204', '207', '2607', '5679', '2692', '621', '5851', '5735', '4522', '6594', '1451', '6858', '6370', '3348', '3683', '2253', '858', '3063', '3797', '4360', '2949', '3962', '3520', '2856', '4849', '5608', '7538', '6062', '3973', '501', '209', '2123', '3006', '3947', '1525', '1908', '2404', '2336', '1686', '2129', '346', '303', '2706', '6904', 
'2854', '3714', '3963', '5774', '4178', '2822', '2089', '2409', '4898', '1313', '4159', '3408', '95', '3767', '1353', '2849', '4062', '4110', '3676', '1754', '93', '3723', '3812', '5601', '1378', '479', '4467', '6839', '1368', '2459', '3808', '1037', '7034', '2944', '5191', '1643', '1752', '4014', '5201', '678', '3138', '3889', '1414', '6771', '6691', '3751', '4133', '5527', '7230', '6921', '7060', '525', '2611', '4311', '2412', '3363', '6353', '5848', '4477', '5280', '1461', '5171', '2718', '6601', '5183', '3931', '3496', '7461', '3439', '620', '164', '1861', '5195', '5155', '1650', '6646', '4580', '7220', '5128', '2703', '3379', '6153', '7345', '2053', '6775', '1543', '286', '6756', '4882', '1831', '2789', '6828', '6958', '4060', '2547', '6866', '1506', '2126', '288', '725', '2418', '5622', '4618', '653', '7120', '1979', '4825', '205', '1962', '3003', '1781', '3165', '4309', '3282', '7174', '5314', '4242', '2206', '7432', '2180', '1190', '1225', '3628', '7297', '3588', '911', '2800', '2381', '2039', '6309', '5086', '3611', '2758', '3655', '6954', '4954', '630', '4879', '6323', '1432', '6608', '2175', '329', '1466', '6727', '2657', '3139', '4730', '863', '3275', '1012', '6499', '5022', '3376', '3306', '916', '6313', '839', '2065', '6960', '5478', '432', '1608', '1682', '6440', '1570', '3234', '1864', '1637', '2284', '5818', '3178', '402', '2697', '3550', '1761', '776', '7526', '251', '3156', '6766', '3933', '7409', '1203', '1298', '999', '1491', '2855', '4492', '5967', '1224', '783', '4517', '1213', '929', '6883', '3785', '1817', '5919', '3673', '7228', '7171', '6534', '3136', '4027', '435', '4809', '4433', '6661', '1498', '3596', '3013', '798', '1084', '2805', '4328', '754', '4735', '2885', '6939', '6776', '1201', '6716', '1874', '1436', '996', '2625', '2989', '2951', '3602', '1442', '2565', '359', '1642', '6793', '6184', '3303', '4189', '7040', '7130', '6492', '1867', '3769', '1041', '668', '1296', '4348', '4171', '3008', '7508', '1823', '4990', '2183', '582', 
'3671', '7516', '1661', '2377', '2440', '4604', '2629', '5013', '799', '5652', '5924', '3733', '6877', '7138', '5219', '3722', '4513', '4243', '3876', '3591', '2160', '2753', '5962', '5407', '5465', '5807', '2996', '5016', '4000', '7252', '1810', '5808', '42', '5781', '4966', '4156', '5138', '6249', '7184', '407', '5944', '3526', '175', '1000', '3208', '3975', '6001', '7473', '4635', '7237', '4165', '4428', '6556', '2187', '5783', '1004', '1533', '5817', '3991', '2038', '3462', '7048', '5443', '6209', '3492', '1974', '3911', '2569', '3160', '4935', '3315', '1456', '3311', '4198', '6013', '6033', '3534', '2701', '6531', '3604', '7281', '6131', '4051', '4854', '6188', '1481', '6124', '2476', '6294', '1496', '1742', '6576', '506', '103', '3264', '6590', '70', '3445', '6755', '3372', '4798', '6896', '4865', '6277', '1458', '6392', '6744', '117', '3188', '6200', '2492', '2432', '4221', '3551', '7488', '6787', '1912', '1576', '5657', '5249', '3010', '3486', '2025', '1828', '7133', '6480', '3906', '6912', '3142', '3183', '721', '4737', '434', '6470', '4502', '4674', '559', '4179', '2843', '4938', '2493', '4421', '2258', '568', '3735', '6616', '4074', '1254', '4944', '7418', '5349', '1967', '4010', '1503', '4581', '925', '1664', '1983', '1847', '2750', '6634', '5421', '2326', '1935', '1095', '6239', '255', '4016', '5669', '75', '2850', '1489', '2635', '5415', '675', '3572', '3707', '1053', '6520', '1046', '5444', '4957', '2195', '1293', '6686', '7405', '6786', '2712', '3427', '1634', '586', '118', '613', '6219', '953', '124', '2963', '3882', '2922', '1336', '3863', '6026', '551', '6999', '5053', '2823', '6443', '1172', '2291', '837', '1601', '1149', '2003', '2486', '773', '3331', '1887', '5175', '4142', '6580', '1921', '1549', '2537', '6544', '952', '6488', '6761', '4733', '1413', '1321', '5197', '2819', '4079', '5355', '1862', '5409', '4749', '933', '509', '2754', '213', '365', '7334', '5108', '6383', '2550', '2664', '6624', '1993', '6746', '131', '369', '4456', '3935', 
'3688', '2871', '424', '7101', '1337', '5855', '1162', '5319', '4884', '3204', '3666', '5994', '4997', '4885', '5035', '6090', '6476', '1079', '3505', '2355', '6409', '7262', '1596', '5090', '7209', '7286', '7076', '5991', '6851', '3697', '398', '2613', '6103', '1290', '189', '48', '2806', '4907', '4361', '599', '941', '1108', '5551', '4401', '6712', '5987', '934', '6628', '7146', '4196', '5544', '1387', '1689', '7081', '298', '3954', '6155', '6595', '4343', '7364', '2219', '357', '4984', '4364', '231', '4507', '1954', '1658', '1130', '6984', '451', '4529', '7182', '5337', '6659', '5457', '2043', '461', '2224', '615', '2449', '2836', '5077', '5032', '2460', '685', '1860', '3417', '3660', '1720', '695', '7304', '121', '20', '1445', '4026', '4452', '1852', '2320', '3757', '4449', '6247', '242', '5898', '6190', '4397', '5199', '2982', '5276', '4962', '976', '2816', '3922', '2259', '3025', '2016', '6677', '918', '2623', '3571', '2668', '4999', '3393', '3394', '2794', '5786', '7244', '6208', '2978', '4008', '4585', '5270', '2546', '1377', '7517', '1546', '7315', '821', '50', '1710', '5713', '1809', '6502', '474', '1703', '2292', '190', '4033', '6121', '188', '1404', '1628', '6371', '2088', '6049', '6112', '921', '6022', '3016', '3803', '2534', '3332', '2081', '6685', '6708', '6867', '4915', '114', '6231', '855', '4445', '3334', '2443', '2780', '2262', '3561', '6614', '6365', '3341', '6030', '4415', '1768', '4724', '5500', '2961', '972', '5845', '191', '4619', '7014', '4654', '4694', '1467', '690', '2957', '6372', '4312', '1672', '6100', '2878', '6907', '617', '3124', '3071', '6955', '5304', '1528', '6081', '1032', '3802', '2826', '7114', '6019', '6705', '831', '6604', '7200', '6341', '2352', '4368', '7066', '34', '5131', '139', '7412', '4752', '3218', '2700', '5299', '6901', '6299', '7151', '4158', '4644', '4830', '6331', '1097', '4678', '6204', '6463', '3316', '3806', '5853', '3944', '1240', '3865', '4844', '319', '1222', '6255', '2360', '3843', '575', '1059', '1578', 
'4608', '2755', '3239', '3237', '5021', '6741', '6647', '2281', '6210', '5388', '4550', '2241', '5740', '3736', '3240', '1799', '6415', '5534', '1673', '2036', '4994', '1568', '1925', '1426', '5387', '898', '1177', '6388', '3566', '4625', '1209', '1789', '4453', '587', '1555', '7416', '1218', '1185', '6177', '4131', '2637', '4422', '5666', '6150', '6527', '5615', '4349', '548', '2731', '1517', '5725', '3822', '6791', '2462', '744', '170', '2830', '4901', '2417', '5958', '4943', '6359', '2347', '4487', '635', '987', '2014', '4134', '4341', '4783', '1216', '4785', '460', '665', '4919', '4812', '2737', '6397', '5350', '1881', '6042', '3135', '3061', '321', '1770', '1147', '4705', '3450', '1490', '909', '2735', '3875', '2609', '202', '696', '6961', '4138', '6368', '4045', '6503', '1009', '1922', '2773', '917', '813', '640', '393', '4485', '1501', '5241', '7413', '3838', '3413', '2441', '588', '1075', '4200', '5742', '4009', '448', '4102', '5263', '6381', '5432', '4088', '1863', '2063', '2675', '6561', '2797', '173', '2398', '5250', '5965', '1652', '310', '7530', '6987', '4137', '74', '2720', '1257', '6082', '2497', '15', '1281', '5062', '4424', '6568', '7217', '1625', '7382', '6154', '1804', '6496', '7267', '1632', '1944', '3490', '6558', '2482', '6159', '1159', '5927', '4905', '6836', '1244', '1246', '436', '2802', '2340', '5867', '1904', '2980', '3593', '2641', '7089', '521', '28', '3857', '1131', '7080', '32', '2751', '1434', '6820', '3624', '3409', '6216', '4533', '866', '2912', '1711', '845', '7274', '5585', '4327', '6950', '2520', '390', '786', '3347', '833', '4338', '6102', '4527', '1493', '2608', '6458', '10', '4904', '3791', '6105', '7323', '215', '7049', '7500', '3453', '4121', '7063', '2311', '2860', '5584', '3543', '7247', '6879', '3525', '3072', '7476', '4633', '3350', '1857', '4579', '6572', '3448', '6564', '4932', '5103', '4528', '5852', '6039', '1559', '6446', '1495', '3828', '4489', '862', '2152', '4389', '3211', '2733', '6546', '305', '5265', '5983', 
'3573', '5425', '4107', '1305', '5238', '2868', '6129', '146', '5784', '1342', '3423', '3968', '4602', '7201', '1505', '3381', '5586', '6726', '2890', '6523', '1359', '2430', '3452', '392', '3926', '3840', '4186', '6401', '1898', '5058', '6872', '961', '4708', '2150', '1303', '6990', '748', '6547', '2390', '7211', '2921', '6739', '2649', '2587', '5342', '5896', '4041', '966', '2276', '1319', '3919', '1707', '1557', '1541', '3817', '2599', '1135', '5541', '3260', '3858', '7158', '3288', '344', '6138', '1677', '7054', '3748', '6876', '7390', '6942', '1421', '5905', '3958', '2200', '3581', '2199', '2694', '294', '3867', '2670', '3582', '3388', '4765', '4286', '4086', '1581', '6993', '3908', '4001', '4926', '4978', '7064', '5539', '4011', '6844', '6806', '4601', '1830', '65', '2405', '2332', '6373', '2368', '882', '4091', '2774', '594', '6808', '2464', '2314', '1507', '2114', '6880', '4532', '308', '3753', '3179', '3105', '1748', '924', '6300', '477', '7379', '4815', '492', '6824', '6351', '1484', '3001', '1558', '7502', '1427', '2146', '5528', '6075', '6827', '1683', '4530', '1088', '3789', '5080', '609', '1318', '822', '7356', '97', '389', '3399', '6721', '2837', '5996', '1340', '2261', '7087', '1872', '6215', '6434', '4714', '5646', '2907', '1007', '5553', '3470', '2821', '3834', '2886', '1422', '5857', '6422', '4666', '3930', '5771', '4888', '1192', '6028', '5609', '7302', '3846', '3589', '5253', '4621', '3702', '1714', '6599', '5309', '3205', '6451', '2344', '3321', '1499', '2577', '51', '1464', '3786', '5667', '735', '1118', '3291', '3369', '6526', '765', '3849', '2954', '3645', '7472', '2928', '5717', '4344', '5832', '5279', '3760', '236', '1845', '6703', '1443', '3229', '969', '5294', '2573', '5893', '6952', '6330', '5503', '2128', '2322', '4827', '6505', '1450', '7143', '3734', '4071', '4306', '5653', '6269', '3970', '2895', '4384', '3637', '800', '5661', '5343', '4292', '4352', '3089', '2906', '5048', '2627', '5943', '4098', '7266', '6577', '1039', '1187', 
'4869', '1155', '1113', '5587', '1933', '0', '5642', '6612', '7007', '2933', '6024', '905', '4371', '3761', '4756', '4629', '5812', '7068', '3741', '3879', '4588', '3454', '7398', '1537', '5266', '6653', '256', '4828', '6411', '4972', '1646', '2557', '4948', '6421', '7369', '4538', '6514', '6865', '1837', '5706', '3041', '973', '4064', '4913', '5645', '5373', '6736', '1930', '1237', '1886', '5569', '3995', '3826', '4569', '2588', '4552', '4474', '2361', '1197', '5225', '4073', '1074', '4204', '1975', '3509', '5301', '2605', '5524', '3382', '2333', '4438', '4347', '6963', '5434', '2309', '5820', '3633', '534', '7140', '7135', '655', '2793', '4632', '1382', '4855', '291', '1996', '345', '4370', '2204', '5755', '2225', '4365', '2319', '4682', '6495', '108', '5707', '7004', '142', '3774', '4457', '7351', '5675', '5414', '5643', '27', '6498', '4069', '5170', '5946', '6586', '2451', '6218', '3082', '7347', '2481', '6071', '4573', '4740', '6969', '410', '5973', '4687', '1249', '3024', '7352', '44', '5830', '6554', '5365', '4501', '6516', '846', '2238', '3367', '4154', '7041', '1183', '2678', '7358', '658', '4980', '597', '601', '2579', '1253', '4511', '1469', '3077', '914', '3035', '471', '4084', '4868', '4332', '135', '627', '1795', '5140', '7056', '3527', '2880', '6657', '1597', '5112', '1403', '3511', '3642', '5626', '4920', '3398', '2034', '4253', '716', '3546', '5236', '6276', '2624', '2887', '6936', '567', '1551', '3629', '7030', '5245', '4111', '6873', '6426', '262', '374', '5359', '5313', '6261', '6251', '1202', '2372', '2143', '1663', '237', '4800', '705', '6222', '661', '5749', '6097', '3175', '4385', '5303', '2171', '5148', '3895', '1772', '7260', '3874', '475', '5044', '6676', '4049', '4637', '1988', '6555', '1115', '5384', '7235', '5378', '2215', '2948', '353', '2590', '5163', '6048', '4860', '1310', '5198', '6970', '6797', '4318', '1123', '2158', '5364', '3353', '5173', '1554', '4524', '5794', '271', '6117', '6336', '4600', '2536', '2909', '935', '6456', 
'7132', '5318', '4099', '1269', '6246', '4591', '4072', '3118', '7397', '664', '3029', '1488', '7343', '6367', '5015', '1472', '7268', '4520', '6826', '5483', '7307', '388', '5143', '5002', '6412', '6789', '7072', '6278', '3559', '49', '1740', '890', '5075', '2532', '3421', '7242', '90', '3663', '7482', '5453', '992', '3296', '2436', '1346', '7205', '1420', '5564', '6494', '4387', '1261', '2059', '6711', '1408', '3098', '576', '1300', '4713', '3597', '2040', '3471', '3091', '6687', '7107', '5846', '2424', '7439', '524', '5878', '2134', '1836', '2002', '2176', '3357', '2618', '5272', '3584', '5682', '3656', '6607', '6519', '7503', '928', '4019', '717', '1776', '4195', '5756', '3556', '4710', '3927', '3122', '2962', '2142', '1600', '4953', '4129', '4376', '200', '3269', '2589', '2263', '7193', '5024', '2186', '1487', '371', '5356', '6550', '1153', '4807', '86', '4425', '6442', '7165', '1439', '4881', '784', '2050', '2245', '2373', '11', '3116', '6814', '6944', '5757', '6603', '5427', '2280', '3480', '4222', '68', '120', '6801', '6891', '608', '4634', '7229', '5891', '5327', '1712', '6315', '4427', '3638', '7487', '2015', '4839', '7511', '3378', '4685', '870', '6652', '1194', '259', '5187', '6933', '638', '834', '1322', '2977', '5244', '3841', '6464', '718', '6923', '77', '7024', '5049', '4290', '6172', '1584', '322', '206', '441', '1119', '7348', '6438', '5844', '1329', '689', '5795', '6270', '4706', '3037', '2585', '4100', '6592', '4701', '348', '1989', '423', '3632', '6717', '860', '4803', '6098', '7463', '3799', '160', '2542', '771', '2452', '769', '5700', '552', '4488', '6813', '5577', '6525', '2744', '3904', '2725', '2357', '6935', '6871', '4964', '5386', '6524', '311', '7466', '494', '4857', '6312', '4320', '3514', '7276', '4264', '2553', '6773', '2085', '4087', '2882', '7481', '1098', '1052', '4813', '6263', '4394', '467', '6975', '1876', '7253', '3386', '6384', '7438', '3261', '6403', '3100', '4101', '2746', '4426', '66', '2302', '5230', '5358', '7246', 
'5745', '5523', '5268', '3985', '6468', '2433', '6410', '1786', '5875', '2251', '4160', '5146', '376', '4336', '3712', '5437', '4393', '7484', '649', '3877', '4146', '1167', '6803', '4818', '4747', '269', '5031', '5290', '5222', '5849', '4551', '484', '33', '595', '2018', '5670', '7259', '2981', '4406', '3726', '1895', '6910', '3069', '7372', '9', '2932', '5760', '3579', '6743', '7298', '2122', '5490', '1530', '4136', '3060', '4804', '2711', '792', '5116', '6044', '6207', '4605', '876', '1937', '2872', '6428', '6540', '5463', '2651', '1797', '111', '1330', '2900', '3942', '3575', '4227', '3465', '2384', '7166', '4398', '6698', '5172', '4339', '2013', '6778', '1644', '787', '4013', '1808', '6532', '4414', '2108', '7433', '1312', '766', '3619', '2767', '825', '1229', '3410', '1049', '1875', '625', '865', '3773', '5671', '904', '2742', '5252', '1125', '4959', '807', '221', '4141', '2997', '1430', '5412', '7549', '7232', '1354', '598', '5712', '5458', '2643', '2051', '4770', '6859', '1955', '3050', '6344', '6230', '1765', '1446', '6619', '3701', '2881', '4965', '4039', '4772', '6291', '2298', '6180', '6437', '3279', '5768', '1709', '4734', '2593', '4523', '1583', '6742', '4754', '2305', '3820', '4823', '4656', '3180', '1834', '153', '4056', '2719', '1773', '5863', '3085', '1376', '5730', '3446', '6623', '1479', '6205', '3339', '738', '497', '6076', '2353', '3950', '480', '6605', '5759', '4277', '7018', '4791', '5710', '4162', '7388', '616', '2092', '2960', '2084', '3119', '6343', '5676', '7233', '985', '3174', '6363', '4838', '5701', '6339', '6114', '5976', '488', '2645', '2386', '5916', '4890', '3861', '7376', '2861', '5323', '41', '7367', '4247', '3431', '2137', '5629', '5729', '6745', '2554', '5275', '6672', '2916', '1945', '1003', '7279', '6665', '6642', '7337', '1611', '7415', '3199', '3639', '947', '5599', '5038', '880', '3684', '6320', '2338', '618', '3768', '1824', '5045', '1349', '39', '5030', '5110', '3195', '3123', '3881', '3365', '3555', '37', '129', 
'5734', '801', '2343', '1617', '7518', '7271', '537', '4047', '210', '5232', '1927', '7284', '5633', '6021', '867', '7278', '840', '6068', '984', '2815', '522', '5354', '3467', '7287', '650', '2062', '3384', '7308', '96', '4140', '2093', '6548', '1896', '1136', '6860', '849', '6853', '3830', '3102', '1133', '229', '2159', '5164', '2383', '7316', '4840', '4684', '4631', '2375', '828', '6186', '6484', '7172', '1687', '3302', '819', '3432', '3675', '7270', '5180', '4115', '1226', '1825', '30', '7092', '3831', '6927', '2177', '6589', '71', '6260', '6774', '5764', '5390', '1766', '4112', '3552', '7145', '3730', '4571', '6489', '3913', '4495', '6863', '6648', '2628', '2518', '2009', '5267', '2488', '2516', '5329', '7402', '931', '5565', '5953', '351', '7407', '2888', '5654', '1169', '4331', '5167', '2101', '4210', '1788', '1591', '437', '1282', '1659', '3475', '7338', '5911', '5454', '4058', '4688', '92', '4251', '5240', '6282', '626', '3953', '5248', '3839', '3293', '273', '2403', '1393', '3993', '5879', '5782', '629', '759', '4319', '2301', '199', '2066', '4726', '1675', '2548', '3621', '3539', '1692', '526', '6953', '7429', '109', '1820', '5747', '6077', '6620', '1372', '1798', '1024', '6183', '5766', '5196', '5516', '4201', '6938', '3473', '5141', '2148', '6432', '274', '4388', '6201', '5809', '5888', '4345', '3752', '2600', '1784', '6465', '2671', '3928', '4764', '1701', '1957', '6086', '7474', '1138', '4283', '5161', '6136', '3171', '1', '3222', '861', '2467', '1114', '2001', '6010', '3853', '5446', '1369', '6967', '4067', '5065', '680', '4054', '335', '3738', '4244', '6053', '6670', '6069', '5568', '2287', '46', '1392', '6978', '2077', '3380', '1035', '1771', '7178', '3635', '3389', '6317', '5934', '7437', '4883', '55', '2715', '6945', '5012', '163', '412', '1708', '4316', '5904', '6120', '6951', '6699', '3145', '3892', '7431', '5665', '279', '3033', '1561', '1094', '4303', '4563', '6072', '5069', '3599', '7331', '362', '7450', '396', '272', '4113', '5324', 
'6709', '1233', '7319', '5611', '2677', '1064', '5060', '3979', '4246', '4170', '5841', '6466', '6302', '21', '2269', '7002', '4075', '633', '5307', '4224', '2969', '6948', '2388', '5072', '2544', '4043', '4947', '1473', '2387', '2833', '3358', '3891', '5630', '3989', '3672', '6034', '2264', '4076', '6241', '2303', '7249', '4161', '493', '4616', '2759', '3463', '508', '2813', '1418', '4537', '1841', '6823', '3191', '6145', '6061', '1723', '6449', '6376', '5394', '814', '6132', '4900', '6763', '6170', '6816', '842', '2213', '3402', '3798', '5357', '7321', '6747', '1911', '1649', '4044', '3074', '5930', '224', '511', '5023', '4613', '6719', '3197', '5157', '1288', '4503', '520', '5597', '636', '5912', '2041', '1463', '5508', '5462', '7430', '6196', '6148', '1316', '4284', '2173', '5419', '63', '6354', '3405', '1982', '4863', '7218', '4540', '130', '5800', '2362', '4808', '3244', '1385', '683', '7263', '624', '4577', '3640', '4886', '2654', '5866', '3026', '5531', '3687', '4664', '6394', '2779', '2896', '5449', '5014', '1992', '3281', '3322', '906', '4208', '3164', '7020', '6732', '2574', '4248', '6095', '3441', '3600', '7204', '6006', '5104', '2630', '2324', '3230', '1019', '1156', '2317', '309', '2082', '4465', '6179', '5224', '2696', '1520', '3437', '4727', '7320', '4564', '1800', '7264', '6169', '3434', '5320', '4132', '4024', '1429', '5763', '7443', '7386', '179', '4446', '1238', '2858', '3031', '1560', '2414', '3352', '5737', '5464', '5158', '1968', '3163', '1942', '7280', '3898', '2617', '6915', '4187', '1454', '2157', '363', '4594', '2165', '5557', '6728', '1104', '772', '4593', '4769', '284', '4670', '6137', '7003', '6029', '877', '2747', '2619', '4472', '1589', '6566', '1437', '3564', '7401', '2845', '3932', '3574', '3301', '5126', '4554', '4977', '2517', '4192', '3387', '4535', '2075', '4945', '3256', '1091', '3513', '1655', '4463', '2095', '1331', '2116', '1552', '4212', '1069', '5985', '1521', '6413', '542', '2442', '3880', '2760', '391', '4711', '1381', 
'1106', '1621', '5260', '2074', '3027', '2246', '3810', '6585', '7290', '2020', '2807', '275', '6310', '6425', '3337', '702', '4672', '6253', '3699', '47', '6841', '4127', '7471', '5828', '7355', '6767', '4787', '4810', '1607', '3996', '6598', '4400', '3401', '1665', '549', '356', '2216', '3498', '4238', '2684', '3305', '4296', '3888', '3460', '5029', '4379', '6080', '5570', '6051', '2325', '7085', '7052', '4443', '3333', '5204', '4570', '7325', '7009', '5447', '3187', '3250', '1790', '1021', '328', '186', '3678', '6925', '1574', '5748', '7269', '1228', '1259', '5455', '306', '5550', '1585', '4028', '1539', '7521', '1150', '2135', '5923', '6645', '3884', '7456', '3885', '5216', '2193', '7460', '7053', '1534', '1476', '3356', '331', '2201', '1738', '5285', '5509', '1918', '7113', '3685', '6979', '5439', '7136', '6332', '5283', '6146', '4645', '161', '1143', '3872', '666', '2385', '5078', '7504', '593', '2277', '6378', '3909', '2153', '4147', '1280', '2533', '5833', '5027', '2306', '719', '6273', '3092', '989', '1477', '3101', '726', '3497', '7374', '3259', '3201', '5135', '1447', '5982', '2883', '4325', '2971', '6997', '4346', '401', '6416', '7070', '1969', '4435', '87', '4432', '6349', '1134', '2323', '5132', '4404', '148', '4228', '454', '1850', '2660', '4725', '379', '990', '3224', '3114', '2086', '6274', '3292', '5068', '6943', '808', '1207', '1624', '5218', '6926', '4447', '2681', '223', '2704', '997', '1971', '3790', '4518', '5838', '6399', '5408', '3491', '444', '2666', '4367', '4226', '1054', '7435', '687', '7322', '3274', '7384', '5326', '183', '723', '7090', '2445', '7195', '4546', '4322', '6723', '4820', '5843', '1026', '446', '3252', '7314', '4816', '2295', '991', '5100', '703', '3210', '2824', '4669', '4470', '333', '2959', '775', '3658', '4795', '6478', '1333', '2790', '4673', '7035', '836', '1438', '6792', '829', '5061', '4135', '4256', '6911', '6280', '1926', '4271', '2010', '6794', '155', '1415', '1015', '7427', '4468', '7428', '3133', '6597', 
'3245', '4665'])"
566 | ]
567 | },
568 | "execution_count": 45,
569 | "metadata": {},
570 | "output_type": "execute_result"
571 | }
572 | ],
573 | "source": [
574 | "char2tf.keys()"
575 | ]
576 | },
577 | {
578 | "cell_type": "code",
579 | "execution_count": 44,
580 | "metadata": {},
581 | "outputs": [
582 | {
583 | "data": {
584 | "text/plain": [
585 | "dict_keys(['2877', '1027', '5009', '1612', '942', '113', '1774', '5302', '4833', '5070', '381', '172', '2723', '6452', '2349', '7112', '4698', '2416', '3609', '1523', '6224', '4891', '3674', '2312', '4116', '4281', '409', '7293', '458', '1548', '3704', '73', '278', '7231', '6629', '4661', '6710', '2650', '5346', '7173', '6626', '7489', '3532', '995', '2242', '538', '1513', '3852', '6724', '3502', '5559', '2889', '5859', '3524', '737', '1729', '6306', '923', '3548', '4566', '2801', '3981', '491', '4259', '4391', '3516', '874', '2371', '2256', '2094', '7524', '7102', '5871', '3185', '6168', '2342', '3594', '2270', '937', '5124', '6884', '3777', '1301', '701', '4020', '4622', '85', '1668', '5185', '5083', '2509', '1602', '3162', '516', '6655', '1338', '7071', '1577', '585', '3149', '2235', '2844', '1371', '3429', '4308', '1085', '5699', '5321'])"
586 | ]
587 | },
588 | "execution_count": 44,
589 | "metadata": {},
590 | "output_type": "execute_result"
591 | }
592 | ],
593 | "source": [
594 | "testchar2tf.keys()"
595 | ]
596 | },
597 | {
598 | "cell_type": "code",
599 | "execution_count": 47,
600 | "metadata": {},
601 | "outputs": [
602 | {
603 | "data": {
604 | "text/plain": [
605 | "7549"
606 | ]
607 | },
608 | "execution_count": 47,
609 | "metadata": {},
610 | "output_type": "execute_result"
611 | }
612 | ],
613 | "source": [
614 | "max([int(i) for i in char2tf.keys()])"
615 | ]
616 | },
617 | {
618 | "cell_type": "code",
619 | "execution_count": 48,
620 | "metadata": {},
621 | "outputs": [
622 | {
623 | "data": {
624 | "text/plain": [
625 | "7524"
626 | ]
627 | },
628 | "execution_count": 48,
629 | "metadata": {},
630 | "output_type": "execute_result"
631 | }
632 | ],
633 | "source": [
634 | "max([int(i) for i in testchar2tf.keys()])"
635 | ]
636 | },
637 | {
638 | "cell_type": "code",
639 | "execution_count": null,
640 | "metadata": {},
641 | "outputs": [],
642 | "source": []
643 | }
644 | ],
645 | "metadata": {
646 | "kernelspec": {
647 | "display_name": "Python 3",
648 | "language": "python",
649 | "name": "python3"
650 | },
651 | "language_info": {
652 | "codemirror_mode": {
653 | "name": "ipython",
654 | "version": 3
655 | },
656 | "file_extension": ".py",
657 | "mimetype": "text/x-python",
658 | "name": "python",
659 | "nbconvert_exporter": "python",
660 | "pygments_lexer": "ipython3",
661 | "version": "3.7.6"
662 | }
663 | },
664 | "nbformat": 4,
665 | "nbformat_minor": 4
666 | }
667 |
--------------------------------------------------------------------------------