├── .DS_Store ├── .gitignore ├── bin ├── bin_0_batch.log ├── check.sh ├── check_mini.sh ├── clean.sh ├── compare.sh ├── del.sh ├── deploy.sh ├── deploy1.sh ├── deploy38.sh ├── download.sh ├── exclude.txt ├── init.sh ├── kill.sh ├── main.sh ├── main_manual.sh ├── merge.sh ├── paras.sh ├── paras7.sh ├── paras8.sh ├── predict.sh ├── sync.sh └── test.sh ├── core ├── bert.py ├── conf.py ├── del.py ├── ensemble.py ├── ensemble_new.py ├── feature.py ├── feature_xlnet.py ├── mini.py ├── split.py └── xlnet.py ├── input └── readme.txt ├── notebook ├── .ipynb_checkpoints │ ├── Untitled-checkpoint.ipynb │ └── word_analysis_local-checkpoint.ipynb ├── Untitled.ipynb ├── Untitled1.ipynb ├── Untitled2.ipynb ├── lstm_best.ipynb ├── train_v2.ipynb └── word_analysis_local.ipynb ├── readme.md ├── readme2.md ├── requirements.txt ├── spider ├── gen_file.py └── mi.py ├── tip.md ├── zhtools ├── langconv.py ├── test.py └── zh_wiki.py └── 大数据标签-讯飞.pptx /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flyfoxs/xf_tag/ee3123f10ff884e46084c5c336b4fa792ad741c1/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | input 2 | .DS_Store 3 | .idea 4 | -------------------------------------------------------------------------------- /bin/bin_0_batch.log: -------------------------------------------------------------------------------- 1 | python: can't open file './core/bert.py': [Errno 2] No such file or directory 2 | python: can't open file './core/bert.py': [Errno 2] No such file or directory 3 | python: can't open file './core/bert.py': [Errno 2] No such file or directory 4 | python: can't open file './core/bert.py': [Errno 2] No such file or directory 5 | python: can't open file './core/bert.py': [Errno 2] No such file or directory 6 | -------------------------------------------------------------------------------- /bin/check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PYTHONPATH=/users/hdpsbp/bk/df_jf:/users/hdpsbp/felix/keras:$PYTHONPATH 4 | 5 | PATH=/apps/dslab/anaconda/python3/bin:$PATH 6 | 7 | 8 | 9 | #for level in 0.9 1 1.1 1.2 1.4 1.5 0.8 10 | #for level in 1.4 1.5 0.8 11 | for level in 0.75 0.85 0.7 12 | do 13 | #echo python -u core/train.py train_ex {} [] {0:$level, 3:$level, 4:$level, 6:$level, 9:$level} #> log/search_$level.log 2>&1 14 | python -u core/train.py train_ex {} [] \{0:$level,3:$level,4:$level,6:$level,9:$level\} > log/search2_$level.log 2>&1 15 | done 16 | -------------------------------------------------------------------------------- /bin/check_mini.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PYTHONPATH=/users/hdpsbp/bk/df_jf:/users/hdpsbp/felix/keras:$PYTHONPATH 4 | 5 | PATH=/apps/dslab/anaconda/python3/bin:$PATH 6 | 7 | 8 | 9 | for bin_count in 8 #20 10 | do 11 | echo $bin_count 12 | python ./core/check.py -L --bin_count $bin_count --gp_name lr_bin_$bin_count \ 13 | > log/bin_"$(hostname)"_$bin_count.log 2>&1 14 | done 15 | # nohup ./bin/check_mini.sh 5 & 16 | -------------------------------------------------------------------------------- /bin/clean.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd "$(dirname "$0")" 3 | cd .. 
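# The steps below rebuild the cached features and the raw training file:
# drop the stale merge_feature/get_final_feature caches, re-run the crawler's
# file generator, then reassemble apptype_train.dat from its split parts.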
4 | 5 | 6 | rm ./cache/merge_feature* 7 | rm ./cache/get_final_feature* 8 | python spider/gen_file.py > gen2.log 2>&1 9 | cat ./input/zip/apptype_train.dat_p* > ./input/zip/apptype_train.dat 10 | #./bin/test.sh 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /bin/compare.sh: -------------------------------------------------------------------------------- 1 | cd "$(dirname "$0")" 2 | 3 | cd .. 4 | 5 | #rm -rf cache/get_feature_target*compare*.h5 6 | nohup python -u code_felix/core/compare.py >> compare_"$(hostname)".log 2>&1 & 7 | -------------------------------------------------------------------------------- /bin/del.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd "$(dirname "$0")" 3 | cd .. 4 | 5 | 6 | 7 | for fold in 2 2 2 8 | do 9 | python -u ./core/bert.py --fold=${fold} train_base >> fold_${fold}_"$(hostname)".log 2>&1 10 | done 11 | 12 | -------------------------------------------------------------------------------- /bin/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd "$(dirname "$0")" 3 | 4 | #remote_host="aladdin1@dgx" 5 | remote_host="aladdin1@$1" 6 | remote_dir="~/felix/" 7 | 8 | cd .. 9 | 10 | if [[ -z "$2" ]]; then 11 | rsync -avz --exclude-from './bin/exclude.txt' $(pwd) $remote_host:$remote_dir 12 | else 13 | rsync -avz $(pwd) $remote_host:$remote_dir 14 | fi 15 | 16 | date 17 | 18 | echo 'upload to:' $remote_host:$remote_dir 19 | echo '====================================' 20 | 21 | #rsync -av ./output/0.70180553000.csv hdpsbp@ai-prd-07:/users/hdpsbp/felix/df_jf/output/ 22 | 23 | 24 | #rsync -av hdpsbp@ai-prd-04:/users/hdpsbp/felix/kdd_bd /apps/ 25 | 26 | 27 | #rsync -av ./input/tmp hdpsbp@ai-prd-07:/users/hdpsbp/felix/kdd_bd/input 28 | 29 | #rsync -av ./output/sub/?.csv hdpsbp@ai-prd-07:/users/hdpsbp/felix/kdd_bd/output/sub -------------------------------------------------------------------------------- /bin/deploy1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd "$(dirname "$0")" 3 | 4 | remote_host="root@vm-docker-1" #hdpsbp@ai-prd-04 5 | remote_dir="/apps/felix/" #/users/hdpsbp/felix/ 6 | 7 | cd .. 8 | 9 | if [[ -z "$1" ]]; then 10 | rsync -av --exclude-from './bin/exclude.txt' $(pwd) $remote_host:$remote_dir 11 | else 12 | rsync -av $(pwd) $remote_host:$remote_dir 13 | fi 14 | 15 | date 16 | 17 | echo $remote_host:$remote_dir 18 | 19 | #rsync -av ./output/0.70180553000.csv hdpsbp@ai-prd-07:/users/hdpsbp/felix/df_jf/output/ 20 | 21 | 22 | #rsync -av hdpsbp@ai-prd-04:/users/hdpsbp/felix/kdd_bd /apps/ -------------------------------------------------------------------------------- /bin/deploy38.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd "$(dirname "$0")" 3 | 4 | remote_host="aladdin1@10.10.20.38" 5 | remote_dir="/home/aladdin1/felix" 6 | 7 | cd .. 
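# Sync this project to aladdin1@10.10.20.38 over rsync.
# Assumed usage: "./bin/deploy38.sh" honors bin/exclude.txt, while passing any
# non-empty first argument (e.g. "./bin/deploy38.sh all") pushes everything.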
8 | 9 | if [[ -z "$1" ]]; then 10 | rsync -avz --exclude-from './bin/exclude.txt' $(pwd) $remote_host:$remote_dir 11 | else 12 | rsync -avz $(pwd) $remote_host:$remote_dir 13 | fi 14 | 15 | date 16 | 17 | echo $remote_host:$remote_dir 18 | 19 | #rsync -av ./output/0.70180553000.csv hdpsbp@ai-prd-07:/users/hdpsbp/felix/df_jf/output/ 20 | 21 | 22 | #rsync -av hdpsbp@ai-prd-04:/users/hdpsbp/felix/kdd_bd /apps/ 23 | 24 | 25 | #rsync -av ./input/tmp hdpsbp@ai-prd-07:/users/hdpsbp/felix/kdd_bd/input 26 | 27 | #rsync -av ./output/sub/?.csv hdpsbp@ai-prd-07:/users/hdpsbp/felix/kdd_bd/output/sub 28 | 29 | -------------------------------------------------------------------------------- /bin/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd "$(dirname "$0")" 4 | cd .. 5 | 6 | ./bin/deploy.sh $1 7 | 8 | remote_host="aladdin1@$1" 9 | remote_dir="~/felix/$(basename "$(pwd)")/*" 10 | 11 | 12 | if [[ -z "$2" ]]; then 13 | rsync -avz --exclude-from './bin/exclude.txt' --max-size=1m $remote_host:$remote_dir ./ 14 | else 15 | rsync -avz --max-size=1m $remote_host:$remote_dir ./ 16 | fi 17 | 18 | date 19 | 20 | echo 'download from:' $remote_host:$remote_dir 21 | -------------------------------------------------------------------------------- /bin/exclude.txt: -------------------------------------------------------------------------------- 1 | .git 2 | *.zip 3 | .idea 4 | **/*.pyc 5 | **/*.h5 6 | **/*.json 7 | **/__pycache__ 8 | **/*.log 9 | **/*.out 10 | **/*.h5 11 | **/*.hdf5 12 | **/*.pkl 13 | **/*.hdf5 14 | **/.DS_Store 15 | **/*.doc* 16 | **/.ipynb_checkpoints 17 | **/log 18 | **/logs 19 | **/nohup.out 20 | **/*.log 21 | output 22 | cache 23 | sub 24 | score 25 | *.pickle 26 | 27 | #input 28 | **/*.csv 29 | **/Tencent* 30 | #**/config*.py 31 | **/*.dat 32 | 33 | **/*remote.ipynb 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /bin/init.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd "$(dirname "$0")" 3 | cd .. 4 | 5 | 6 | mkdir -p input 7 | mkdir -p output/sub 8 | mkdir -p output/stacking 9 | mkdir -p output/model 10 | mkdir -p notebook 11 | mkdir -p core 12 | 13 | 14 | touch ./core/conf.py 15 | touch ./core/feature.py 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /bin/kill.sh: -------------------------------------------------------------------------------- 1 | kill $(pidof python) -------------------------------------------------------------------------------- /bin/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd "$(dirname "$0")" 3 | cd .. 4 | 5 | ##for i in {1..10}; 6 | #for i in $(seq 0 $1) 7 | 8 | export PYTHONPATH=./:$PYTHONPATH 9 | #if not input $1, default value is 100 10 | for i in $(seq 0 ${1:-100}) 11 | do 12 | for fold in {0..4}; 13 | do 14 | python -u ./core/bert.py --fold=${fold} --batch_size=8 train_base >> fold_${fold}_"$(hostname)".log 2>&1 15 | 16 | done 17 | done 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /bin/main_manual.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd "$(dirname "$0")" 3 | cd .. 
4 | 5 | for i in {1..100}; 6 | do 7 | echo $i 8 | python -u ./core/bert_manual.py train_base >> manual_batch_bin_0.log 2>&1 9 | done 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /bin/merge.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #PYTHONPATH=/users/hdpsbp/HadoopDir/felix/df_jf:/users/hdpsbp/felix/keras:$PYTHONPATH 4 | 5 | #PATH=/apps/dslab/anaconda/python3/bin:$PATH 6 | 7 | 8 | 9 | #rm -rf ./output/blocks/*.csv 10 | 11 | python ./core/merge.py > merge.log 2>&1 12 | -------------------------------------------------------------------------------- /bin/paras.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd "$(dirname "$0")" 3 | cd .. 4 | 5 | if [ $1 = 'v15' ] 6 | then 7 | echo '15' 8 | version=v15 9 | max_bin=0 10 | cut_ratio=0.1 11 | min_len_ratio=0.8 12 | echo ./cache/${version}*.* 13 | rm -rf ./cache/${version}*.* 14 | mv ./output/stacking/${version}*.* ./output/bk_stacking/ 15 | for fold in {0..4}; 16 | do 17 | python -u ./core/bert.py --fold=${fold} --version=${version} --max_bin=${max_bin} --cut_ratio=${cut_ratio} --min_len_ratio=${min_len_ratio} train_base >> ${version}_fold_${fold}_"$(hostname)".log 2>&1 18 | done 19 | 20 | elif [ $1 = 'v17' ] 21 | then 22 | echo '17' 23 | version=v17 24 | max_bin=2 25 | cut_ratio=0.1 26 | min_len_ratio=0.9 27 | echo ./cache/${version}*.* 28 | rm -rf ./cache/${version}*.* 29 | mv ./output/stacking/${version}*.* ./output/bk_stacking/ 30 | for fold in {0..4}; 31 | do 32 | python -u ./core/bert.py --fold=${fold} --version=${version} --max_bin=${max_bin} --cut_ratio=${cut_ratio} --min_len_ratio=${min_len_ratio} train_base >> ${version}_fold_${fold}_"$(hostname)".log 2>&1 33 | done 34 | 35 | fi 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /bin/paras7.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd "$(dirname "$0")" 3 | cd .. 
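# Each block below trains one hyper-parameter version (v3/v4/v5): clear that
# version's cache, move its old stacking output to ./output/bk_stacking/, then
# run ./core/bert.py for folds 0-4 with the chosen max_bin/cut_ratio/min_len_ratio.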
4 | 5 | version=v3 6 | max_bin=1 7 | cut_ratio=0.15 8 | min_len_ratio=0.8 9 | echo ./cache/${version}*.* 10 | rm -rf ./cache/${version}*.* 11 | mv ./output/stacking/${version}*.* ./output/bk_stacking/ 12 | for fold in {0..4}; 13 | do 14 | python -u ./core/bert.py --fold=${fold} --version=${version} --max_bin=${max_bin} --cut_ratio=${cut_ratio} --min_len_ratio=${min_len_ratio} train_base >> ${version}_fold_${fold}_"$(hostname)".log 2>&1 15 | done 16 | 17 | version=v4 18 | max_bin=1 19 | cut_ratio=0.2 20 | min_len_ratio=0.8 21 | echo ./cache/${version}*.* 22 | rm -rf ./cache/${version}*.* 23 | mv ./output/stacking/${version}*.* ./output/bk_stacking/ 24 | for fold in {0..4}; 25 | do 26 | python -u ./core/bert.py --fold=${fold} --version=${version} --max_bin=${max_bin} --cut_ratio=${cut_ratio} --min_len_ratio=${min_len_ratio} train_base >> ${version}_fold_${fold}_"$(hostname)".log 2>&1 27 | done 28 | 29 | 30 | version=v5 31 | max_bin=2 32 | cut_ratio=0.1 33 | min_len_ratio=0.8 34 | echo ./cache/${version}*.* 35 | rm -rf ./cache/${version}*.* 36 | mv ./output/stacking/${version}*.* ./output/bk_stacking/ 37 | for fold in {0..4}; 38 | do 39 | python -u ./core/bert.py --fold=${fold} --version=${version} --max_bin=${max_bin} --cut_ratio=${cut_ratio} --min_len_ratio=${min_len_ratio} train_base >> ${version}_fold_${fold}_"$(hostname)".log 2>&1 40 | done 41 | 42 | -------------------------------------------------------------------------------- /bin/paras8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd "$(dirname "$0")" 3 | cd .. 4 | 5 | version=v6 6 | max_bin=1 7 | cut_ratio=0.1 8 | min_len_ratio=0.7 9 | echo ./cache/${version}*.* 10 | rm -rf ./cache/${version}*.* 11 | mv ./output/stacking/${version}*.* ./output/bk_stacking/ 12 | for fold in {0..4}; 13 | do 14 | python -u ./core/bert.py --fold=${fold} --version=${version} --max_bin=${max_bin} --cut_ratio=${cut_ratio} --min_len_ratio=${min_len_ratio} train_base >> ${version}_fold_${fold}_"$(hostname)".log 2>&1 15 | done 16 | 17 | version=v7 18 | max_bin=1 19 | cut_ratio=0.1 20 | min_len_ratio=0.9 21 | echo ./cache/${version}*.* 22 | rm -rf ./cache/${version}*.* 23 | mv ./output/stacking/${version}*.* ./output/bk_stacking/ 24 | for fold in {0..4}; 25 | do 26 | python -u ./core/bert.py --fold=${fold} --version=${version} --max_bin=${max_bin} --cut_ratio=${cut_ratio} --min_len_ratio=${min_len_ratio} train_base >> ${version}_fold_${fold}_"$(hostname)".log 2>&1 27 | done 28 | 29 | 30 | 31 | version=v8 32 | max_bin=3 33 | cut_ratio=0.1 34 | min_len_ratio=0.8 35 | echo ./cache/${version}*.* 36 | rm -rf ./cache/${version}*.* 37 | mv ./output/stacking/${version}*.* ./output/bk_stacking/ 38 | for fold in {0..4}; 39 | do 40 | python -u ./core/bert.py --fold=${fold} --version=${version} --max_bin=${max_bin} --cut_ratio=${cut_ratio} --min_len_ratio=${min_len_ratio} train_base >> ${version}_fold_${fold}_"$(hostname)".log 2>&1 41 | done 42 | 43 | -------------------------------------------------------------------------------- /bin/predict.sh: -------------------------------------------------------------------------------- 1 | cd "$(dirname "$0")" 2 | 3 | cd .. 
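# Prediction pipeline: reuse ./imp/best_arg.h5 when it already exists, otherwise
# let ./core/check.py search for and save the best arguments; then pre-build the
# local feature cache and finally predict the missing blocks via ./core/merge.py --genfile.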
4 | best_arg="./imp/best_arg.h5" 5 | if [ -f "$best_arg" ]; then 6 | echo "Already have best args in $best_arg" 7 | else 8 | #生成当前最优参数,存放于目录 ./imp/ 9 | echo "Try go take_snapshotf for best args, and save in $best_arg" 10 | python ./core/check.py > snap_args.log 2>&1 11 | fi 12 | 13 | #提前对一些分析数据准备好本地缓存 14 | python ./core/feature.py > feature_prepare.log 2>&1 15 | 16 | #根据生成的最优参数,预测缺少数据 17 | python ./core/merge.py --genfile > predict_block.log 2>&1 18 | 19 | -------------------------------------------------------------------------------- /bin/sync.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd "$(dirname "$0")" 3 | 4 | cd .. 5 | 6 | 7 | rsync -av ./score/blks/ hdpsbp@ai-prd-05:/users/hdpsbp/felix/df_jf/score/blks 8 | rsync -av hdpsbp@ai-prd-05:/users/hdpsbp/felix/df_jf/score/blks ./score/blks/ 9 | 10 | 11 | rsync -av ./output/blocks/ hdpsbp@ai-prd-05:/users/hdpsbp/felix/df_jf/score/blks 12 | rsync -av hdpsbp@ai-prd-05:/users/hdpsbp/felix/kdd_bd ./output/blocks/ 13 | 14 | 15 | rsync -av hdpsbp@ai-prd-05:/users/hdpsbp/felix/df_jf/imp ./ 16 | 17 | date 18 | 19 | -------------------------------------------------------------------------------- /bin/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd "$(dirname "$0")" 3 | cd .. 4 | 5 | #for xx in {0..2}; 6 | #do 7 | # python -u spider/mi.py bd > bd.log 2>&1 8 | # python -u spider/mi.py wdj > wdj.log 2>&1 9 | # python -u spider/mi.py xm > xm.log 2>&1 10 | # 11 | # python -u spider/mi.py tx_pkg > tx_pkg.log 2>&1 12 | # python -u spider/mi.py tx_name > tx_name.log 2>&1 13 | # 14 | # python spider/gen_file.py > gen_file.log 2>&1 15 | #done 16 | 17 | for xx in {0..2}; 18 | do 19 | for fold in 4 3 2 1 0 20 | do 21 | python -u ./core/bert.py --fold=${fold} train_base >> fold_${fold}_"$(hostname)".log 2>&1 22 | done 23 | 24 | done 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /core/bert.py: -------------------------------------------------------------------------------- 1 | 2 | from multiprocessing import Process 3 | 4 | 5 | 6 | 7 | 8 | from core.feature import * 9 | from core.conf import * 10 | 11 | import os 12 | 13 | #os.environ["CUDA_VISIBLE_DEVICES"] = "2" 14 | os.environ['TF_KERAS'] = '1' 15 | 16 | oof_prefix = get_args().version 17 | SEQ_LEN = get_args().seq_len #randrange(128, 180) #-randrange(0, 5)*8 18 | BATCH_SIZE = get_args().batch_size 19 | 20 | #Batch size, MAX_len+ex_length, Manual, Manual GP feature cnt, frac 21 | @lru_cache() 22 | @timed() 23 | def get_train_test_bert(): 24 | 25 | frac = get_args().frac 26 | max_bin = get_args().max_bin 27 | min_len = int(SEQ_LEN*get_args().min_len_ratio) 28 | 29 | data = get_feature_bert(SEQ_LEN) 30 | 31 | #Keep all the bin group, if it's test data 32 | data = data.loc[(data.bin<=max_bin) | (pd.isna(data.type_id))] 33 | 34 | with timed_bolck(f'Remove gan data, and len is less then {min_len}'): 35 | data = data.loc[ (data.bin == 0) | (data['len_'] >= min_len) ] 36 | logger.info(f'Train max_bin:{max_bin},Total Bin distribution:\n{data.bin.value_counts().sort_index()}') 37 | 38 | data = data.sort_index() 39 | logger.info(f'Head of the data:\n, {data.iloc[:3,:3]}') 40 | 41 | train_data = data.loc[pd.notna(data.type_id)].sample(frac=frac, random_state=2019) 42 | labels = train_data.type_id.values.tolist() 43 | logger.info(f'Train Bin distribution:\n{train_data.bin.value_counts().sort_index()}') 44 | 45 | test_data = 
data.loc[pd.isna(data.type_id)].sample(frac=1, random_state=2019) 46 | 47 | trial = get_args().trial 48 | logger.info(f'Test Bin distribution#{trial}:\n{test_data.bin.value_counts().sort_index()}') 49 | 50 | if trial > 0: 51 | test_data = test_data.loc[test_data.index.str[-1]=='0'] 52 | 53 | 54 | logger.info(f'Train:{train_data.shape} Test#{trial}:{test_data.shape}, frac:{frac}') 55 | 56 | feature_col = [col for col in data.columns if col.startswith('fea_') or col.startswith('bert_')] 57 | 58 | label2id, id2label = get_label_id() 59 | #word2id = get_word2id() 60 | 61 | # Encode input words and labels 62 | X = train_data.loc[:, feature_col] 63 | Y = [label2id[label] for label in labels] 64 | 65 | 66 | X_test = test_data.loc[:, feature_col] 67 | 68 | 69 | return X, pd.Series(Y, index=train_data.index), X_test 70 | 71 | 72 | # X, y, X_test = get_train_test_bert(0.1) 73 | # 74 | # 75 | # train_x, train_y = load_data(train_path) 76 | # test_x, test_y = load_data(test_path) 77 | 78 | def boost_train(boost=10): 79 | for _ in range(boost): 80 | p = Process(target=train_base) 81 | p.start() 82 | p.join() 83 | 84 | 85 | @timed() 86 | def filter_short_desc(X, y): 87 | X = X.copy().reset_index() 88 | bert_cols = [col for col in X.columns if str(col).startswith('bert_')] 89 | bert = X.loc[:, bert_cols] 90 | bert_len = bert.where(bert > 0).count(axis=1) 91 | old_len = len(bert_len) 92 | min_len = int(SEQ_LEN*get_args().min_len_ratio) 93 | bert_len = bert_len.loc[bert_len >= min_len] 94 | logger.info(f'Filter {old_len - len(bert_len)} records from {old_len} by threshold {min_len}') 95 | 96 | return X.iloc[bert_len.index], y[bert_len.index] 97 | 98 | 99 | @timed() 100 | def train_base(): 101 | args = get_args() 102 | #frac = args.frac 103 | fold = args.fold 104 | EPOCHS = args.epochs 105 | 106 | 107 | LR = 2e-5 108 | 109 | with timed_bolck(f'Prepare train data#{BATCH_SIZE}, LR:{LR}'): 110 | X, y, _ = get_train_test_bert() 111 | 112 | ##Begin to define model 113 | from keras_bert import load_trained_model_from_checkpoint 114 | 115 | logger.info(f'Start to train base on checkpoint:{config_path}') 116 | bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=SEQ_LEN, ) 117 | 118 | for l in bert_model.layers: 119 | l.trainable = True 120 | from tensorflow.python import keras 121 | #from keras_bert import calc_train_steps 122 | 123 | x1_in = keras.layers.Input(shape=(None,)) 124 | x2_in = keras.layers.Input(shape=(None,)) 125 | 126 | x = bert_model([x1_in, x2_in]) 127 | 128 | x = keras.layers.Lambda(lambda x: x[:, 0])(x) 129 | 130 | p = keras.layers.Dense(num_classes, activation='sigmoid')(x) 131 | 132 | #from keras import Model 133 | model = keras.models.Model([x1_in, x2_in], p) 134 | 135 | 136 | model.compile( 137 | optimizer=keras.optimizers.Adam(lr=LR), # AdamWarmup(decay_steps=decay_steps, warmup_steps=warmup_steps, lr=LR), 138 | loss='categorical_crossentropy', 139 | metrics=['accuracy'], 140 | ) 141 | model.summary() 142 | ##End to define model 143 | 144 | input1_col = [col for col in X.columns if str(col).startswith('bert_')] 145 | input2_col = [col for col in X.columns if str(col).startswith('fea_')] 146 | #max_words = len(input1_col) 147 | model #= get_model(max_words) 148 | 149 | #get_feature_manual.cache_clear() 150 | Y_cat = keras.utils.to_categorical(y, num_classes=num_classes) 151 | #folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019) 152 | 153 | with timed_bolck(f'Training#{fold}'): 154 | from core.split import split_df_by_index 155 | train_idx, 
test_idx = split_df_by_index(X,fold) 156 | 157 | logger.info(f'Shape train_x.loc[:, input1_col].iloc[:,0]: {X.loc[:, input1_col].iloc[:,0].shape}') 158 | train_x, train_y, val_x, val_y = \ 159 | X.iloc[train_idx], Y_cat[train_idx], X.iloc[test_idx], Y_cat[test_idx] 160 | 161 | logger.info(f'get_train_test output: train_x:{train_x.shape}, train_y:{train_y.shape}, val_x:{val_x.shape} ') 162 | 163 | #train_x, train_y = filter_short_desc(train_x, train_y) 164 | 165 | input1 = train_x.loc[:, input1_col]#.astype(np.float32) 166 | input2 = np.zeros_like(input1)#.astype(np.int8) 167 | 168 | logger.info(f'NN train_x:{train_x[:3]}') 169 | min_len_ratio = get_args().min_len_ratio 170 | max_bin = get_args().max_bin 171 | logger.info(f'NN Input1:{input1.shape}, Input2:{input2.shape}, SEQ_LEN:{SEQ_LEN}, min_len_ratio:{min_len_ratio}, bin:{max_bin} ') 172 | 173 | from keras_bert import get_custom_objects 174 | import tensorflow as tf 175 | with tf.keras.utils.custom_object_scope(get_custom_objects()): 176 | his = model.fit([input1, input2], train_y, 177 | validation_data = ([val_x.loc[:, input1_col], np.zeros_like(val_x.loc[:, input1_col])], val_y), 178 | epochs=EPOCHS, shuffle=True, batch_size=BATCH_SIZE, 179 | callbacks=[Cal_acc( val_x, y.iloc[test_idx] )] 180 | #steps_per_epoch=1000, validation_steps=10 181 | ) 182 | 183 | 184 | 185 | #gen_sub(model, X_test, sn) 186 | 187 | return his 188 | 189 | from tensorflow.python.keras.callbacks import Callback 190 | class Cal_acc(Callback): 191 | 192 | def __init__(self, val_x, y): 193 | super(Cal_acc, self).__init__() 194 | self.val_x , self.y = val_x, y 195 | self.min_len = int(SEQ_LEN*get_args().min_len_ratio) 196 | self.max_bin = get_args().max_bin 197 | self.fold = get_args().fold 198 | self.threshold = 0 199 | self.feature_len = self.val_x.shape[1] 200 | self.cur_epoch = 0 201 | self.version = get_args().version 202 | self.trial = get_args().trial 203 | 204 | self.max_score = 0 205 | 206 | self.score_list = np.zeros(get_args().epochs) 207 | self.gen_file = False 208 | 209 | import time, os 210 | self.batch_id = round(time.time()) 211 | self.model_folder = f'./output/model/{self.batch_id}/' 212 | 213 | os.makedirs(self.model_folder) 214 | 215 | 216 | #logger.info(f'Cal_acc base on X:{self.X.shape}, Y:{self.y.shape}') 217 | 218 | #@timed() 219 | def cal_acc(self): 220 | input1_col = [col for col in self.val_x.columns if str(col).startswith('bert_')] 221 | #input2_col = [col for col in self.val_x.columns if str(col).startswith('fea_')] 222 | #model = self.model 223 | tmp_val = self.val_x.loc[:,input1_col] 224 | tmp_y = self.y 225 | val = self.model.predict([tmp_val, np.zeros_like(tmp_val)]) 226 | 227 | label2id, id2label = get_label_id() 228 | val = pd.DataFrame(val, columns=label2id.keys(), index=tmp_val.index) 229 | val['label'] = tmp_y.astype(int).replace(id2label).astype(int) 230 | val['bin'] = pd.Series(val.index).str[-1].values.astype(int) 231 | #logger.info(f'Head val#label:\n{val.label.head()}') 232 | res_val = val.copy() 233 | # res_val.to_pickle(f'./output/tmp_res_val.pkl') 234 | # logger.info(f'Debug file: save to ./output/tmp_res_val.pkl') 235 | 236 | num_labels = 10 237 | df_score = val.loc[val.bin==0] 238 | score_list = accuracy(df_score, num_labels, f'no{self.cur_epoch},b{self.max_bin},{self.version}') 239 | 240 | logger.info(f'{len(df_score)}/{len(res_val)}, fold:{self.fold}, score for label1-f{num_labels}:{score_list}') 241 | 242 | return score_list,res_val 243 | 244 | @timed() 245 | def cal_acc_ex(self): 246 | input1_col = [col for col in 
self.val_x.columns if str(col).startswith('bert_')] 247 | 248 | if self.trial==0: 249 | check_type_list =['val'] 250 | for type_ in tqdm(check_type_list,desc='cal_acc_ex'): 251 | tmp_val ,tmp_y = self.get_tmp_val_test(type_) 252 | tmp_val = tmp_val.loc[:, input1_col] 253 | 254 | val = self.model.predict([tmp_val, np.zeros_like(tmp_val)]) 255 | 256 | label2id, id2label = get_label_id() 257 | val = pd.DataFrame(val, columns=label2id.keys(), index=tmp_val.index) 258 | val['label'] = tmp_y.astype(int).replace(id2label).astype(int) 259 | val['bin'] = pd.Series(val.index).str[-1].values.astype(int) 260 | # logger.info(f'Head val#label:\n{val.label.head()}') 261 | res_val = val.copy() 262 | # res_val.to_pickle(f'./output/tmp_res_val.pkl') 263 | # logger.info(f'Debug file: save to ./output/tmp_res_val.pkl') 264 | 265 | num_labels = 10 266 | df_score = val.loc[val.bin == 0] 267 | score_list = accuracy(df_score, num_labels, f'ex{self.cur_epoch},{self.version},b{self.max_bin},{type_}') 268 | 269 | logger.info(f'===cal_acc_ex{self.cur_epoch}:{type_}==={len(df_score)}/{len(res_val)}, fold:{self.fold}, score for label1-f{num_labels}:{score_list}') 270 | 271 | return score_list, res_val 272 | 273 | 274 | @lru_cache() 275 | @timed() 276 | def get_tmp_val_test(self, type_): 277 | _, _, test_all = get_train_test_bert() 278 | 279 | test = test_all.loc[pd.Series(test_all.index).str.startswith(type_).values] 280 | 281 | test = test.loc[(pd.Series(test.index).str[-1]=='0').values] 282 | 283 | logger.info(f'Split {type_}, {len(test)} rows from {len(test_all)}') 284 | 285 | test=test.copy() 286 | type_ = 'x'*6 + pd.Series(test.index).str[:6] 287 | test.index = 'x'*6 + pd.Series(test.index).str[6:] 288 | 289 | from spider.mi import get_train_ph2_index 290 | train_ph2 = get_train_ph2_index() 291 | #final = final.loc[final.type_id.str.len() >= 1] 292 | train_ph2.index = 'x'*6 + train_ph2['id'].str[6:] 293 | #Align label with input test 294 | index_old = test.index.copy() 295 | test.index = pd.Series(test.index).apply(lambda val: val[:32]) 296 | 297 | label = train_ph2.type_id.loc[test.index.values].str[:6] #type_id len is 6 298 | 299 | #Rollback index change 300 | test.index = index_old 301 | label.index = index_old 302 | 303 | test = test.loc[pd.notna(label).values] 304 | label = label.dropna() 305 | print('test, label, type_', test.shape, label.shape, type_.shape) 306 | return test, label#, type_ 307 | 308 | 309 | def on_train_end(self, logs=None): 310 | grow= max(self.score_list) - self.threshold 311 | cut_ratio = get_args().cut_ratio 312 | logger.info(f'Train END: Fold:{self.fold}, max:{max(self.score_list):7.6f}/{grow:+6.5f}, at {np.argmax(self.score_list)}/{len(self.score_list)-1}, his:{self.score_list}, max_bin:{self.max_bin}, cut:{cut_ratio}, min_len:{self.min_len:03}, SEQ_LEN:{SEQ_LEN:03}, threshold:{self.threshold:7.6f}, gen_file:{self.gen_file}') 313 | logger.info(f'Input args:{get_args()}') 314 | 315 | def on_epoch_end(self, epoch, logs=None): 316 | self.cur_epoch = epoch 317 | print('\n') 318 | _, _ = self.cal_acc_ex() 319 | 320 | if self.trial > 0: 321 | return 0 322 | else: 323 | score_list, val = self.cal_acc() 324 | total = score_list[1] 325 | 326 | self.score_list[epoch] = round(total, 6) 327 | #threshold_map = {0:0.785, 1:0.77, 2:0.77, 3:0.77, 4:0.78} 328 | top_cnt =2 329 | top_score = self._get_top_score(self.fold)[:top_cnt] 330 | self.threshold = top_score[0] if len(top_score) > 0 else 0 331 | logger.info(f'The top#{top_cnt} score for max_bin:{get_args().max_bin}, epoch:{epoch}, 
oof:{oof_prefix}, fold#{self.fold} is:{top_score}, cur_score:{total}, threshold:{self.threshold}') 332 | if ( round(total,4) > round(self.threshold,4) 333 | and (epoch>=3 or self.threshold > 0 ) 334 | and total > self.max_score 335 | ) : 336 | #logger.info(f'Try to gen sub file for local score:{total}, and save to:{model_path}') 337 | self.gen_file=True 338 | grow = max(self.score_list) - self.threshold 339 | logger.info(f'Fold:{self.fold}, epoch:{epoch}, MAX:{max(self.score_list):7.6f}/{grow:+6.5f}, threshold:{self.threshold}, score_list:{self.score_list}' ) 340 | test = self.gen_sub(self.model, f'{self.feature_len}_{total:7.6f}_{epoch}_f{self.fold}') 341 | len_raw_val = len(val.loc[val.bin == 0]) 342 | min_len_ratio = get_args().min_len_ratio 343 | oof_file = f'./output/stacking/{oof_prefix}_{self.fold}_{total:7.6f}_{len_raw_val}_{len(val):05}_b{get_args().max_bin}_e{epoch}_{self.batch_id}_m{min_len_ratio:2.1f}_L{SEQ_LEN:03}.h5' 344 | self.save_stack_feature(val, test, oof_file) 345 | else: 346 | logger.info(f'Epoch:{epoch}, only gen sub file if the local score >{self.threshold}, current score:{total}, threshold:{self.threshold}, max_score:{self.max_score}') 347 | 348 | self.max_score = max(self.max_score, total, 0.82) 349 | 350 | logger.info(f'Epoch#{epoch} END,max_bin:{get_args().max_bin}, oof:{oof_prefix}, max:{self.max_score:6.5f}, score:{score_list}, Fold:{self.fold},') 351 | 352 | print('\n') 353 | 354 | return round(total, 5) 355 | 356 | @staticmethod 357 | @timed() 358 | def save_stack_feature(train: pd.DataFrame, test: pd.DataFrame, file_path): 359 | train.bin = train.bin.astype(int) 360 | test.bin = test.bin.astype(int) 361 | train.to_hdf(file_path, 'train', mode='a') 362 | test.to_hdf(file_path, 'test', mode='a') 363 | logger.info(f'OOF file save to :{file_path}') 364 | return train, test 365 | 366 | 367 | @timed() 368 | #./output/model/1562899782/model_6114_0.65403_2.h5 369 | def gen_sub(self, model , info='bert_' , partition_len = 5000): 370 | 371 | #frac = get_args().frac 372 | _, _, test = get_train_test_bert() 373 | 374 | label2id, id2label = get_label_id() 375 | input1_col = [col for col in test.columns if str(col).startswith('bert_')] 376 | input3_col = [col for col in test.columns if str(col).startswith('fea_')] 377 | 378 | logger.info(f'Input input1_col:{len(input1_col)}, input3_col:{len(input3_col)}') 379 | res_list = [] 380 | for sn in tqdm(range(1+ len(test)//partition_len), desc=f'{info}:sub:total:{len(test)},partition_len:{partition_len}'): 381 | tmp = test.iloc[sn*partition_len: (sn+1)*partition_len] 382 | #print('\nbegin tmp\n', tmp.iloc[:3,:3].head()) 383 | res = model.predict([ tmp.loc[:,input1_col], np.zeros_like(tmp.loc[:,input1_col]) ]) 384 | res = pd.DataFrame(res, columns=label2id.keys(), index=tmp.index) 385 | #print('\nend tmp\n', res.iloc[:3, :3].head()) 386 | res_list.append(res) 387 | 388 | res = pd.concat(res_list) 389 | res['bin'] = res.index.str[-1].values.astype(int) 390 | raw_predict = res.copy() 391 | 392 | with timed_bolck(f'Try to gen sub file for fold#{self.fold}'): 393 | #print('\nafter concat\n', res.iloc[:3, :3].head()) 394 | res['id'] = res.index 395 | res.index.name = 'id' 396 | # res.to_pickle(f'./output/tmp_sub.pkl') 397 | 398 | 399 | #print('\nend res\n', res.iloc[:3, :3].head()) 400 | 401 | 402 | 403 | res_mean = res.copy(deep=True) 404 | res_mean['id'] = res_mean.id.apply(lambda val: val.split('_')[0]) 405 | res_mean.index.name = 'index' 406 | res_select = res_mean.groupby('id')['bin'].agg({'bin_max': 'max'}) 407 | 
res_select.head() 408 | res_select = res_select.loc[res_select.bin_max == 3] 409 | res_mean = res_mean.loc[(res_mean.bin == 0) 410 | | ((res_mean.bin == 1) & (res_mean.id.isin(res_select.index))) 411 | ] 412 | logger.info(f'Try to cal avg for res_mean:\n{res_mean.bin.value_counts()}') 413 | res_mean_len = len(res_mean) 414 | res_mean = res_mean.groupby('id').mean().sort_index() 415 | del res_mean['bin'] 416 | 417 | 418 | res_0 = res.copy(deep=True) 419 | res_0 = res_0.loc[res_0.bin == 0] 420 | res_0.index = res_0.id.apply(lambda val: val.split('_')[0]) 421 | #print('\nres_0\n', res_0.loc[:, ['id', 'bin']].head(3)) 422 | res_0 = res_0.sort_index() 423 | res_0 = res_0.drop(columns=['id','bin'], axis=1, errors='ignore') 424 | 425 | for name, res in [('single',res_0), (f'mean_{res_mean_len}', res_mean)]: 426 | res = res.copy() 427 | #logger.info(f'{name} Check:\n{res.iloc[:3,:num_classes].sum(axis=1)}') 428 | 429 | res['label1'] = res.iloc[:, :num_classes].idxmax(axis=1) 430 | 431 | # Exclude top#1 432 | for index, col in res.label1.items(): 433 | res.loc[index, col] = np.nan 434 | 435 | res['label2'] = res.iloc[:, :num_classes].idxmax(axis=1) 436 | 437 | 438 | for col in ['label1','label2']: 439 | res[col] = res[col].replace(id2label) 440 | 441 | # info = info.replace('.','') 442 | # sub_file = f'./output/sub/v19_{info}_{name}.csv' 443 | # res[['label1', 'label2']].to_csv(sub_file) 444 | # logger.info(f'Sub file save to :{sub_file}') 445 | 446 | #logger.info(f'res_0 Check:\n{res_0.iloc[:3, :num_classes].sum(axis=1)}') 447 | 448 | return raw_predict #res.drop(columns=['id','bin'], axis=1, errors='ignore') 449 | 450 | @staticmethod 451 | def _get_top_score(fold): 452 | from glob import glob 453 | file_list = sorted(glob(f'./output/stacking/{oof_prefix}_{fold}_*.h5'), reverse=True) 454 | score_list = [float(file.split('_')[2].replace('.h5', '')) for file in file_list] 455 | logger.info(f'Score list for fold#{fold} is {score_list}') 456 | return score_list if score_list else [0] 457 | 458 | if __name__ == '__main__': 459 | FUNCTION_MAP = {'train_base': train_base, 460 | } 461 | 462 | args = get_args() 463 | 464 | func = FUNCTION_MAP[args.command] 465 | func() 466 | 467 | """ 468 | 469 | nohup python -u ./core/bert.py --frac=0.1 train_base > test.log 2>&1 & 470 | 471 | nohup python -u ./core/bert.py --fold=4 --max_bin=2 train_base > test_4.log 2>&1 & 472 | 473 | python -u ./core/bert.py --max_bin=2 train_base 474 | 475 | nohup python -u ./core/bert.py train_base > test.log 2>&1 & 476 | 477 | nohup python -u ./core/bert.py train_base > extend_bert_mean_bin_1.log 2>&1 & 478 | 479 | nohup python -u ./core/bert.py boost_train 10 >> boost_1.log 2>&1 & 480 | 481 | """ -------------------------------------------------------------------------------- /core/conf.py: -------------------------------------------------------------------------------- 1 | from random import randrange 2 | 3 | input_dir = './input/zip/' 4 | 5 | type_dict = { 6 | 'type_id': 'str', 7 | 8 | } 9 | 10 | word2vec_tx, vector_size = './input/Tencent_AILab_ChineseEmbedding.txt', 200 11 | 12 | word2vec_tx_mini = './input/mini_tx.kv' 13 | 14 | num_classes = 126 #get_label_id() 15 | 16 | 17 | bert_wv = "./input/bert.kv" 18 | ####Bert Config 19 | import os 20 | pretrained_path = '/users/hdpsbp/HadoopDir/felix/xf_tag/input/roebert' #'./input/model/chinese_L-12_H-768_A-12' 21 | 22 | if not os.path.exists(pretrained_path): 23 | pretrained_path = '/home/aladdin1/felix/robert' 24 | 25 | #pretrained_path = 
'./input/model/chinese_wwm_ext_L-12_H-768_A-12' 26 | config_path = os.path.join(pretrained_path, 'bert_config_large.json') 27 | checkpoint_path = os.path.join(pretrained_path, 'roberta_zh_large_model.ckpt') 28 | vocab_path = os.path.join(pretrained_path, 'vocab.txt') 29 | 30 | check_type_list = ['stb', '50', '100', '200', '300','1000', 31 | #'a2', 'a3', 'bd', 32 | ] 33 | 34 | ####### 35 | 36 | xlnet_path='/users/hdpsbp/HadoopDir/felix/xlnet' -------------------------------------------------------------------------------- /core/del.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import numpy as np 5 | 6 | from keras_xlnet import Tokenizer, load_trained_model_from_checkpoint, ATTENTION_TYPE_BI 7 | 8 | 9 | 10 | '''Can be found at https://github.com/ymcui/Chinese-PreTrained-XLNet''' 11 | checkpoint_path = '/users/hdpsbp/HadoopDir/felix/xlnet' 12 | vocab_path = os.path.join(checkpoint_path, 'spiece.model') 13 | config_path = os.path.join(checkpoint_path, 'xlnet_config.json') 14 | model_path = os.path.join(checkpoint_path, 'xlnet_model.ckpt') 15 | 16 | # Tokenize inputs 17 | tokenizer = Tokenizer(vocab_path) 18 | text = "给岁月以文明" 19 | tokens = tokenizer.encode(text) 20 | 21 | -------------------------------------------------------------------------------- /core/ensemble.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from core.conf import * 4 | from core.feature import * 5 | 6 | static_list = [ 7 | # './output/stacking/v6_0_0.804024_6006_10605_b1_e1_m50.h5', 8 | # './output/stacking/v6_0_0.804532_6006_10605_b2_e1_m50.h5', 9 | # './output/stacking/v6_1_0.789411_5918_10451_b2_e1_m50.h5', 10 | # './output/stacking/v6_1_0.792143_5918_10451_b1_e1_m50.h5', 11 | # './output/stacking/v6_2_0.790876_6237_11068_b1_e1_m50.h5', 12 | # './output/stacking/v6_2_0.791542_6237_11068_b1_e1_m50.h5', 13 | # './output/stacking/v6_3_0.799421_5996_10562_b1_e1_m50.h5', 14 | # './output/stacking/v6_3_0.801635_5996_10562_b1_e1_m50.h5', 15 | # './output/stacking/v6_4_0.765271_6977_12388_b4_e1_m20.h5', 16 | # './output/stacking/v6_4_0.766215_6977_06977_b0_e1_m20.h5', 17 | ] 18 | @lru_cache() 19 | def get_top_file(fold,version): 20 | from glob import glob 21 | file_list = sorted(glob(f'./output/stacking/{version}_{fold}_*.h5'), reverse=True) 22 | 23 | if static_list: 24 | file_list = [ file for file in file_list if file in static_list] 25 | return file_list 26 | 27 | @lru_cache() 28 | def get_file_list(version, top=2,): 29 | file_list = [] 30 | for fold in range(5): 31 | tmp = get_top_file(fold, version) 32 | if len(tmp) < top: 33 | logger.warning(f'At least need {top} files for fold:{fold}') 34 | file_list = file_list + tmp[:top] 35 | return tuple(file_list) 36 | 37 | @lru_cache() 38 | @timed() 39 | def get_feature_oof(file_list, weight=1,base_train=True): 40 | 41 | train_list = [] 42 | test_list = [] 43 | 44 | for file in tqdm(file_list,f'gen oof from {len(file_list)} files'): 45 | cur_weight = weight if weight > 0 else get_best_weight(file, base_train=base_train) 46 | 47 | #Train begin 48 | tmp = pd.read_hdf(file, 'train') 49 | col_list = tmp.columns[:num_classes] 50 | tmp['app_id'] = tmp.index.str[:32].values 51 | tmp['bin'] = tmp.index.str[-1].values.astype(int) 52 | tmp = tmp.sort_values(['app_id', 'bin', 'label']) 53 | tmp = tmp.drop_duplicates(['app_id', 'bin']) 54 | 55 | tmp.loc[tmp.bin == 0, col_list] = tmp.loc[tmp.bin == 0, col_list] * cur_weight 56 | tmp.loc[tmp.bin == 1, 
col_list] = tmp.loc[tmp.bin == 1, col_list] * (1 - cur_weight) 57 | tmp.label = tmp.label.astype(int) 58 | tmp = tmp.loc[tmp.bin.isin([0, 1])].groupby('app_id').mean() 59 | 60 | train_list.append(tmp) 61 | 62 | #Test begin 63 | tmp = pd.read_hdf(file, 'test') 64 | tmp['app_id'] = tmp.index.str[:32].values 65 | tmp['bin'] = tmp.index.str[-1].values.astype(int) 66 | tmp.loc[tmp.bin == 0, col_list] = tmp.loc[tmp.bin == 0, col_list] * cur_weight 67 | tmp.loc[tmp.bin == 1, col_list] = tmp.loc[tmp.bin == 1, col_list] * (1 - cur_weight) 68 | tmp = tmp.loc[tmp.bin.isin([0, 1])].groupby('app_id').mean() 69 | test_list.append(tmp) 70 | 71 | train = pd.concat(train_list) 72 | 73 | test = pd.concat(test_list) 74 | 75 | oof = pd.concat([train, test]) 76 | print('oof, before=', oof.shape) 77 | oof = oof.groupby(oof.index).mean() 78 | print('oof, after=', oof.shape) 79 | del oof['bin'] 80 | oof.label = oof.label.fillna(0).astype(int).astype(str) 81 | return oof 82 | 83 | @timed() 84 | def gen_sub_file(res, file_name, topn=2): 85 | res = res.copy() 86 | res_raw = res.copy() 87 | 88 | for i in tqdm(range(1, 1+topn), desc=f'Cal label#1-{topn} value for res:{res.shape}'): 89 | res.loc[:, f'label{i}'] = res.iloc[:, :num_classes].idxmax(axis=1) 90 | res_raw.loc[:, f'label{i}'] = res.loc[:, f'label{i}'] 91 | 92 | for index, col in res[f'label{i}'].items(): 93 | res.loc[index, col] = np.nan 94 | 95 | 96 | if file_name: 97 | from spider.mi import get_train_ph2_index 98 | train_ph2 = get_train_ph2_index() 99 | 100 | res_bk = res.copy().loc[~res.index.str[6:].isin(train_ph2.id.str[6:].values)] 101 | for res in [res, res_bk]: 102 | res.index.name = 'id' 103 | sub_file = f'./output/sub/{len(res)}_{file_name}' 104 | res[['label1', 'label2']].to_csv(sub_file) 105 | logger.info(f'Sub file save to :{sub_file}') 106 | 107 | return res_raw 108 | 109 | 110 | 111 | @timed() 112 | def get_best_weight(file, base_train): 113 | import pandas as pd 114 | if base_train: 115 | df = pd.read_hdf(file, 'train') 116 | else: 117 | df = pd.read_hdf(file, 'test') 118 | 119 | from spider.mi import get_train_ph2_index 120 | ph2_train = get_train_ph2_index() 121 | ph2_train = ph2_train.set_index('id') 122 | df = df.loc[pd.Series(df.index).str[:32].isin(ph2_train.index).values] 123 | df['label'] = ph2_train.loc[pd.Series(df.index).str[:32]].type_id.str[:6].values.astype(int) 124 | 125 | 126 | df['bin'] = df.index.str[-1].astype(int) 127 | 128 | col_list = df.columns[:num_classes] 129 | #print(col_list) 130 | df['bin'] = df.index.str[-1].astype(int) 131 | df['app_id'] = df.index.str[:32] 132 | 133 | if len(df.loc[df.bin==1]) ==0 : 134 | return 1 135 | 136 | print(df.bin.value_counts()) 137 | df = df.sort_values(['app_id', 'bin', 'label']) 138 | df = df.drop_duplicates(['app_id', 'bin']) 139 | 140 | score ={} 141 | 142 | for weight in tqdm(np.arange(0.7, 1.01, 0.05), desc=f'Cal best for {file}'): 143 | weight = round(weight, 2) 144 | tmp = df.copy() 145 | # print(tmp.label.head(3)) 146 | tmp.loc[tmp.bin == 0, col_list] = tmp.loc[tmp.bin == 0, col_list] * weight 147 | tmp.loc[tmp.bin == 1, col_list] = tmp.loc[tmp.bin == 1, col_list] * (1 - weight) 148 | 149 | # tmp = tmp.loc[tmp.bin==0] 150 | tmp = tmp.loc[tmp.bin.isin([0, 1])] 151 | #print(tmp.bin.value_counts()) 152 | tmp = tmp.groupby('app_id').mean() 153 | 154 | # print(tmp.shape) 155 | # print(tmp.label.head(3)) 156 | tmp.label = tmp.label.astype(int) 157 | # print(tmp.shape) 158 | score_list = accuracy(tmp) 159 | logger.info(f'weight:{weight}, score_list:{score_list}. 
base_train:{base_train}, File:{file}') 160 | total = score_list[1] 161 | score[weight] = total 162 | 163 | logger.info(f'Score list for file:{file}\n{score}') 164 | 165 | base_score = list(score.values())[-1] 166 | 167 | score = sorted(score.items(), key=lambda kv: kv[1]) 168 | best_score = score[-1][-1] 169 | best_weight = score[-1][0] 170 | grow = best_score-base_score 171 | 172 | logger.info(f'====best_weight:{best_weight:3.2}, best_score:{best_score:6.5f}/{grow:6.5f},base_train:{base_train},File:{file}') 173 | return best_weight 174 | 175 | 176 | def compare(file='./output/sub/80000_v36_07_bt_True_mean_top2_000_865700.csv'): 177 | df = pd.read_csv(file) 178 | 179 | from spider.mi import get_final_feature 180 | final = get_final_feature() 181 | 182 | df = pd.merge(final, df, how='left', on='id') 183 | 184 | def check(row): 185 | if len(str(row.type_id)) == 0: 186 | return None 187 | 188 | label_list = row.type_id.split('|') 189 | 190 | return str(row.label1) in label_list or str(row.label2) in label_list 191 | 192 | df['is_corr'] = df.apply(lambda row: check(row), axis=1) 193 | 194 | print(df.shape, '\n', df.is_corr.value_counts()) 195 | df = df.loc[df.is_corr == False] 196 | 197 | type_name = get_app_type() 198 | type_name = type_name.set_index('type_id') 199 | type_name.index = type_name.index.astype(str) 200 | 201 | df.label1 = df.label1.astype(str).replace(type_name.to_dict()['type_name']) 202 | df.label2 = df.label2.astype(str).replace(type_name.to_dict()['type_name']) 203 | df.type_id = df.type_id.astype(str).replace(type_name.to_dict()['type_name']) 204 | 205 | print(df['from'].value_counts()) 206 | 207 | return df 208 | 209 | @timed() 210 | def main(): 211 | for top in [4]: 212 | for weight in [0]: 213 | version = get_args().version 214 | with timed_bolck(f'Cal sub for top:{top}, weight:{weight:3.2f}, version:{version}'): 215 | for base_train in [True]: 216 | 217 | file_list_1 = get_file_list('v36', top) 218 | file_list_2 = get_file_list('v43', top) 219 | file_list_3 = get_file_list('v72', top) 220 | file_list_4 = get_file_list('v73', top) 221 | 222 | 223 | file_list = file_list_1 + file_list_2 + file_list_3 + file_list_4 224 | logger.info(f'File List:{file_list}') 225 | 226 | res = get_feature_oof(file_list, weight, base_train) 227 | 228 | train = res.loc[res.label != '0'] 229 | score_list = accuracy(train) 230 | total = score_list[1] 231 | 232 | res.to_csv(f'./output/{version}_bt_{base_train}_ex_change_file_top{top}_w{weight}_{int(total * 10 ** 6):06}.csv') 233 | file_name = f'{version}_{len(file_list):02}_bt_{base_train}_mean_top{top}_{int(weight * 100):03}_{int(total * 10 ** 6):06}.csv' 234 | res = gen_sub_file(res.loc[res.label == '0'], file_name) 235 | # logger.info(f'Sub file save to:{file_name}') 236 | 237 | 238 | if __name__== '__main__': 239 | FUNCTION_MAP = {'main': main, } 240 | 241 | args = get_args() 242 | 243 | func = FUNCTION_MAP[args.command] 244 | func() 245 | 246 | 247 | 248 | """ 249 | nohup python -u ./core/ensemble.py main >> ensemble.log 2>&1 & 250 | """ 251 | 252 | 253 | 254 | -------------------------------------------------------------------------------- /core/ensemble_new.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from core.conf import * 4 | from core.feature import * 5 | 6 | static_list = [ 7 | # './output/stacking/v6_0_0.804024_6006_10605_b1_e1_m50.h5', 8 | # './output/stacking/v6_0_0.804532_6006_10605_b2_e1_m50.h5', 9 | # 
'./output/stacking/v6_1_0.789411_5918_10451_b2_e1_m50.h5', 10 | # './output/stacking/v6_1_0.792143_5918_10451_b1_e1_m50.h5', 11 | # './output/stacking/v6_2_0.790876_6237_11068_b1_e1_m50.h5', 12 | # './output/stacking/v6_2_0.791542_6237_11068_b1_e1_m50.h5', 13 | # './output/stacking/v6_3_0.799421_5996_10562_b1_e1_m50.h5', 14 | # './output/stacking/v6_3_0.801635_5996_10562_b1_e1_m50.h5', 15 | # './output/stacking/v6_4_0.765271_6977_12388_b4_e1_m20.h5', 16 | # './output/stacking/v6_4_0.766215_6977_06977_b0_e1_m20.h5', 17 | ] 18 | @lru_cache() 19 | def get_top_file(fold,version): 20 | from glob import glob 21 | file_list = sorted(glob(f'./output/stacking/{version}_{fold}_*.h5'), reverse=True) 22 | 23 | if static_list: 24 | file_list = [ file for file in file_list if file in static_list] 25 | return file_list 26 | 27 | @lru_cache() 28 | def get_file_list(version, top=2,): 29 | file_list = [] 30 | for fold in range(5): 31 | tmp = get_top_file(fold, version) 32 | if len(tmp) < top: 33 | logger.warning(f'At least need {top} files for fold:{fold}') 34 | file_list = file_list + tmp[:top] 35 | return tuple(file_list) 36 | 37 | @lru_cache() 38 | @timed() 39 | def get_feature_oof(file_list, weight=1,base_train=True): 40 | 41 | train_list = [] 42 | test_list = [] 43 | 44 | for file in tqdm(file_list,f'gen oof from {len(file_list)} files'): 45 | cur_weight = weight if weight > 0 else get_best_weight(file, base_train=base_train) 46 | 47 | #Train begin 48 | tmp = pd.read_hdf(file, 'train') 49 | col_list = tmp.columns[:num_classes] 50 | tmp['app_id'] = tmp.index.str[:32].values 51 | tmp['bin'] = tmp.index.str[-1].values.astype(int) 52 | tmp = tmp.sort_values(['app_id', 'bin', 'label']) 53 | tmp = tmp.drop_duplicates(['app_id', 'bin']) 54 | 55 | tmp.loc[tmp.bin == 0, col_list] = tmp.loc[tmp.bin == 0, col_list] * cur_weight 56 | tmp.loc[tmp.bin == 1, col_list] = tmp.loc[tmp.bin == 1, col_list] * (1 - cur_weight) 57 | tmp.label = tmp.label.astype(int) 58 | tmp = tmp.loc[tmp.bin.isin([0, 1])].groupby('app_id').mean() 59 | 60 | train_list.append(tmp) 61 | 62 | #Test begin 63 | tmp = pd.read_hdf(file, 'test') 64 | tmp['app_id'] = tmp.index.str[:32].values 65 | tmp['bin'] = tmp.index.str[-1].values.astype(int) 66 | tmp.loc[tmp.bin == 0, col_list] = tmp.loc[tmp.bin == 0, col_list] * cur_weight 67 | tmp.loc[tmp.bin == 1, col_list] = tmp.loc[tmp.bin == 1, col_list] * (1 - cur_weight) 68 | tmp = tmp.loc[tmp.bin.isin([0, 1])].groupby('app_id').mean() 69 | test_list.append(tmp) 70 | 71 | train = pd.concat(train_list) 72 | 73 | test = pd.concat(test_list) 74 | 75 | oof = pd.concat([train, test]) 76 | print('oof, before=', oof.shape) 77 | oof = oof.groupby(oof.index).mean() 78 | print('oof, after=', oof.shape) 79 | del oof['bin'] 80 | oof.label = oof.label.fillna(0).astype(int).astype(str) 81 | return oof 82 | 83 | @timed() 84 | def gen_sub_file(res, file_name, topn=2): 85 | res = res.copy() 86 | res_raw = res.copy() 87 | 88 | for i in tqdm(range(1, 1+topn), desc=f'Cal label#1-{topn} value for res:{res.shape}'): 89 | res.loc[:, f'label{i}'] = res.iloc[:, :num_classes].idxmax(axis=1) 90 | res_raw.loc[:, f'label{i}'] = res.loc[:, f'label{i}'] 91 | 92 | for index, col in res[f'label{i}'].items(): 93 | res.loc[index, col] = np.nan 94 | 95 | 96 | if file_name: 97 | from spider.mi import get_train_ph2_index 98 | train_ph2 = get_train_ph2_index() 99 | 100 | res_bk = res.copy().loc[~res.index.str[6:].isin(train_ph2.id.str[6:].values)] 101 | for res in [res, res_bk]: 102 | res.index.name = 'id' 103 | sub_file = 
f'./output/sub/{len(res)}_{file_name}' 104 | res[['label1', 'label2']].to_csv(sub_file) 105 | logger.info(f'Sub file save to :{sub_file}') 106 | 107 | return res_raw 108 | 109 | 110 | 111 | @timed() 112 | def get_best_weight(file, base_train): 113 | import pandas as pd 114 | if base_train: 115 | df = pd.read_hdf(file, 'train') 116 | else: 117 | df = pd.read_hdf(file, 'test') 118 | 119 | from spider.mi import get_train_ph2_index 120 | ph2_train = get_train_ph2_index() 121 | ph2_train = ph2_train.set_index('id') 122 | df = df.loc[pd.Series(df.index).str[:32].isin(ph2_train.index).values] 123 | df['label'] = ph2_train.loc[pd.Series(df.index).str[:32]].type_id.str[:6].values.astype(int) 124 | 125 | 126 | df['bin'] = df.index.str[-1].astype(int) 127 | 128 | col_list = df.columns[:num_classes] 129 | #print(col_list) 130 | df['bin'] = df.index.str[-1].astype(int) 131 | df['app_id'] = df.index.str[:32] 132 | 133 | if len(df.loc[df.bin==1]) ==0 : 134 | return 1 135 | 136 | print(df.bin.value_counts()) 137 | df = df.sort_values(['app_id', 'bin', 'label']) 138 | df = df.drop_duplicates(['app_id', 'bin']) 139 | 140 | score ={} 141 | 142 | for weight in tqdm(np.arange(0.7, 1.01, 0.05), desc=f'Cal best for {file}'): 143 | weight = round(weight, 2) 144 | tmp = df.copy() 145 | # print(tmp.label.head(3)) 146 | tmp.loc[tmp.bin == 0, col_list] = tmp.loc[tmp.bin == 0, col_list] * weight 147 | tmp.loc[tmp.bin == 1, col_list] = tmp.loc[tmp.bin == 1, col_list] * (1 - weight) 148 | 149 | # tmp = tmp.loc[tmp.bin==0] 150 | tmp = tmp.loc[tmp.bin.isin([0, 1])] 151 | #print(tmp.bin.value_counts()) 152 | tmp = tmp.groupby('app_id').mean() 153 | 154 | # print(tmp.shape) 155 | # print(tmp.label.head(3)) 156 | tmp.label = tmp.label.astype(int) 157 | # print(tmp.shape) 158 | score_list = accuracy(tmp) 159 | logger.info(f'weight:{weight}, score_list:{score_list}. 
base_train:{base_train}, File:{file}') 160 | total = score_list[1] 161 | score[weight] = total 162 | 163 | logger.info(f'Score list for file:{file}\n{score}') 164 | 165 | base_score = list(score.values())[-1] 166 | 167 | score = sorted(score.items(), key=lambda kv: kv[1]) 168 | best_score = score[-1][-1] 169 | best_weight = score[-1][0] 170 | grow = best_score-base_score 171 | 172 | logger.info(f'====best_weight:{best_weight:3.2}, best_score:{best_score:6.5f}/{grow:6.5f},base_train:{base_train},File:{file}') 173 | return best_weight 174 | 175 | 176 | def compare(file='./output/sub/80000_v36_07_bt_True_mean_top2_000_865700.csv'): 177 | df = pd.read_csv(file) 178 | 179 | from spider.mi import get_final_feature 180 | final = get_final_feature() 181 | 182 | df = pd.merge(final, df, how='left', on='id') 183 | 184 | def check(row): 185 | if len(str(row.type_id)) == 0: 186 | return None 187 | 188 | label_list = row.type_id.split('|') 189 | 190 | return str(row.label1) in label_list or str(row.label2) in label_list 191 | 192 | df['is_corr'] = df.apply(lambda row: check(row), axis=1) 193 | 194 | print(df.shape, '\n', df.is_corr.value_counts()) 195 | df = df.loc[df.is_corr == False] 196 | 197 | type_name = get_app_type() 198 | type_name = type_name.set_index('type_id') 199 | type_name.index = type_name.index.astype(str) 200 | 201 | df.label1 = df.label1.astype(str).replace(type_name.to_dict()['type_name']) 202 | df.label2 = df.label2.astype(str).replace(type_name.to_dict()['type_name']) 203 | df.type_id = df.type_id.astype(str).replace(type_name.to_dict()['type_name']) 204 | 205 | print(df['from'].value_counts()) 206 | 207 | return df 208 | 209 | @timed() 210 | @file_cache() 211 | def get_oof_version(version, top, weight): 212 | 213 | 214 | with timed_bolck(f'Cal sub for top:{top}, weight:{weight:3.2f}, version:{version}'): 215 | for base_train in [True]: 216 | file_list = get_file_list(version, top) 217 | 218 | # file_list = file_list_1 + file_list_2 + file_list_3 + file_list_4 219 | logger.info(f'File List {version} :{file_list}') 220 | 221 | res = get_feature_oof(file_list, weight, base_train) 222 | 223 | return res#, len(file_list) 224 | 225 | @timed() 226 | def main(): 227 | oof_list = [] 228 | file_cnt = 0 229 | top = 4 230 | weight = 0 231 | oof_weight_list = [] 232 | for version, w in zip (['v36', 'v43','v72','v73','v74','v75'], [0.7 ,0.7,0.8,1,1,1 ] ): 233 | #version = get_args().version 234 | 235 | res = get_oof_version(version, top, weight) 236 | #Align all the porbiblity to 1 237 | 238 | 239 | train = res.loc[res.label != '0'] 240 | # score_list = accuracy(train) 241 | # #oof_weight_list.append((score_list[1])) 242 | # logger.info(f'Score for train{train.shape}/{res.shape}:{version}:{score_list}') 243 | 244 | res.iloc[:, :-1] = w * res.iloc[:, :-1].apply(lambda row: row / row.sum(), axis=1) 245 | 246 | oof_list.append(res) 247 | 248 | 249 | oof = pd.concat(oof_list) 250 | 251 | oof.to_pickle(f'./output/{file_cnt:02}_{len(oof_list)}_tmp_res_val.pkl') 252 | 253 | label_raw = oof_list[0].label#.drop_duplicates() 254 | print('oof, final before=', oof.shape) 255 | oof = oof.groupby(oof.index).mean() 256 | #oof.iloc[:, :-1] = oof.iloc[:, :-1].apply(lambda row: row / row.sum(), axis=1) 257 | 258 | print('oof, final after=', oof.shape) 259 | oof['label'] = label_raw 260 | 261 | oof['label'] = oof['label'].fillna('0') 262 | 263 | res = oof 264 | train = res.loc[res.label != '0'] 265 | 266 | score_list = accuracy(train) 267 | total = score_list[1] 268 | logger.info(f'get the final score:{total} 
base on train:{train.shape}') 269 | 270 | ex_file = f'./output/{version}_bt_change_file_top{top}_w{weight}_{int(total * 10 ** 6):06}.csv' 271 | res.to_csv(ex_file) 272 | logger.info(f'Exchange file save to:{ex_file}') 273 | file_name = f'{version}_{file_cnt:02}_new_mean_top{top}_{int(weight * 100):03}_{int(total * 10 ** 6):06}.csv' 274 | res = gen_sub_file(res.loc[res.label == '0'], file_name) 275 | 276 | 277 | 278 | if __name__== '__main__': 279 | FUNCTION_MAP = {'main': main, } 280 | 281 | args = get_args() 282 | 283 | func = FUNCTION_MAP[args.command] 284 | func() 285 | 286 | 287 | 288 | """ 289 | nohup python -u ./core/ensemble_new.py main >> ensemble_final.log 2>&1 & 290 | """ 291 | 292 | 293 | 294 | -------------------------------------------------------------------------------- /core/mini.py: -------------------------------------------------------------------------------- 1 | 2 | from file_cache.utils.util_log import * 3 | from core.conf import * 4 | 5 | vector_size = 200 6 | 7 | def gen_mini_embedding(wv_from_text, word_list): 8 | from multiprocessing.dummy import Pool 9 | 10 | from functools import partial 11 | 12 | partition_num = 8 13 | import math 14 | partition_length = math.ceil(len(word_list)/partition_num) 15 | 16 | partition_list = [ word_list[i:i+partition_length] for i in range(0, len(word_list), partition_length )] 17 | logger.debug(f'The word list split to {len(partition_list)} partitions:{[ len(partition) for partition in partition_list]}') 18 | thread_pool = Pool(processes=partition_num) 19 | process = partial(gen_mini_partition,wv_from_text=wv_from_text ) 20 | 21 | wv_list = thread_pool.map(process, partition_list) 22 | thread_pool.close(); thread_pool.join() 23 | 24 | del wv_from_text 25 | 26 | return pd.concat(wv_list) 27 | 28 | 29 | def compute_ngrams(word, min_n, max_n): 30 | # BOW, EOW = ('<', '>') # Used by FastText to attach to all words as prefix and suffix 31 | extended_word = word 32 | ngrams = [] 33 | for ngram_length in range(min_n, min(len(extended_word), max_n) + 1): 34 | for i in range(0, len(extended_word) - ngram_length + 1): 35 | ngrams.append(extended_word[i:i + ngram_length]) 36 | res = list(set(ngrams)) 37 | return res 38 | 39 | def wordVec(word,wv_from_text:dict,min_n = 1, max_n = 3): 40 | ''' 41 | ngrams_single/ngrams_more,主要是为了当出现oov的情况下,最好先不考虑单字词向量 42 | ''' 43 | 44 | # 如果在词典之中,直接返回词向量 45 | if word in wv_from_text.index: 46 | return wv_from_text.loc[word] 47 | else: 48 | logger.warning(f'Cannot find this word directly:{word}') 49 | word_size = vector_size 50 | # 计算word的ngrams词组 51 | ngrams = compute_ngrams(word,min_n = min_n, max_n = max_n) 52 | # 不在词典的情况下 53 | word_vec = np.zeros(word_size, dtype=np.float32) 54 | ngrams_found = 0 55 | ngrams_single = [ng for ng in ngrams if len(ng) == 1] 56 | ngrams_more = [ng for ng in ngrams if len(ng) > 1] 57 | # 先只接受2个单词长度以上的词向量 58 | for ngram in ngrams_more: 59 | if ngram in wv_from_text.index: 60 | word_vec += wv_from_text.loc[ngram] 61 | ngrams_found += 1 62 | #print(ngram) 63 | # 如果,没有匹配到,那么最后是考虑单个词向量 64 | if ngrams_found == 0: 65 | for ngram in ngrams_single: 66 | if ngram in wv_from_text.index: 67 | word_vec += wv_from_text.loc[ngram] 68 | ngrams_found += 1 69 | elif ngram.lower() in wv_from_text.index: 70 | word_vec += wv_from_text.loc[ngram.lower()] 71 | ngrams_found += 1 72 | else: 73 | logger.warning(f'Can not find {ngram} in wv') 74 | if ngrams_found > 0: 75 | return word_vec / max(1, ngrams_found) 76 | else: 77 | logger.error('all ngrams for word "%s" absent from model' % word) 78 | return 
None 79 | 80 | @timed() 81 | def gen_mini_partition(word_set, wv_from_text): 82 | 83 | mini = pd.DataFrame(np.zeros((len(word_set), vector_size)), index=word_set, ) 84 | # for i in tqdm(range(len(word_set))): 85 | for i in range(len(word_set)): 86 | word = word_set[i] 87 | vector = wordVec(word, wv_from_text, 1, 3) 88 | if vector is not None: 89 | mini.loc[word] = vector 90 | else: 91 | logger.debug(f'Can not find vec for:{len(word)},{word}') 92 | mini.loc[word] = np.zeros(vector_size) 93 | 94 | return mini 95 | 96 | @timed() 97 | def gen_tx_mini(): 98 | #word2vec_tx, vector_size = './input/Tencent_AILab_ChineseEmbedding.txt', 200 99 | 100 | from core.feature import load_embedding, get_word_cnt 101 | 102 | embed = load_embedding(word2vec_tx, type='txt') 103 | word_list = get_word_cnt() 104 | logger.info(word_list[:5]) 105 | data = gen_mini_embedding(embed, word_list.word.values) 106 | 107 | logger.debug(f'The length of the vector is {data.shape}') 108 | 109 | fname = "./input/mini_tx.kv" 110 | np.savetxt(fname, data.reset_index().values, 111 | delimiter=" ", 112 | header="{} {}".format(len(data), len(data.columns)), 113 | comments="", 114 | fmt=["%s"] + ["%.6f"] * len(data.columns)) 115 | 116 | logger.info(f'Mini dict save to {fname}') 117 | 118 | if __name__ == '__main__': 119 | from fire import Fire 120 | Fire() 121 | 122 | """ 123 | nohup python -u core/mini.py gen_tx_mini > mini.log 2>&1 & 124 | """ -------------------------------------------------------------------------------- /core/split.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import StratifiedKFold 2 | 3 | from core.feature import * 4 | 5 | 6 | def get_split_group(random_state=2019): 7 | apptype_train = pd.read_csv(f'{input_dir}/apptype_train.dat', sep='\t', 8 | names=['app_id', 'type_id', 'app_des'], 9 | quoting=3, 10 | ) 11 | 12 | apptype_train = apptype_train.sort_values('app_id') 13 | folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state) 14 | 15 | gp_list = list(folds.split(apptype_train, apptype_train.type_id.astype('category').cat.codes)) 16 | 17 | train_list = [apptype_train.iloc[gp, 0].values for gp, _ in gp_list] 18 | 19 | val_list = [apptype_train.iloc[gp, 0].values for _, gp in gp_list] 20 | 21 | return train_list, val_list 22 | 23 | 24 | @timed() 25 | def split_df_by_index_no_bin(df, fold): 26 | # 27 | # sn = pd.Series(df.index).str[-1].astype(int) 28 | df = pd.Series(df.index).str[:32] 29 | 30 | 31 | 32 | train_list, val_list = get_split_group() 33 | train_gp = train_list[fold] 34 | val_gp = val_list[fold] 35 | 36 | return df.loc[(df.isin(train_gp))].index.values, \ 37 | df.loc[(df.isin(val_gp)) ].index.values 38 | 39 | 40 | def split_df_by_index(df, fold): 41 | index = df.index 42 | app_id = pd.Series(index).apply(lambda val: val.split('_')[0]) 43 | bin = pd.Series(index).apply(lambda val: val.split('_')[-1]).astype(int) 44 | df = pd.concat([app_id,bin], axis=1) 45 | df.columns = ['app_id', 'bin'] 46 | 47 | #print(df.shape, df.head) 48 | train_list, val_list = get_split_group() 49 | train_gp = train_list[fold] 50 | val_gp = val_list[fold] 51 | 52 | train_bin = list(range(get_args().max_bin+1)) 53 | 54 | val_bin= train_bin #[0,1] 55 | 56 | logger.info(f'split base on: train_bin:{train_bin}, val_bin:{val_bin}') 57 | logger.info(f'The original bin_id distribution in train data set:\n {df.loc[(df.app_id.isin(train_bin))].bin.value_counts()}') 58 | 59 | logger.info(f'The original bin_id distribution in val data set:\n{ 
df.loc[(df.app_id.isin(val_gp))].bin.value_counts() } ') 60 | 61 | return df.loc[(df.app_id.isin(train_gp)) & (df.bin.isin(train_bin))].index.values, \ 62 | df.loc[(df.app_id.isin(val_gp)) & (df.bin.isin(val_bin))].index.values 63 | 64 | 65 | if __name__ == '__main__': 66 | for random in range(2019, 2099): 67 | train_list, val_list = get_split_group(random) 68 | gp = [len(val) for val in val_list] 69 | print(np.array(gp).std(), gp, random) 70 | -------------------------------------------------------------------------------- /core/xlnet.py: -------------------------------------------------------------------------------- 1 | 2 | from multiprocessing import Process 3 | 4 | 5 | 6 | 7 | 8 | from core.feature_xlnet import * 9 | from core.conf import * 10 | 11 | import os 12 | 13 | 14 | os.environ['TF_KERAS'] = '1' 15 | 16 | oof_prefix = get_args().version 17 | SEQ_LEN = get_args().seq_len #randrange(128, 180) #-randrange(0, 5)*8 18 | BATCH_SIZE = get_args().batch_size 19 | 20 | #Batch size, MAX_len+ex_length, Manual, Manual GP feature cnt, frac 21 | @lru_cache() 22 | @timed() 23 | def get_train_test_bert(): 24 | 25 | frac = get_args().frac 26 | max_bin = get_args().max_bin 27 | min_len = int(SEQ_LEN*get_args().min_len_ratio) 28 | 29 | data = get_feature_xlnet(SEQ_LEN) 30 | 31 | #Keep all the bin group, if it's test data 32 | data = data.loc[(data.bin<=max_bin) | (pd.isna(data.type_id))] 33 | 34 | with timed_bolck(f'Remove gan data, and len is less then {min_len}'): 35 | data = data.loc[ (data.bin == 0) | (data['len_'] >= min_len) ] 36 | logger.info(f'Train max_bin:{max_bin},Total Bin distribution:\n{data.bin.value_counts().sort_index()}') 37 | 38 | data = data.sort_index() 39 | logger.info(f'Head of the data:\n, {data.iloc[:3,:3]}') 40 | 41 | train_data = data.loc[pd.notna(data.type_id)].sample(frac=frac, random_state=2019) 42 | labels = train_data.type_id.values.tolist() 43 | logger.info(f'Train Bin distribution:\n{train_data.bin.value_counts().sort_index()}') 44 | 45 | test_data = data.loc[pd.isna(data.type_id)].sample(frac=1, random_state=2019) 46 | 47 | trial = get_args().trial 48 | logger.info(f'Test Bin distribution#{trial}:\n{test_data.bin.value_counts().sort_index()}') 49 | 50 | if trial > 0: 51 | test_data = test_data.loc[test_data.index.str[-1]=='0'] 52 | 53 | 54 | logger.info(f'Train:{train_data.shape} Test#{trial}:{test_data.shape}, frac:{frac}') 55 | 56 | feature_col = [col for col in data.columns if col.startswith('fea_') or col.startswith('bert_')] 57 | 58 | label2id, id2label = get_label_id() 59 | #word2id = get_word2id() 60 | 61 | # Encode input words and labels 62 | X = train_data.loc[:, feature_col] 63 | Y = [label2id[label] for label in labels] 64 | 65 | 66 | X_test = test_data.loc[:, feature_col] 67 | 68 | 69 | return X, pd.Series(Y, index=train_data.index), X_test 70 | 71 | 72 | # X, y, X_test = get_train_test_bert(0.1) 73 | # 74 | # 75 | # train_x, train_y = load_data(train_path) 76 | # test_x, test_y = load_data(test_path) 77 | 78 | def boost_train(boost=10): 79 | for _ in range(boost): 80 | p = Process(target=train_base) 81 | p.start() 82 | p.join() 83 | 84 | 85 | @timed() 86 | def filter_short_desc(X, y): 87 | X = X.copy().reset_index() 88 | bert_cols = [col for col in X.columns if str(col).startswith('bert_')] 89 | bert = X.loc[:, bert_cols] 90 | bert_len = bert.where(bert > 0).count(axis=1) 91 | old_len = len(bert_len) 92 | min_len = int(SEQ_LEN*get_args().min_len_ratio) 93 | bert_len = bert_len.loc[bert_len >= min_len] 94 | logger.info(f'Filter {old_len - 
len(bert_len)} records from {old_len} by threshold {min_len}') 95 | 96 | return X.iloc[bert_len.index], y[bert_len.index] 97 | 98 | 99 | @timed() 100 | def train_base(): 101 | args = get_args() 102 | #frac = args.frac 103 | fold = args.fold 104 | EPOCHS = args.epochs 105 | 106 | 107 | LR = 2e-5 108 | 109 | BATCH_SIZE = get_args().batch_size 110 | with timed_bolck(f'Prepare train data#{BATCH_SIZE}, LR:{LR}'): 111 | X, y, _ = get_train_test_bert() 112 | 113 | ##Begin to define model 114 | from keras_bert import load_trained_model_from_checkpoint 115 | 116 | 117 | from keras_xlnet.backend import keras 118 | from keras_bert.layers import Extract 119 | from keras_xlnet import PretrainedList, get_pretrained_paths 120 | from keras_xlnet import Tokenizer, load_trained_model_from_checkpoint, ATTENTION_TYPE_BI 121 | 122 | checkpoint_path = xlnet_path 123 | logger.info(f'Start to train base on checkpoint:{checkpoint_path}') 124 | 125 | # EPOCH = 10 126 | # BATCH_SIZE = 64 127 | # SEQ_LEN = 50 128 | # MODEL_NAME = 'SST-2.h5' 129 | # 130 | # current_path = os.path.dirname(os.path.abspath(__file__)) 131 | # train_path = os.path.join(current_path, 'train.tsv') 132 | # dev_path = os.path.join(current_path, 'dev.tsv') 133 | 134 | # Load pretrained model 135 | 136 | #vocab_path = os.path.join(checkpoint_path, 'spiece.model') 137 | config_path = os.path.join(checkpoint_path, 'xlnet_config.json') 138 | model_path = os.path.join(checkpoint_path, 'xlnet_model.ckpt') 139 | 140 | #tokenizer = Tokenizer(paths.vocab) 141 | model = load_trained_model_from_checkpoint( 142 | config_path=config_path, 143 | checkpoint_path=model_path, 144 | batch_size=BATCH_SIZE, 145 | memory_len=0, 146 | target_len=SEQ_LEN, 147 | in_train_phase=False, 148 | attention_type=ATTENTION_TYPE_BI, 149 | ) 150 | 151 | # Build classification model 152 | last = Extract(index=-1, name='Extract')(model.output) 153 | dense = keras.layers.Dense(units=768, activation='tanh', name='Dense')(last) 154 | dropout = keras.layers.Dropout(rate=0.1, name='Dropout')(dense) 155 | output = keras.layers.Dense(units=num_classes, activation='softmax', name='Softmax')(dropout) 156 | model = keras.models.Model(inputs=model.inputs, outputs=output) 157 | model.summary() 158 | 159 | model.compile( 160 | optimizer=keras.optimizers.Adam(lr=LR), 161 | loss='categorical_crossentropy', 162 | metrics=['accuracy'], 163 | ) 164 | 165 | ##End to define model 166 | 167 | input1_col = [col for col in X.columns if str(col).startswith('bert_')] 168 | #input2_col = [col for col in X.columns if str(col).startswith('fea_')] 169 | #max_words = len(input1_col) 170 | model #= get_model(max_words) 171 | 172 | #get_feature_manual.cache_clear() 173 | Y_cat = keras.utils.to_categorical(y, num_classes=num_classes) 174 | #folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019) 175 | 176 | with timed_bolck(f'Training#{fold}'): 177 | from core.split import split_df_by_index 178 | train_idx, test_idx = split_df_by_index(X,fold) 179 | 180 | trunc_len_tran = 64*(len(train_idx)//64) 181 | trunc_len_val = 64 * (len(test_idx) // 64) 182 | 183 | train_idx = train_idx[:trunc_len_tran] 184 | test_idx = test_idx[:trunc_len_val] 185 | 186 | logger.info(f'Shape train_x.loc[:, input1_col].iloc[:,0]: {X.loc[:, input1_col].iloc[:,0].shape}') 187 | train_x, train_y, val_x, val_y = \ 188 | X.iloc[train_idx], Y_cat[train_idx], X.iloc[test_idx], Y_cat[test_idx] 189 | 190 | logger.info(f'get_train_test output: train_x:{train_x.shape}, train_y:{train_y.shape}, val_x:{val_x.shape} ') 191 | 192 | #train_x, 
train_y = filter_short_desc(train_x, train_y) 193 | 194 | input1 = train_x.loc[:, input1_col]#.astype(np.float32) 195 | input2 = np.zeros_like(input1)#.astype(np.int8) 196 | input3 = np.zeros_like(input1.iloc[:, :1]) 197 | 198 | val1 = val_x.loc[:, input1_col] 199 | val2 = np.zeros_like(val1) 200 | val3 = np.zeros_like(val1.iloc[:,:1]) 201 | 202 | logger.info(f'NN train_x:{train_x[:3]}') 203 | min_len_ratio = get_args().min_len_ratio 204 | max_bin = get_args().max_bin 205 | logger.info(f'NN Input1:{input1.shape}, Input2:{input2.shape}, SEQ_LEN:{SEQ_LEN}, min_len_ratio:{min_len_ratio}, bin:{max_bin} ') 206 | 207 | from keras_bert import get_custom_objects 208 | from tensorflow.python.keras.callbacks import EarlyStopping 209 | import tensorflow as tf 210 | 211 | es = EarlyStopping(monitor='val_acc',patience=2, verbose=1) 212 | with tf.keras.utils.custom_object_scope(get_custom_objects()): 213 | his = model.fit([input1, input2, input3], train_y, 214 | validation_data = ([val1, val2, val3 ], 215 | val_y), 216 | epochs=EPOCHS, shuffle=True, batch_size=BATCH_SIZE, 217 | callbacks=[Cal_acc( val_x, y.iloc[test_idx] ), es] 218 | #steps_per_epoch=1000, validation_steps=10 219 | ) 220 | 221 | 222 | 223 | #gen_sub(model, X_test, sn) 224 | 225 | return his 226 | 227 | from tensorflow.python.keras.callbacks import Callback 228 | 229 | 230 | class Cal_acc(Callback): 231 | 232 | def __init__(self, val_x, y): 233 | super(Cal_acc, self).__init__() 234 | self.val_x , self.y = val_x, y 235 | self.min_len = int(SEQ_LEN*get_args().min_len_ratio) 236 | self.max_bin = get_args().max_bin 237 | self.fold = get_args().fold 238 | self.threshold = 0 239 | self.feature_len = self.val_x.shape[1] 240 | self.cur_epoch = 0 241 | self.version = get_args().version 242 | self.trial = get_args().trial 243 | 244 | self.max_score = 0 245 | 246 | self.score_list = np.zeros(get_args().epochs) 247 | self.gen_file = False 248 | 249 | import time, os 250 | self.batch_id = round(time.time()) 251 | self.model_folder = f'./output/model/{self.batch_id}/' 252 | 253 | os.makedirs(self.model_folder) 254 | 255 | 256 | #logger.info(f'Cal_acc base on X:{self.X.shape}, Y:{self.y.shape}') 257 | 258 | #@timed() 259 | def cal_acc(self): 260 | input1_col = [col for col in self.val_x.columns if str(col).startswith('bert_')] 261 | #input2_col = [col for col in self.val_x.columns if str(col).startswith('fea_')] 262 | #model = self.model 263 | #tmp_val = 264 | tmp_y = self.y 265 | 266 | input1 = self.val_x.loc[:,input1_col] # .astype(np.float32) 267 | input2 = np.zeros_like(input1) # .astype(np.int8) 268 | input3 = np.zeros_like(input1.iloc[:, :1]) 269 | 270 | val = self.model.predict([input1, input2, input3]) 271 | 272 | label2id, id2label = get_label_id() 273 | val = pd.DataFrame(val, columns=label2id.keys(), index=input1.index) 274 | val['label'] = tmp_y.astype(int).replace(id2label).astype(int) 275 | val['bin'] = pd.Series(val.index).str[-1].values.astype(int) 276 | #logger.info(f'Head val#label:\n{val.label.head()}') 277 | res_val = val.copy() 278 | # res_val.to_pickle(f'./output/tmp_res_val.pkl') 279 | # logger.info(f'Debug file: save to ./output/tmp_res_val.pkl') 280 | 281 | num_labels = 10 282 | df_score = val.loc[val.bin==0] 283 | score_list = accuracy(df_score, num_labels, f'no{self.cur_epoch},b{self.max_bin},{self.version}') 284 | 285 | logger.info(f'{len(df_score)}/{len(res_val)}, fold:{self.fold}, score for label1-f{num_labels}:{score_list}') 286 | 287 | return score_list,res_val 288 | 289 | @timed() 290 | def cal_acc_ex(self): 291 | 
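# Editor's note (hedged, not part of the original source): cal_acc_ex appears to mirror
# cal_acc, but evaluates the model on the externally crawled phase-2 rows instead of the
# held-out fold. get_tmp_val_test() keeps test rows whose index starts with the given
# prefix ('val' only, when trial == 0) and whose bin suffix is '0', masks the first six
# characters of the id, and recovers labels via spider.mi.get_train_ph2_index(); the same
# accuracy() helper as in cal_acc is then applied to the bin-0 predictions.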
input1_col = [col for col in self.val_x.columns if str(col).startswith('bert_')] 292 | 293 | if self.trial==0: 294 | check_type_list =['val'] 295 | for type_ in tqdm(check_type_list,desc='cal_acc_ex'): 296 | tmp_val ,tmp_y = self.get_tmp_val_test(type_) 297 | #tmp_val = tmp_val.loc[:, input1_col] 298 | 299 | input1 = tmp_val.loc[:, input1_col] # .astype(np.float32) 300 | input2 = np.zeros_like(input1) # .astype(np.int8) 301 | input3 = np.zeros_like(input1.iloc[:, :1]) 302 | logger.info(f'{input1.shape},{input2.shape},{input3.shape}') 303 | logger.info(input1[:3]) 304 | # logger.info(input2[:3]) 305 | # logger.info(input3[:3]) 306 | val = self.model.predict([input1,input2,input3]) 307 | 308 | label2id, id2label = get_label_id() 309 | val = pd.DataFrame(val, columns=label2id.keys(), index=tmp_val.index) 310 | val['label'] = tmp_y.astype(int).replace(id2label).astype(int) 311 | val['bin'] = pd.Series(val.index).str[-1].values.astype(int) 312 | # logger.info(f'Head val#label:\n{val.label.head()}') 313 | res_val = val.copy() 314 | # res_val.to_pickle(f'./output/tmp_res_val.pkl') 315 | # logger.info(f'Debug file: save to ./output/tmp_res_val.pkl') 316 | 317 | num_labels = 10 318 | df_score = val.loc[val.bin == 0] 319 | score_list = accuracy(df_score, num_labels, f'ex{self.cur_epoch},{self.version},b{self.max_bin},{type_}') 320 | 321 | logger.info(f'===cal_acc_ex{self.cur_epoch}:{type_}==={len(df_score)}/{len(res_val)}, fold:{self.fold}, score for label1-f{num_labels}:{score_list}') 322 | 323 | return score_list, res_val 324 | 325 | 326 | @lru_cache() 327 | @timed() 328 | def get_tmp_val_test(self, type_): 329 | _, _, test_all = get_train_test_bert() 330 | 331 | test = test_all.loc[pd.Series(test_all.index).str.startswith(type_).values] 332 | 333 | test = test.loc[(pd.Series(test.index).str[-1]=='0').values] 334 | 335 | logger.info(f'Split {type_}, {len(test)} rows from {len(test_all)}') 336 | 337 | test=test.copy() 338 | type_ = 'x'*6 + pd.Series(test.index).str[:6] 339 | test.index = 'x'*6 + pd.Series(test.index).str[6:] 340 | 341 | from spider.mi import get_train_ph2_index 342 | train_ph2 = get_train_ph2_index() 343 | #final = final.loc[final.type_id.str.len() >= 1] 344 | train_ph2.index = 'x'*6 + train_ph2['id'].str[6:] 345 | #Align label with input test 346 | index_old = test.index.copy() 347 | test.index = pd.Series(test.index).apply(lambda val: val[:32]) 348 | 349 | label = train_ph2.type_id.loc[test.index.values].str[:6] #type_id len is 6 350 | 351 | #Rollback index change 352 | test.index = index_old 353 | label.index = index_old 354 | 355 | test = test.loc[pd.notna(label).values] 356 | label = label.dropna() 357 | print('test, label, type_', test.shape, label.shape, type_.shape) 358 | return test, label#, type_ 359 | 360 | 361 | def on_train_end(self, logs=None): 362 | grow= max(self.score_list) - self.threshold 363 | cut_ratio = get_args().cut_ratio 364 | logger.info(f'Train END: Fold:{self.fold}, max:{max(self.score_list):7.6f}/{grow:+6.5f}, at {np.argmax(self.score_list)}/{len(self.score_list)-1}, his:{self.score_list}, max_bin:{self.max_bin}, cut:{cut_ratio}, min_len:{self.min_len:03}, SEQ_LEN:{SEQ_LEN:03}, threshold:{self.threshold:7.6f}, gen_file:{self.gen_file}') 365 | logger.info(f'Input args:{get_args()}') 366 | 367 | def on_epoch_end(self, epoch, logs=None): 368 | self.cur_epoch = epoch 369 | print('\n') 370 | _, _ = self.cal_acc_ex() 371 | 372 | if self.trial > 0: 373 | return 0 374 | else: 375 | score_list, val = self.cal_acc() 376 | total = score_list[1] 377 | 378 | 
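# Editor's note (hedged, not part of the original source): the block below gates OOF
# generation. The current epoch's score is compared against the 2nd-best score already
# saved for this fold (parsed from the stacking file names by _get_top_score); a new
# stacking .h5 (train OOF + test predictions) is written only when the epoch beats that
# threshold, beats the best score seen so far in this run, and clears the 0.83 floor
# (with an additional early-epoch guard), so only improved epochs produce artifacts.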
self.score_list[epoch] = round(total, 6) 379 | #threshold_map = {0:0.785, 1:0.77, 2:0.77, 3:0.77, 4:0.78} 380 | top_cnt =2 381 | top_score = self._get_top_score(self.fold)[:top_cnt] 382 | self.threshold = top_score[-1] if len(top_score) == top_cnt else 0 383 | logger.info(f'The top#{top_cnt} score for max_bin:{get_args().max_bin}, epoch:{epoch}, oof:{oof_prefix}, fold#{self.fold} is:{top_score}, cur_score:{total}, threshold:{self.threshold}') 384 | if ( round(total,4) > round(self.threshold,4) 385 | and (epoch>=3 or self.threshold > 0 or total>0.83 ) 386 | and total > max(self.max_score, 0.83) 387 | ) : 388 | #logger.info(f'Try to gen sub file for local score:{total}, and save to:{model_path}') 389 | self.gen_file=True 390 | grow = max(self.score_list) - self.threshold 391 | logger.info(f'Fold:{self.fold}, epoch:{epoch}, MAX:{max(self.score_list):7.6f}/{grow:+6.5f}, threshold:{self.threshold}, score_list:{self.score_list}' ) 392 | test = self.gen_sub(self.model, f'{self.feature_len}_{total:7.6f}_{epoch}_f{self.fold}') 393 | len_raw_val = len(val.loc[val.bin == 0]) 394 | min_len_ratio = get_args().min_len_ratio 395 | oof_file = f'./output/stacking/{oof_prefix}_{self.fold}_{total:7.6f}_{len_raw_val}_{len(val):05}_b{get_args().max_bin}_e{epoch}_{self.batch_id}_m{min_len_ratio:2.1f}_L{SEQ_LEN:03}_XL.h5' 396 | self.save_stack_feature(val, test, oof_file) 397 | else: 398 | logger.info(f'Epoch:{epoch}, only gen sub file if the local score >{self.threshold}, current score:{total}, threshold:{self.threshold}, max_score:{self.max_score}') 399 | 400 | self.max_score = max(self.max_score, total) 401 | 402 | logger.info(f'Epoch#{epoch} END,max_bin:{get_args().max_bin}, oof:{oof_prefix}, max:{self.max_score:6.5f}, score:{score_list}, Fold:{self.fold},') 403 | 404 | print('\n') 405 | 406 | return round(total, 5) 407 | 408 | @staticmethod 409 | @timed() 410 | def save_stack_feature(train: pd.DataFrame, test: pd.DataFrame, file_path): 411 | train.bin = train.bin.astype(int) 412 | test.bin = test.bin.astype(int) 413 | train.to_hdf(file_path, 'train', mode='a') 414 | test.to_hdf(file_path, 'test', mode='a') 415 | logger.info(f'OOF file save to :{file_path}') 416 | return train, test 417 | 418 | 419 | @timed() 420 | #./output/model/1562899782/model_6114_0.65403_2.h5 421 | def gen_sub(self, model , info='bert_' , partition_len = 5000): 422 | 423 | #frac = get_args().frac 424 | _, _, test = get_train_test_bert() 425 | 426 | label2id, id2label = get_label_id() 427 | input1_col = [col for col in test.columns if str(col).startswith('bert_')] 428 | input3_col = [col for col in test.columns if str(col).startswith('fea_')] 429 | 430 | logger.info(f'Input input1_col:{len(input1_col)}, input3_col:{len(input3_col)}') 431 | res_list = [] 432 | for sn in tqdm(range(1+ len(test)//partition_len), desc=f'{info}:sub:total:{len(test)},partition_len:{partition_len}'): 433 | tmp = test.iloc[sn*partition_len: (sn+1)*partition_len] 434 | #print('\nbegin tmp\n', tmp.iloc[:3,:3].head()) 435 | input1 = tmp.loc[:,input1_col] 436 | input2 = np.zeros_like(input1) # .astype(np.int8) 437 | input3 = np.zeros_like(input1.iloc[:, :1]) 438 | res = model.predict([ input1, input2, input3]) 439 | res = pd.DataFrame(res, columns=label2id.keys(), index=tmp.index) 440 | #print('\nend tmp\n', res.iloc[:3, :3].head()) 441 | res_list.append(res) 442 | 443 | res = pd.concat(res_list) 444 | res['bin'] = res.index.str[-1].values.astype(int) 445 | raw_predict = res.copy() 446 | 447 | with timed_bolck(f'Try to gen sub file for fold#{self.fold}'): 448 | 
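# Editor's note (hedged, not part of the original source): gen_sub aggregates the
# per-bin predictions in two ways before picking labels -- 'single' keeps only the
# bin-0 (original text) rows, while 'mean' averages bin 0 with bin 1 for ids whose
# max bin id is 3 (descriptions long enough to be split into several chunks).
# label1/label2 are then the top-2 softmax classes per id; the raw per-bin frame
# (raw_predict) is what is actually returned and stored in the stacking file.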
#print('\nafter concat\n', res.iloc[:3, :3].head()) 449 | res['id'] = res.index 450 | res.index.name = 'id' 451 | # res.to_pickle(f'./output/tmp_sub.pkl') 452 | 453 | 454 | #print('\nend res\n', res.iloc[:3, :3].head()) 455 | 456 | 457 | 458 | res_mean = res.copy(deep=True) 459 | res_mean['id'] = res_mean.id.apply(lambda val: val.split('_')[0]) 460 | res_mean.index.name = 'index' 461 | res_select = res_mean.groupby('id')['bin'].agg({'bin_max': 'max'}) 462 | res_select.head() 463 | res_select = res_select.loc[res_select.bin_max == 3] 464 | res_mean = res_mean.loc[(res_mean.bin == 0) 465 | | ((res_mean.bin == 1) & (res_mean.id.isin(res_select.index))) 466 | ] 467 | logger.info(f'Try to cal avg for res_mean:\n{res_mean.bin.value_counts()}') 468 | res_mean_len = len(res_mean) 469 | res_mean = res_mean.groupby('id').mean().sort_index() 470 | del res_mean['bin'] 471 | 472 | 473 | res_0 = res.copy(deep=True) 474 | res_0 = res_0.loc[res_0.bin == 0] 475 | res_0.index = res_0.id.apply(lambda val: val.split('_')[0]) 476 | #print('\nres_0\n', res_0.loc[:, ['id', 'bin']].head(3)) 477 | res_0 = res_0.sort_index() 478 | res_0 = res_0.drop(columns=['id','bin'], axis=1, errors='ignore') 479 | 480 | for name, res in [('single',res_0), (f'mean_{res_mean_len}', res_mean)]: 481 | res = res.copy() 482 | #logger.info(f'{name} Check:\n{res.iloc[:3,:num_classes].sum(axis=1)}') 483 | 484 | res['label1'] = res.iloc[:, :num_classes].idxmax(axis=1) 485 | 486 | # Exclude top#1 487 | for index, col in res.label1.items(): 488 | res.loc[index, col] = np.nan 489 | 490 | res['label2'] = res.iloc[:, :num_classes].idxmax(axis=1) 491 | 492 | 493 | for col in ['label1','label2']: 494 | res[col] = res[col].replace(id2label) 495 | 496 | # info = info.replace('.','') 497 | # sub_file = f'./output/sub/v19_{info}_{name}.csv' 498 | # res[['label1', 'label2']].to_csv(sub_file) 499 | # logger.info(f'Sub file save to :{sub_file}') 500 | 501 | #logger.info(f'res_0 Check:\n{res_0.iloc[:3, :num_classes].sum(axis=1)}') 502 | 503 | return raw_predict #res.drop(columns=['id','bin'], axis=1, errors='ignore') 504 | 505 | @staticmethod 506 | def _get_top_score(fold): 507 | from glob import glob 508 | file_list = sorted(glob(f'./output/stacking/{oof_prefix}_{fold}_*.h5'), reverse=True) 509 | score_list = [float(file.split('_')[2].replace('.h5', '')) for file in file_list] 510 | logger.info(f'Score list for fold#{fold} is {score_list}') 511 | return score_list if score_list else [0] 512 | 513 | if __name__ == '__main__': 514 | FUNCTION_MAP = {'train_base': train_base, 515 | } 516 | 517 | args = get_args() 518 | 519 | func = FUNCTION_MAP[args.command] 520 | func() 521 | 522 | """ 523 | 524 | nohup python -u ./core/bert.py --frac=0.1 train_base > test.log 2>&1 & 525 | 526 | nohup python -u ./core/bert.py --fold=4 --max_bin=2 train_base > test_4.log 2>&1 & 527 | 528 | python -u ./core/bert.py --max_bin=2 train_base 529 | 530 | nohup python -u ./core/bert.py train_base > test.log 2>&1 & 531 | 532 | nohup python -u ./core/bert.py train_base > extend_bert_mean_bin_1.log 2>&1 & 533 | 534 | nohup python -u ./core/bert.py boost_train 10 >> boost_1.log 2>&1 & 535 | 536 | """ -------------------------------------------------------------------------------- /input/readme.txt: -------------------------------------------------------------------------------- 1 | 如果此文件夹是空,需要运行下列命令生成input文件 2 | 3 | # 爬取数据 4 | nohup python -u spider/mi.py bd > bd.log 2>&1& 5 | nohup python -u spider/mi.py wdj > wdj.log 2>&1& 6 | nohup python -u spider/mi.py xm > xm.log 2>&1& 7 | 
nohup python -u spider/mi.py 360 > 360.log 2>&1& 8 | 9 | nohup python -u spider/mi.py tx_pkg > tx_pkg.log 2>&1& 10 | nohup python -u spider/mi.py tx_name > tx_name.log 2>&1& 11 | 12 | nohup python -u spider/mi.py bdsj > bdsj.log 2>&1& 13 | 14 | 15 | # 生成input数据 16 | ./bin/clean.sh 17 | -------------------------------------------------------------------------------- /notebook/.ipynb_checkpoints/word_analysis_local-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "2019-07-04 22:42:21,041 util_log.py[128] INFO Start the program at:LALI2-M-G0MD, 127.0.0.1, with:Load module\n", 13 | "2019-07-04 22:42:21,045 util_pandas.py[19] WARNING \"No such keys(s): 'display.height'\"\n" 14 | ] 15 | }, 16 | { 17 | "name": "stdout", 18 | "output_type": "stream", 19 | "text": [ 20 | "yes\n", 21 | "/Users/lali2/Documents/workspace_py/xf_tag\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "import sys\n", 27 | "import os\n", 28 | "os.chdir('../')\n", 29 | "\n", 30 | "\n", 31 | "import pandas as pd\n", 32 | "import numpy as np\n", 33 | "\n", 34 | "from bokeh.palettes import Category10\n", 35 | "\n", 36 | "\n", 37 | "from tqdm import tqdm\n", 38 | "\n", 39 | "\n", 40 | "from file_cache.utils.util_pandas import *\n", 41 | "from file_cache.cache import file_cache\n", 42 | "from functools import lru_cache\n", 43 | "from glob import glob\n", 44 | "\n", 45 | "%matplotlib inline\n", 46 | "from core.conf import *\n", 47 | "!pwd" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 62, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "(94354, 2)\n" 60 | ] 61 | }, 62 | { 63 | "data": { 64 | "text/html": [ 65 | "
\n", 66 | "\n", 79 | "\n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | "
app_idapp_des
0BB29DA6F8167CFC99E0853741C4EB17B注意]游戏需要在设备上自己的歌曲注意]音乐赛车是一个音乐改编的赛车游戏,你用你自己的音乐比赛...
1BB2A78EA7AD4945EAF6E38997F6139A3定位试衣到家是一款基于地理位置,提供试衣到家专属购物体验的互联网平台。购物流程客户在试衣到家...
2BB2B1604CFA079C289FECF927DFBCE89想念一个人,就说出来。记得要下载安卓锁屏才可正常显示锁屏效果哦~更新内容更稳定、更优质,邀您...
3BB2C7BD0B0623644183DAD08A89E1D90闽通宝手机客户端是基于移动互联网的,以公众出行服务为基础,贯彻绿色出行,低碳生活的理念,为出...
4BB2E1A8F56158E483D7461E930E6332F风靡全球的DIY照片桌面,干净、流畅,启动提速100,瞬间提升手机性能;更是一亿用户的共同选...
\n", 115 | "
" 116 | ], 117 | "text/plain": [ 118 | " app_id \\\n", 119 | "0 BB29DA6F8167CFC99E0853741C4EB17B \n", 120 | "1 BB2A78EA7AD4945EAF6E38997F6139A3 \n", 121 | "2 BB2B1604CFA079C289FECF927DFBCE89 \n", 122 | "3 BB2C7BD0B0623644183DAD08A89E1D90 \n", 123 | "4 BB2E1A8F56158E483D7461E930E6332F \n", 124 | "\n", 125 | " app_des \n", 126 | "0 注意]游戏需要在设备上自己的歌曲注意]音乐赛车是一个音乐改编的赛车游戏,你用你自己的音乐比赛... \n", 127 | "1 定位试衣到家是一款基于地理位置,提供试衣到家专属购物体验的互联网平台。购物流程客户在试衣到家... \n", 128 | "2 想念一个人,就说出来。记得要下载安卓锁屏才可正常显示锁屏效果哦~更新内容更稳定、更优质,邀您... \n", 129 | "3 闽通宝手机客户端是基于移动互联网的,以公众出行服务为基础,贯彻绿色出行,低碳生活的理念,为出... \n", 130 | "4 风靡全球的DIY照片桌面,干净、流畅,启动提速100,瞬间提升手机性能;更是一亿用户的共同选... " 131 | ] 132 | }, 133 | "execution_count": 62, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "get_word_cnt" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 63, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "(152, 2)\n" 152 | ] 153 | }, 154 | { 155 | "data": { 156 | "text/html": [ 157 | "
\n", 158 | "\n", 171 | "\n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | "
type_idtype_name
01401便捷生活
11402游戏
21403通讯社交
31404阅读
41405工作求职
\n", 207 | "
" 208 | ], 209 | "text/plain": [ 210 | " type_id type_name\n", 211 | "0 1401 便捷生活\n", 212 | "1 1402 游戏\n", 213 | "2 1403 通讯社交\n", 214 | "3 1404 阅读\n", 215 | "4 1405 工作求职" 216 | ] 217 | }, 218 | "execution_count": 63, 219 | "metadata": {}, 220 | "output_type": "execute_result" 221 | } 222 | ], 223 | "source": [ 224 | "app_type = pd.read_csv(f'{input_dir}/apptype_id_name.txt', delimiter='\\t', names =['type_id', 'type_name'] )\n", 225 | "print(app_type.shape)\n", 226 | "app_type.head()" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 65, 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "name": "stdout", 236 | "output_type": "stream", 237 | "text": [ 238 | "(30000, 3)\n" 239 | ] 240 | }, 241 | { 242 | "data": { 243 | "text/html": [ 244 | "
\n", 245 | "\n", 258 | "\n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | "
app_idtype_idapp_des
000000777CE5B5AA5C1AC94DB8EABE0AC140203《游戏王座》使用说明书成分由怪兽卡、魔法卡、陷阱卡合计数千张卡牌以及刺激性、耐久性玩法组成。...
10000DEC36E15C27DBFC64AB8208C4B37140206更稳定、更优质,邀您一起。
20001791406307B1D1CE2BC64A830B7C7142106《小钱袋》是一款免费网络版记帐软件,适用于个人记帐、家庭记帐、团队记帐,全程帮您安全记录您财...
30002F14825B9CA01653325EEFD69D790142701领先的周易服务平台高人汇,汇聚算命大师、风水大师、占卜大师、手相大师、起名大师、算命先生、面...
4000419D79365331F89399E5F38A91B05140901平行空间是一款极简、免费的黑科技双开助手;您可以在平行空间双开微信微博、陌陌、映客、yy等应...
\n", 300 | "
" 301 | ], 302 | "text/plain": [ 303 | " app_id type_id \\\n", 304 | "0 00000777CE5B5AA5C1AC94DB8EABE0AC 140203 \n", 305 | "1 0000DEC36E15C27DBFC64AB8208C4B37 140206 \n", 306 | "2 0001791406307B1D1CE2BC64A830B7C7 142106 \n", 307 | "3 0002F14825B9CA01653325EEFD69D790 142701 \n", 308 | "4 000419D79365331F89399E5F38A91B05 140901 \n", 309 | "\n", 310 | " app_des \n", 311 | "0 《游戏王座》使用说明书成分由怪兽卡、魔法卡、陷阱卡合计数千张卡牌以及刺激性、耐久性玩法组成。... \n", 312 | "1 更稳定、更优质,邀您一起。 \n", 313 | "2 《小钱袋》是一款免费网络版记帐软件,适用于个人记帐、家庭记帐、团队记帐,全程帮您安全记录您财... \n", 314 | "3 领先的周易服务平台高人汇,汇聚算命大师、风水大师、占卜大师、手相大师、起名大师、算命先生、面... \n", 315 | "4 平行空间是一款极简、免费的黑科技双开助手;您可以在平行空间双开微信微博、陌陌、映客、yy等应... " 316 | ] 317 | }, 318 | "execution_count": 65, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "import csv\n", 325 | "apptype_train = pd.read_csv(f'{input_dir}/apptype_train.dat', sep='\\t', \n", 326 | " names =['app_id', 'type_id', 'app_des'] , \n", 327 | " quoting=3\n", 328 | " )\n", 329 | "print(apptype_train.shape)\n", 330 | "apptype_train.head()\n", 331 | "#apptype_train.iloc[2,2]" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 66, 337 | "metadata": {}, 338 | "outputs": [ 339 | { 340 | "data": { 341 | "text/html": [ 342 | "
\n", 343 | "\n", 356 | "\n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | "
app_idtype_idapp_des
1594563959834D8FB9D68C03A75C9BB0906EA140206在全新的地图中,你将与戴夫一起面对驾驶飞行器呼啸而来的僵尸军团,肩负起守卫天空之城的重任,同...
1594663961F67B88D3D7D877101F80A53E5CD140901部分小错误,整体。
159476396C70B6383F0BF243EF69927ACF35F140901以太大陆EthMin 以太大陆是一个数字生态世界,帮助个体管理在现实世界中所付出的努力与贡献...
159486396F4C27E1F1D86762B9283D701DB78142501帮助准妈妈在分娩前记录宫缩频率和时长,以判断是否达到就医标准。遇到问题,可1对1在线咨询产科...
1594963997AB7F3E277BC0CB1D42C3D8360F4142103线上线下优势资源整合  不必四处奔波,专属咨询顾问为您服务。  安心快速无抵押  去繁求简,...
15950639B889103E0AFD7D23E8C593DB6A6D1140211更稳定、更优质,邀您一起。
15951639BC48DB51B5806B726B392224F0CA8142102金钱永不眠一个股票账户,一笔钱投资美股/港股/英股/A股;全球资产配置的一站式股票平台享受一...
15952639C08D6CA2142E0CFD60E64DFB7C326140901文字转语音合成免费语音翻译、文本朗读、红包口令、普通话吆喝广告音频合成,一款专业进行文字转语...
15953639C9663BB3CABFA048B3A54ED9B8CC9140401在微博,官方发布新闻,草根爆料八卦;在微博,大V明星发布动态,粉丝狗仔爆料内幕;在微博,海量...
15954639DBC25084151D681F73C1A331B6CBA140210比斗地主麻将更简单、比炸金花牛牛更刺激,全球百姓共同推荐锻炼情商智商,大奖话费信手拈来。客官...
1595563A0262B4C6D416DC2816B15E716C31D142103\"贷款App3分钟下款20000元 通过率高达96.7 贷款借钱急用钱藕丁钱包,帮您闪电周转...
1595663A09CBD3873CE47A42BC285705B8431140901这是一款优客工场官方发布的移动客户端,涵盖了优客工场支持的所有辅助功能,旨在为你和你的团队提...
1595763A0C80FD1F955F8C53AFE69291EC652140107天天快递是一款兼有邮递功能的门对门物流活动的手机客户端。不仅具有下单、查询订单、搜索商品、晒...
1595863A2BCCA93BB9ED948A2892CFEF4CFCE140207喜羊羊快跑是一款跑酷类的游戏。喜羊羊快跑该游戏让你分分钟忆起我们一起看过的喜羊羊以喜羊羊为主...
1595963A9AFD1952D1EF58A3743CA5BD76602140206住在城市下面鳄鱼Swmpy希望过上人类一样的生活。他非常喜欢干净。可鳄鱼Crnky不满Swm...
1596063AA2D5AFFD768100625F947BA030B48142105一款信息查询辅助应用软件,服务广大福彩彩民,为福彩中心的各种促销活动提供平台支持。主要功能纸...
1596163AB66CD8D27C6B269F4960FB530AA76142104国美金融App简介国美金融App是综合性金融服务平台,致力为个人和企业提供定制化财富管理服务...
1596263AD9FA5338921C66943390ADA5DCF23142102华福证券网上开户2.0系统,采用人脸单向识别技术,优化开户排队机制,可绑定老账户,为您节省宝...
1596363AF8C9C9E16F935F8F424533D24FD40140701软件简介学霸君1对1学生端是学霸君旗下一款中小学在线1对1辅导应用软件。基于学霸君超过900...
1596463B0E4D5A2319B7684D8959D3703B7C4140404曹雪芹著的经典文学名著《红楼梦》,特殊的幼儿早教方式,让您的孩子赢在起跑线上,大人和儿童皆宜...
1596563B1568A4BA00BB36247F3FE7E63D046140210大家都想让自己成为一名优秀的象棋手吧那就赶快行动起来,锻炼自己,让自己成为万人瞩目的象棋大师...
1596663B5F7FD3037C633611E405BF76357A6140901流量监控软件是一款功能强大的android流量管理程序。它可以根据不同的android系统版...
1596763B6A2A65E22AB3BEF8E6E3627058005140901三星应用商店GlxyApps是三星官方开发和运营的应用下载平台,拥有数十万款应用、游戏和主题...
1596863B8474AE7D557EB69107C1C8D67293B140212海陆空立体战争手游《抢滩登陆3D》是由美国DIGITALFUSION公司正版授权,对比经典抢...
1596963B89ACCF7E7BB4E8048A4430A61198E140603《音乐达人可可摇滚明星MusicIdolCocoRockStr》是一款休闲娱乐游戏。请想象一...
1597063BA67638DB01DFD3BE2B89A6DA9C632140802乌鲁木齐地铁官方APP,为您提供全新出行方式。地铁购票&mdash;&mdash;乘客可通过...
1597163BA6BE8E50C34BC6C08F39487BF3063140404产品简介免费追书吧,一款专业免费的电子书阅读软件,爱阅读的小伙伴千万不要错过。全本小说免费阅...
1597263BBCDE7DE3AE668D03FAB004E986F4F140301|140604人人是一个火爆校园的高颜值网红美女视频交友直播平台,同城交友约会聊天,明星艺人在线直播,72...
1597363BD79538A92F05644BE6AD23D87B545140603铃声多多手机铃声大全中国移动、中国电信、中国联通3大运营商音乐基地战略合作产品百万铃声,轻松...
1597463BF35D999C3B21BB0E783CD56FD60D0140207《侍灵》是一款日式暗黑系的横版格斗QTE手游,在动作游戏领域中做出了大胆的创新,通过策略搭配...
1597563BFFE1204509BBA9BD9E0E406FB2A38142103风云管家,信用卡管家智能还款神器信用卡高端玩家的选择账单全额自动还款2000万实体商户,餐饮...
1597663C0F5069E829510104C56911CF571D1140207两种汽车的反应机是很有趣的游戏。你的任务是点击需要的图片点击得比对象快。这款游戏很好的训练注...
1597763C5FA30A92F3B99258FA6085EE90D91141201通运先培后付学员端通运先培后付App学员端是针对学车人士开发的一款功能性APP。通运学员端引...
1597863CA760775B2CD3D62995F657568CC8E141001宝宝学加减法,是宝宝巴士专为5+宝贝设计的数学软件,让孩子轻松学习加减法,赢在起跑线~难度贴...
1597963CB103A546C380870C8A3FA53A14208140113长安通APP是一款便民生活服务软件。用户使用长安通APP,可以实现长安通卡NFC充值、查询余...
\n", 578 | "
" 579 | ], 580 | "text/plain": [ 581 | " app_id type_id \\\n", 582 | "15945 63959834D8FB9D68C03A75C9BB0906EA 140206 \n", 583 | "15946 63961F67B88D3D7D877101F80A53E5CD 140901 \n", 584 | "15947 6396C70B6383F0BF243EF69927ACF35F 140901 \n", 585 | "15948 6396F4C27E1F1D86762B9283D701DB78 142501 \n", 586 | "15949 63997AB7F3E277BC0CB1D42C3D8360F4 142103 \n", 587 | "15950 639B889103E0AFD7D23E8C593DB6A6D1 140211 \n", 588 | "15951 639BC48DB51B5806B726B392224F0CA8 142102 \n", 589 | "15952 639C08D6CA2142E0CFD60E64DFB7C326 140901 \n", 590 | "15953 639C9663BB3CABFA048B3A54ED9B8CC9 140401 \n", 591 | "15954 639DBC25084151D681F73C1A331B6CBA 140210 \n", 592 | "15955 63A0262B4C6D416DC2816B15E716C31D 142103 \n", 593 | "15956 63A09CBD3873CE47A42BC285705B8431 140901 \n", 594 | "15957 63A0C80FD1F955F8C53AFE69291EC652 140107 \n", 595 | "15958 63A2BCCA93BB9ED948A2892CFEF4CFCE 140207 \n", 596 | "15959 63A9AFD1952D1EF58A3743CA5BD76602 140206 \n", 597 | "15960 63AA2D5AFFD768100625F947BA030B48 142105 \n", 598 | "15961 63AB66CD8D27C6B269F4960FB530AA76 142104 \n", 599 | "15962 63AD9FA5338921C66943390ADA5DCF23 142102 \n", 600 | "15963 63AF8C9C9E16F935F8F424533D24FD40 140701 \n", 601 | "15964 63B0E4D5A2319B7684D8959D3703B7C4 140404 \n", 602 | "15965 63B1568A4BA00BB36247F3FE7E63D046 140210 \n", 603 | "15966 63B5F7FD3037C633611E405BF76357A6 140901 \n", 604 | "15967 63B6A2A65E22AB3BEF8E6E3627058005 140901 \n", 605 | "15968 63B8474AE7D557EB69107C1C8D67293B 140212 \n", 606 | "15969 63B89ACCF7E7BB4E8048A4430A61198E 140603 \n", 607 | "15970 63BA67638DB01DFD3BE2B89A6DA9C632 140802 \n", 608 | "15971 63BA6BE8E50C34BC6C08F39487BF3063 140404 \n", 609 | "15972 63BBCDE7DE3AE668D03FAB004E986F4F 140301|140604 \n", 610 | "15973 63BD79538A92F05644BE6AD23D87B545 140603 \n", 611 | "15974 63BF35D999C3B21BB0E783CD56FD60D0 140207 \n", 612 | "15975 63BFFE1204509BBA9BD9E0E406FB2A38 142103 \n", 613 | "15976 63C0F5069E829510104C56911CF571D1 140207 \n", 614 | "15977 63C5FA30A92F3B99258FA6085EE90D91 141201 \n", 615 | "15978 63CA760775B2CD3D62995F657568CC8E 141001 \n", 616 | "15979 63CB103A546C380870C8A3FA53A14208 140113 \n", 617 | "\n", 618 | " app_des \n", 619 | "15945 在全新的地图中,你将与戴夫一起面对驾驶飞行器呼啸而来的僵尸军团,肩负起守卫天空之城的重任,同... \n", 620 | "15946 部分小错误,整体。 \n", 621 | "15947 以太大陆EthMin 以太大陆是一个数字生态世界,帮助个体管理在现实世界中所付出的努力与贡献... \n", 622 | "15948 帮助准妈妈在分娩前记录宫缩频率和时长,以判断是否达到就医标准。遇到问题,可1对1在线咨询产科... \n", 623 | "15949 线上线下优势资源整合  不必四处奔波,专属咨询顾问为您服务。  安心快速无抵押  去繁求简,... \n", 624 | "15950 更稳定、更优质,邀您一起。 \n", 625 | "15951 金钱永不眠一个股票账户,一笔钱投资美股/港股/英股/A股;全球资产配置的一站式股票平台享受一... \n", 626 | "15952 文字转语音合成免费语音翻译、文本朗读、红包口令、普通话吆喝广告音频合成,一款专业进行文字转语... \n", 627 | "15953 在微博,官方发布新闻,草根爆料八卦;在微博,大V明星发布动态,粉丝狗仔爆料内幕;在微博,海量... \n", 628 | "15954 比斗地主麻将更简单、比炸金花牛牛更刺激,全球百姓共同推荐锻炼情商智商,大奖话费信手拈来。客官... \n", 629 | "15955 \"贷款App3分钟下款20000元 通过率高达96.7 贷款借钱急用钱藕丁钱包,帮您闪电周转... \n", 630 | "15956 这是一款优客工场官方发布的移动客户端,涵盖了优客工场支持的所有辅助功能,旨在为你和你的团队提... \n", 631 | "15957 天天快递是一款兼有邮递功能的门对门物流活动的手机客户端。不仅具有下单、查询订单、搜索商品、晒... \n", 632 | "15958 喜羊羊快跑是一款跑酷类的游戏。喜羊羊快跑该游戏让你分分钟忆起我们一起看过的喜羊羊以喜羊羊为主... \n", 633 | "15959 住在城市下面鳄鱼Swmpy希望过上人类一样的生活。他非常喜欢干净。可鳄鱼Crnky不满Swm... \n", 634 | "15960 一款信息查询辅助应用软件,服务广大福彩彩民,为福彩中心的各种促销活动提供平台支持。主要功能纸... \n", 635 | "15961 国美金融App简介国美金融App是综合性金融服务平台,致力为个人和企业提供定制化财富管理服务... \n", 636 | "15962 华福证券网上开户2.0系统,采用人脸单向识别技术,优化开户排队机制,可绑定老账户,为您节省宝... \n", 637 | "15963 软件简介学霸君1对1学生端是学霸君旗下一款中小学在线1对1辅导应用软件。基于学霸君超过900... \n", 638 | "15964 曹雪芹著的经典文学名著《红楼梦》,特殊的幼儿早教方式,让您的孩子赢在起跑线上,大人和儿童皆宜... \n", 639 | "15965 大家都想让自己成为一名优秀的象棋手吧那就赶快行动起来,锻炼自己,让自己成为万人瞩目的象棋大师... \n", 640 | "15966 流量监控软件是一款功能强大的android流量管理程序。它可以根据不同的android系统版... 
\n", 641 | "15967 三星应用商店GlxyApps是三星官方开发和运营的应用下载平台,拥有数十万款应用、游戏和主题... \n", 642 | "15968 海陆空立体战争手游《抢滩登陆3D》是由美国DIGITALFUSION公司正版授权,对比经典抢... \n", 643 | "15969 《音乐达人可可摇滚明星MusicIdolCocoRockStr》是一款休闲娱乐游戏。请想象一... \n", 644 | "15970 乌鲁木齐地铁官方APP,为您提供全新出行方式。地铁购票——乘客可通过... \n", 645 | "15971 产品简介免费追书吧,一款专业免费的电子书阅读软件,爱阅读的小伙伴千万不要错过。全本小说免费阅... \n", 646 | "15972 人人是一个火爆校园的高颜值网红美女视频交友直播平台,同城交友约会聊天,明星艺人在线直播,72... \n", 647 | "15973 铃声多多手机铃声大全中国移动、中国电信、中国联通3大运营商音乐基地战略合作产品百万铃声,轻松... \n", 648 | "15974 《侍灵》是一款日式暗黑系的横版格斗QTE手游,在动作游戏领域中做出了大胆的创新,通过策略搭配... \n", 649 | "15975 风云管家,信用卡管家智能还款神器信用卡高端玩家的选择账单全额自动还款2000万实体商户,餐饮... \n", 650 | "15976 两种汽车的反应机是很有趣的游戏。你的任务是点击需要的图片点击得比对象快。这款游戏很好的训练注... \n", 651 | "15977 通运先培后付学员端通运先培后付App学员端是针对学车人士开发的一款功能性APP。通运学员端引... \n", 652 | "15978 宝宝学加减法,是宝宝巴士专为5+宝贝设计的数学软件,让孩子轻松学习加减法,赢在起跑线~难度贴... \n", 653 | "15979 长安通APP是一款便民生活服务软件。用户使用长安通APP,可以实现长安通卡NFC充值、查询余... " 654 | ] 655 | }, 656 | "execution_count": 66, 657 | "metadata": {}, 658 | "output_type": "execute_result" 659 | } 660 | ], 661 | "source": [ 662 | "apptype_train.iloc[15945:15980]" 663 | ] 664 | } 665 | ], 666 | "metadata": { 667 | "kernelspec": { 668 | "display_name": "Python 3", 669 | "language": "python", 670 | "name": "python3" 671 | }, 672 | "language_info": { 673 | "codemirror_mode": { 674 | "name": "ipython", 675 | "version": 3 676 | }, 677 | "file_extension": ".py", 678 | "mimetype": "text/x-python", 679 | "name": "python", 680 | "nbconvert_exporter": "python", 681 | "pygments_lexer": "ipython3", 682 | "version": "3.6.8" 683 | } 684 | }, 685 | "nbformat": 4, 686 | "nbformat_minor": 2 687 | } 688 | -------------------------------------------------------------------------------- /notebook/Untitled1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/apps/dslab/anaconda/python36new/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 13 | " from ._conv import register_converters as _register_converters\n", 14 | "Using TensorFlow backend.\n" 15 | ] 16 | }, 17 | { 18 | "ename": "ImportError", 19 | "evalue": "Traceback (most recent call last):\n File \"/users/hdpsbp/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow.py\", line 58, in \n from tensorflow.python.pywrap_tensorflow_internal import *\n File \"/users/hdpsbp/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow_internal.py\", line 28, in \n _pywrap_tensorflow_internal = swig_import_helper()\n File \"/users/hdpsbp/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow_internal.py\", line 24, in swig_import_helper\n _mod = imp.load_module('_pywrap_tensorflow_internal', fp, pathname, description)\n File \"/apps/dslab/anaconda/python36new/lib/python3.6/imp.py\", line 243, in load_module\n return load_dynamic(name, filename, file)\n File \"/apps/dslab/anaconda/python36new/lib/python3.6/imp.py\", line 343, in load_dynamic\n return _load(spec)\nImportError: libcublas.so.9.0: cannot open shared object file: No such file or directory\n\n\nFailed to load the native TensorFlow runtime.\n\nSee https://www.tensorflow.org/install/errors\n\nfor some common reasons and solutions. 
Include the entire stack trace\nabove this error message when asking for help.", 20 | "output_type": "error", 21 | "traceback": [ 22 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 23 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 24 | "\u001b[0;32m~/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mtensorflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpython\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpywrap_tensorflow_internal\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mtensorflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpython\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpywrap_tensorflow_internal\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0m__version__\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 25 | "\u001b[0;32m~/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow_internal.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_mod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m \u001b[0m_pywrap_tensorflow_internal\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mswig_import_helper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 29\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0mswig_import_helper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 26 | "\u001b[0;32m~/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow_internal.py\u001b[0m in \u001b[0;36mswig_import_helper\u001b[0;34m()\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 24\u001b[0;31m \u001b[0m_mod\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mimp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_module\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'_pywrap_tensorflow_internal'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpathname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdescription\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 25\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 27 | "\u001b[0;32m/apps/dslab/anaconda/python36new/lib/python3.6/imp.py\u001b[0m in \u001b[0;36mload_module\u001b[0;34m(name, file, filename, details)\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 243\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mload_dynamic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 244\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mtype_\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mPKG_DIRECTORY\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 28 | "\u001b[0;32m/apps/dslab/anaconda/python36new/lib/python3.6/imp.py\u001b[0m 
in \u001b[0;36mload_dynamic\u001b[0;34m(name, path, file)\u001b[0m\n\u001b[1;32m 342\u001b[0m name=name, loader=loader, origin=path)\n\u001b[0;32m--> 343\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_load\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 344\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 29 | "\u001b[0;31mImportError\u001b[0m: libcublas.so.9.0: cannot open shared object file: No such file or directory", 30 | "\nDuring handling of the above exception, another exception occurred:\n", 31 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 32 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mkeras\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayers\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mDense\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mkeras\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodels\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mload_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSequential\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mkeras\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutils\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mCustomObjectScope\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtensorflow\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mkeras\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mkeras\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 33 | "\u001b[0;32m/apps/dslab/anaconda/python36new/lib/python3.6/site-packages/keras/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0m__future__\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mabsolute_import\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mactivations\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mapplications\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 34 | "\u001b[0;32m/apps/dslab/anaconda/python36new/lib/python3.6/site-packages/keras/utils/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdata_utils\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mio_utils\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mconv_utils\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 
7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;31m# Globally-importable utils.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 35 | "\u001b[0;32m/apps/dslab/anaconda/python36new/lib/python3.6/site-packages/keras/utils/conv_utils.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msix\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmoves\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mbackend\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mK\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 36 | "\u001b[0;32m/apps/dslab/anaconda/python36new/lib/python3.6/site-packages/keras/backend/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0m_BACKEND\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'tensorflow'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Using TensorFlow backend.\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 89\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mtensorflow_backend\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 90\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;31m# Try and load external backend.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 37 | "\u001b[0;32m/apps/dslab/anaconda/python36new/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0m__future__\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mprint_function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mtensorflow\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mtensorflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpython\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframework\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mops\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf_ops\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mtensorflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpython\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtraining\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmoving_averages\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 38 | 
"\u001b[0;32m~/.local/lib/python3.6/site-packages/tensorflow/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;31m# pylint: disable=g-bad-import-order\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 24\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mtensorflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpython\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpywrap_tensorflow\u001b[0m \u001b[0;31m# pylint: disable=unused-import\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 39 | "\u001b[0;32m~/.local/lib/python3.6/site-packages/tensorflow/python/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 49\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mtensorflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpython\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpywrap_tensorflow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 50\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mtensorflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpython\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtools\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcomponent_api_helper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 40 | "\u001b[0;32m~/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msome\u001b[0m \u001b[0mcommon\u001b[0m \u001b[0mreasons\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0msolutions\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mInclude\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mentire\u001b[0m \u001b[0mstack\u001b[0m \u001b[0mtrace\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 73\u001b[0m above this error message when asking for help.\"\"\" % traceback.format_exc()\n\u001b[0;32m---> 74\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mImportError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[0;31m# pylint: enable=wildcard-import,g-import-not-at-top,unused-import,line-too-long\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 41 | "\u001b[0;31mImportError\u001b[0m: Traceback (most recent call last):\n File \"/users/hdpsbp/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow.py\", line 58, in \n from tensorflow.python.pywrap_tensorflow_internal import *\n File \"/users/hdpsbp/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow_internal.py\", line 28, in \n _pywrap_tensorflow_internal = swig_import_helper()\n File \"/users/hdpsbp/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow_internal.py\", line 24, in swig_import_helper\n _mod = imp.load_module('_pywrap_tensorflow_internal', fp, pathname, description)\n File 
\"/apps/dslab/anaconda/python36new/lib/python3.6/imp.py\", line 243, in load_module\n return load_dynamic(name, filename, file)\n File \"/apps/dslab/anaconda/python36new/lib/python3.6/imp.py\", line 343, in load_dynamic\n return _load(spec)\nImportError: libcublas.so.9.0: cannot open shared object file: No such file or directory\n\n\nFailed to load the native TensorFlow runtime.\n\nSee https://www.tensorflow.org/install/errors\n\nfor some common reasons and solutions. Include the entire stack trace\nabove this error message when asking for help." 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "from keras.layers import Dense\n", 47 | "from keras.models import load_model, Sequential\n", 48 | "from keras.utils import CustomObjectScope\n", 49 | "import tensorflow as tf\n", 50 | "import keras as keras\n", 51 | "from keras import backend\n", 52 | "from keras.models import Model\n", 53 | "from keras.optimizers import Adam\n", 54 | "from keras import losses \n", 55 | "\n", 56 | "def get_test_mode():\n", 57 | " model = Sequential()\n", 58 | " from keras.layers import Flatten\n", 59 | " input = keras.Input(shape=(10,10), dtype='float', name='raw_image_left')\n", 60 | " test = keras.layers.Dense(units=64, activation='relu', name='middle', )(input)\n", 61 | " test = keras.layers.Flatten()(test)\n", 62 | " #Comments this line will be correct, which is only incorrect in CPU\n", 63 | " test = keras.layers.Lambda(lambda x: keras.backend.sqrt(x), name='error_point')(test)\n", 64 | " output = keras.layers.Dense(units=1, activation='sigmoid', name='output', )(test)\n", 65 | " model = Model(input, output)\n", 66 | " return model\n", 67 | "\n", 68 | "model = get_test_mode()\n", 69 | "model.summary()\n", 70 | "\n", 71 | "opt = Adam()\n", 72 | "model.compile(optimizer=opt, loss=losses.binary_crossentropy )\n", 73 | "\n", 74 | "import numpy as np\n", 75 | "X = np.random.randint(255, size=(100, 10, 10))\n", 76 | "y = np.random.randint(2, size=(100))\n", 77 | " \n", 78 | " \n", 79 | "model.fit(X, y, epochs=3, batch_size=10)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 2, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "ai-prd-05\r\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "!hostname" 97 | ] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "Python [conda env:python36new]", 103 | "language": "python", 104 | "name": "conda-env-python36new-py" 105 | }, 106 | "language_info": { 107 | "codemirror_mode": { 108 | "name": "ipython", 109 | "version": 3 110 | }, 111 | "file_extension": ".py", 112 | "mimetype": "text/x-python", 113 | "name": "python", 114 | "nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.6.7" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 2 121 | } 122 | -------------------------------------------------------------------------------- /notebook/Untitled2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 5, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | 
"./cache/get_oof_version=v75,4,0=.pickle\n", 29 | "./cache/get_oof_version=v36,4,0=.pickle\n", 30 | "./cache/get_oof_version=v72,4,0=.pickle\n", 31 | "./cache/get_oof_version=v74,4,0=.pickle\n", 32 | "./cache/get_oof_version=v43,4,0=.pickle\n", 33 | "./cache/get_oof_version=v73,4,0=.pickle\n" 34 | ] 35 | } 36 | ], 37 | "source": [] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 7, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "(130000, 127)" 48 | ] 49 | }, 50 | "execution_count": 7, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "pd.read_hdf('./cache/get_oof_version=v75,4,0=.h5').shape" 57 | ] 58 | } 59 | ], 60 | "metadata": { 61 | "kernelspec": { 62 | "display_name": "Python [default]", 63 | "language": "python", 64 | "name": "python3" 65 | }, 66 | "language_info": { 67 | "codemirror_mode": { 68 | "name": "ipython", 69 | "version": 3 70 | }, 71 | "file_extension": ".py", 72 | "mimetype": "text/x-python", 73 | "name": "python", 74 | "nbconvert_exporter": "python", 75 | "pygments_lexer": "ipython3", 76 | "version": "3.6.7" 77 | } 78 | }, 79 | "nbformat": 4, 80 | "nbformat_minor": 2 81 | } 82 | -------------------------------------------------------------------------------- /notebook/train_v2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "_uuid": "887cc6be0765929d5e382a830efdb902bd9ce99b" 7 | }, 8 | "source": [ 9 | "# Baidu Emotion.\n", 10 | "\n", 11 | "### Let's start exploring the dataset" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stderr", 21 | "output_type": "stream", 22 | "text": [ 23 | "2019-01-02 20:14:52,472 util_log.py[61] DEBUG Start the program at:LALI2-M-G0MD, 127.0.0.1, with:Load module\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "#Adjust the working folder\n", 29 | "import sys\n", 30 | "import os\n", 31 | "#print(globals())\n", 32 | "file_folder = globals()['_dh'][0]\n", 33 | "wk_dir = os.path.dirname(file_folder)\n", 34 | "os.chdir(wk_dir)\n", 35 | "\n", 36 | "import pandas as pd\n", 37 | "\n", 38 | "from code_felix.core.config import *\n", 39 | "from code_felix.core.feature import *\n", 40 | "from file_cache.utils.util_log import *\n", 41 | "\n", 42 | "import matplotlib.pyplot as plt\n", 43 | "\n", 44 | "\n", 45 | "plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签\n", 46 | "plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": { 53 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 54 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "import numpy as np # linear algebra\n", 60 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 61 | "\n", 62 | "\n", 63 | "#Loading the dataset\n", 64 | "dataset = pd.read_csv(train_file, encoding='gb18030', delimiter='\\t', header=None)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": { 71 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 72 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 73 | }, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/html": [ 78 | "
\n", 79 | "\n", 92 | "\n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | "
0123
01食品餐饮买这套系统本来是用来做我们公司的公众号第三方平台代运营的,没想到还有app,而且每个都很方便...2
12食品餐饮烤鸭还是不错的,别的菜没什么特殊的1
23食品餐饮使用说明看不懂!不会用,很多操作没详细标明!0
\n", 126 | "
" 127 | ], 128 | "text/plain": [ 129 | " 0 1 2 3\n", 130 | "0 1 食品餐饮 买这套系统本来是用来做我们公司的公众号第三方平台代运营的,没想到还有app,而且每个都很方便... 2\n", 131 | "1 2 食品餐饮 烤鸭还是不错的,别的菜没什么特殊的 1\n", 132 | "2 3 食品餐饮 使用说明看不懂!不会用,很多操作没详细标明! 0" 133 | ] 134 | }, 135 | "execution_count": 3, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "# Prin some samples\n", 142 | "dataset.head(3)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": { 148 | "_uuid": "b2e3ea8267e3bac72a30ce7803413c684bc8b9a4" 149 | }, 150 | "source": [ 151 | "## Preparing data for model training\n", 152 | "### Tokenization\n", 153 | "Since the data is already tokenized and lowercased, we just need to split the words\n" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 4, 159 | "metadata": { 160 | "_uuid": "cb739f05cfb4b5d74702cdef1ea5a130c0d90132" 161 | }, 162 | "outputs": [ 163 | { 164 | "name": "stderr", 165 | "output_type": "stream", 166 | "text": [ 167 | "Building prefix dict from the default dictionary ...\n", 168 | "2019-01-02 20:14:52,758 __init__.py[111] DEBUG Building prefix dict from the default dictionary ...\n", 169 | "Loading model from cache /var/folders/d2/vq91lnt11m13m84s18dzdm8r0000gn/T/jieba.cache\n", 170 | "2019-01-02 20:14:52,766 __init__.py[131] DEBUG Loading model from cache /var/folders/d2/vq91lnt11m13m84s18dzdm8r0000gn/T/jieba.cache\n", 171 | "Loading model cost 0.715 seconds.\n", 172 | "2019-01-02 20:14:53,479 __init__.py[163] DEBUG Loading model cost 0.715 seconds.\n", 173 | "Prefix dict has been built succesfully.\n", 174 | "2019-01-02 20:14:53,483 __init__.py[164] DEBUG Prefix dict has been built succesfully.\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "input_sentences = [list(jieba.cut(str(text), cut_all=False)) for text in dataset.iloc[:, 2].values.tolist()]\n", 180 | "labels = dataset.iloc[:, 3].values.tolist()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 5, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "data": { 190 | "text/plain": [ 191 | "[2, 1, 0, 0, 1, 2, 2, 2, 2, 2]" 192 | ] 193 | }, 194 | "execution_count": 5, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "labels[:10]" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": { 206 | "_uuid": "1a7c2e03d7e839b2872785157153e0bfef82b0bd" 207 | }, 208 | "source": [ 209 | "### Creating Vocabulary (word index)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 14, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stderr", 219 | "output_type": "stream", 220 | "text": [ 221 | "2019-01-02 20:16:43,159 util_log.py[41] INFO get_word_id_vec begin with(1 paras) :['02'], []\n", 222 | "2019-01-02 20:16:43,175 cache.py[29] DEBUG try to read cache from file:./cache/get_word_id_vec=02=.h5, (h5, key:['/df_0'])\n", 223 | "2019-01-02 20:16:43,221 util_log.py[49] INFO get_word_id_vec cost 0.06 sec:(1 paras)(['02'], []), return:DataFrame, end \n", 224 | "2019-01-02 20:16:47,027 [3] DEBUG Word length:42014\n" 225 | ] 226 | } 227 | ], 228 | "source": [ 229 | "word_id_vec = get_word_id_vec('02')\n", 230 | "word2id = dict( word_id_vec.apply(lambda row: (row['word'], row['id']), axis=1).values )\n", 231 | "logger.debug(f'Word length:{len(word2id)}')\n", 232 | "\n", 233 | "\n", 234 | "embedding_weights = word_id_vec.iloc[:, -vector_size:].fillna(0).values\n", 235 | "#embedding_weights[10]" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 
240 | "execution_count": 7, 241 | "metadata": { 242 | "_uuid": "f60be75ae0d5cbfc36eeba0243407b66741bb42e" 243 | }, 244 | "outputs": [ 245 | { 246 | "name": "stderr", 247 | "output_type": "stream", 248 | "text": [ 249 | "2019-01-02 20:14:58,137 [18] DEBUG max_words=38\n", 250 | "2019-01-02 20:14:58,141 [18] DEBUG max_words=73\n", 251 | "2019-01-02 20:14:58,143 [18] DEBUG max_words=77\n", 252 | "2019-01-02 20:14:58,146 [18] DEBUG max_words=88\n", 253 | "2019-01-02 20:14:58,148 [18] DEBUG max_words=93\n", 254 | "2019-01-02 20:14:58,150 [18] DEBUG max_words=110\n", 255 | "2019-01-02 20:14:58,152 [18] DEBUG max_words=151\n", 256 | "2019-01-02 20:14:58,154 [18] DEBUG max_words=171\n", 257 | "2019-01-02 20:14:58,156 [18] DEBUG max_words=177\n", 258 | "2019-01-02 20:14:58,158 [18] DEBUG max_words=196\n" 259 | ] 260 | }, 261 | { 262 | "data": { 263 | "text/plain": [ 264 | "{0: 0, 1: 1, 2: 2}" 265 | ] 266 | }, 267 | "execution_count": 7, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "# Initialize word2id and label2id dictionaries that will be used to encode words and labels\n", 274 | "\n", 275 | "label2id = dict()\n", 276 | "\n", 277 | "max_words = 0 # maximum number of words in a sentence\n", 278 | "\n", 279 | "# Construction of word2id dict\n", 280 | "for sentence in input_sentences:\n", 281 | "# for word in sentence:\n", 282 | "# # Add words to word2id dict if not exist\n", 283 | "# if word not in word2id:\n", 284 | "# word2id[word] = len(word2id)\n", 285 | "# # If length of the sentence is greater than max_words, update max_words\n", 286 | "# sentence = list(sentence)\n", 287 | "# logger.debug(f'{len(sentence)} : {sentence}')\n", 288 | " if len(sentence) > max_words:\n", 289 | " max_words = len(sentence)\n", 290 | " logger.debug(f'max_words={max_words}')\n", 291 | " \n", 292 | "# Construction of label2id and id2label dicts\n", 293 | "label2id = {l: i for i, l in enumerate(set(labels))}\n", 294 | "id2label = {v: k for k, v in label2id.items()}\n", 295 | "id2label" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": { 301 | "_uuid": "d984e58ffd25530ac4c05ce623d9237a35cf903d" 302 | }, 303 | "source": [ 304 | "### Encoding samples with corresponing integer values" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 8, 310 | "metadata": { 311 | "_uuid": "378ef884a6ebb19b02a70082bc6c854c51780af3" 312 | }, 313 | "outputs": [ 314 | { 315 | "name": "stderr", 316 | "output_type": "stream", 317 | "text": [ 318 | "/Users/lali2/dev/python/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 319 | " from ._conv import register_converters as _register_converters\n", 320 | "Using TensorFlow backend.\n", 321 | "/Users/lali2/dev/python/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n", 322 | " return f(*args, **kwds)\n" 323 | ] 324 | }, 325 | { 326 | "name": "stdout", 327 | "output_type": "stream", 328 | "text": [ 329 | "Shape of X: (2000, 196)\n", 330 | "Shape of Y: (2000, 3)\n" 331 | ] 332 | } 333 | ], 334 | "source": [ 335 | "import keras\n", 336 | "\n", 337 | "# Encode input words and labels\n", 338 | "X = [[word2id[word] for word in sentence] for sentence in input_sentences]\n", 339 | "Y = [label2id[label] for label in labels]\n", 340 | "\n", 341 | "# Apply Padding to X\n", 342 | "from keras.preprocessing.sequence import pad_sequences\n", 343 | "X = pad_sequences(X, max_words)\n", 344 | "\n", 345 | "# Convert Y to numpy array\n", 346 | "Y = keras.utils.to_categorical(Y, num_classes=len(label2id))\n", 347 | "\n", 348 | "# Print shapes\n", 349 | "print(\"Shape of X: {}\".format(X.shape))\n", 350 | "print(\"Shape of Y: {}\".format(Y.shape))\n" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": { 356 | "_uuid": "4bccaa5b813414ad7929522d4d0f74dbb9c4c5af" 357 | }, 358 | "source": [ 359 | "## Build LSTM model with attention " 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 9, 365 | "metadata": { 366 | "collapsed": true 367 | }, 368 | "outputs": [], 369 | "source": [ 370 | "keras.layers.Embedding?" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 21, 376 | "metadata": { 377 | "_uuid": "4c1b5fc7613a0fe5a8067135e2de07e0765f1b78" 378 | }, 379 | "outputs": [ 380 | { 381 | "name": "stdout", 382 | "output_type": "stream", 383 | "text": [ 384 | "__________________________________________________________________________________________________\n", 385 | "Layer (type) Output Shape Param # Connected to \n", 386 | "==================================================================================================\n", 387 | "input_7 (InputLayer) (None, 196) 0 \n", 388 | "__________________________________________________________________________________________________\n", 389 | "embedding_7 (Embedding) (None, 196, 200) 8402800 input_7[0][0] \n", 390 | "__________________________________________________________________________________________________\n", 391 | "dropout_3 (Dropout) (None, 196, 200) 0 embedding_7[0][0] \n", 392 | "__________________________________________________________________________________________________\n", 393 | "bidirectional_2 (Bidirectional) (None, 196, 200) 240800 dropout_3[0][0] \n", 394 | "__________________________________________________________________________________________________\n", 395 | "dropout_4 (Dropout) (None, 196, 200) 0 bidirectional_2[0][0] \n", 396 | "__________________________________________________________________________________________________\n", 397 | "time_distributed_2 (TimeDistrib (None, 196, 1) 201 dropout_4[0][0] \n", 398 | "__________________________________________________________________________________________________\n", 399 | "reshape_2 (Reshape) (None, 196) 0 time_distributed_2[0][0] \n", 400 | "__________________________________________________________________________________________________\n", 401 | "attention_vec (Activation) (None, 196) 0 
reshape_2[0][0] \n", 402 | "__________________________________________________________________________________________________\n", 403 | "dot_2 (Dot) (None, 200) 0 dropout_4[0][0] \n", 404 | " attention_vec[0][0] \n", 405 | "__________________________________________________________________________________________________\n", 406 | "dense_5 (Dense) (None, 100) 20100 dot_2[0][0] \n", 407 | "__________________________________________________________________________________________________\n", 408 | "dense_6 (Dense) (None, 3) 303 dense_5[0][0] \n", 409 | "==================================================================================================\n", 410 | "Total params: 8,664,204\n", 411 | "Trainable params: 8,664,204\n", 412 | "Non-trainable params: 0\n", 413 | "__________________________________________________________________________________________________\n" 414 | ] 415 | } 416 | ], 417 | "source": [ 418 | "embedding_dim = 100 # The dimension of word embeddings\n", 419 | "\n", 420 | "# Define input tensor\n", 421 | "sequence_input = keras.Input(shape=(max_words,), dtype='int32')\n", 422 | "\n", 423 | "# Word embedding layer\n", 424 | "embedded_inputs =keras.layers.Embedding(len(word2id) ,\n", 425 | " vector_size ,\n", 426 | " input_length=max_words ,\n", 427 | " weights = [embedding_weights] ,\n", 428 | " )(sequence_input)\n", 429 | "\n", 430 | "# Apply dropout to prevent overfitting\n", 431 | "embedded_inputs = keras.layers.Dropout(0.2)(embedded_inputs)\n", 432 | "\n", 433 | "# Apply Bidirectional LSTM over embedded inputs\n", 434 | "lstm_outs = keras.layers.wrappers.Bidirectional(\n", 435 | " keras.layers.LSTM(embedding_dim, return_sequences=True)\n", 436 | ")(embedded_inputs)\n", 437 | "\n", 438 | "# Apply dropout to LSTM outputs to prevent overfitting\n", 439 | "lstm_outs = keras.layers.Dropout(0.2)(lstm_outs)\n", 440 | "\n", 441 | "# Attention Mechanism - Generate attention vectors\n", 442 | "input_dim = int(lstm_outs.shape[2])\n", 443 | "permuted_inputs = keras.layers.Permute((2, 1))(lstm_outs)\n", 444 | "attention_vector = keras.layers.TimeDistributed(keras.layers.Dense(1))(lstm_outs)\n", 445 | "attention_vector = keras.layers.Reshape((max_words,))(attention_vector)\n", 446 | "attention_vector = keras.layers.Activation('softmax', name='attention_vec')(attention_vector)\n", 447 | "attention_output = keras.layers.Dot(axes=1)([lstm_outs, attention_vector])\n", 448 | "\n", 449 | "# Last layer: fully connected with softmax activation\n", 450 | "fc = keras.layers.Dense(embedding_dim, activation='relu')(attention_output)\n", 451 | "output = keras.layers.Dense(len(label2id), activation='softmax')(fc)\n", 452 | "\n", 453 | "# Finally building model\n", 454 | "model = keras.Model(inputs=[sequence_input], outputs=output)\n", 455 | "model.compile(loss=\"categorical_crossentropy\", metrics=[\"accuracy\"], optimizer='adam')\n", 456 | "\n", 457 | "# Print model summary\n", 458 | "model.summary()\n", 459 | "\n", 460 | "\n", 461 | "\n" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": { 467 | "_uuid": "ad67135dcd65940d864521309066ff9fb5b7c9a2" 468 | }, 469 | "source": [ 470 | "## Training the model" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 22, 476 | "metadata": { 477 | "_uuid": "d9441f027a63ad3c8b288c6823e073b142c33b34" 478 | }, 479 | "outputs": [ 480 | { 481 | "name": "stdout", 482 | "output_type": "stream", 483 | "text": [ 484 | "Train on 1800 samples, validate on 200 samples\n", 485 | "Epoch 1/2\n", 486 | "1800/1800 
[==============================] - 18s 10ms/step - loss: 0.8834 - acc: 0.6761 - val_loss: 0.8265 - val_acc: 0.6950\n", 487 | "Epoch 2/2\n", 488 | "1800/1800 [==============================] - 16s 9ms/step - loss: 0.8213 - acc: 0.6778 - val_loss: 0.6513 - val_acc: 0.7150\n" 489 | ] 490 | }, 491 | { 492 | "data": { 493 | "text/plain": [ 494 | "" 495 | ] 496 | }, 497 | "execution_count": 22, 498 | "metadata": {}, 499 | "output_type": "execute_result" 500 | } 501 | ], 502 | "source": [ 503 | "# Train model 10 iterations\n", 504 | "model.fit(X, Y, epochs=2, batch_size=64, validation_split=0.1, shuffle=True)" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": { 510 | "_uuid": "b37aca89d92439a9777bb7634dcd12aef2162771" 511 | }, 512 | "source": [ 513 | "The accuracy on validation data about 93%. Very good result for a classification task with six-classes.\n", 514 | "The performance can be further improved by training the model a few more iteration.\n", 515 | "\n", 516 | "**Let's look closer to model predictions and attentions**" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "metadata": { 523 | "_uuid": "6a5e94835a8aa88b8609a95e80add37fc1ffd4d7", 524 | "collapsed": true 525 | }, 526 | "outputs": [], 527 | "source": [ 528 | "# Re-create the model to get attention vectors as well as label prediction\n", 529 | "model_with_attentions = keras.Model(inputs=model.input,\n", 530 | " outputs=[model.output, \n", 531 | " model.get_layer('attention_vec').output])" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": { 538 | "_uuid": "f7f1a8770b09a221787e38376392ba977172c215", 539 | "collapsed": true, 540 | "scrolled": true 541 | }, 542 | "outputs": [], 543 | "source": [ 544 | "import random\n", 545 | "import math\n", 546 | "\n", 547 | "# Select random samples to illustrate\n", 548 | "sample_text = random.choice(dataset[\"text\"].values.tolist())\n", 549 | "\n", 550 | "# Encode samples\n", 551 | "tokenized_sample = sample_text.split(\" \")\n", 552 | "encoded_samples = [[word2id[word] for word in tokenized_sample]]\n", 553 | "\n", 554 | "# Padding\n", 555 | "encoded_samples = keras.preprocessing.sequence.pad_sequences(encoded_samples, maxlen=max_words)\n", 556 | "\n", 557 | "# Make predictions\n", 558 | "label_probs, attentions = model_with_attentions.predict(encoded_samples)\n", 559 | "label_probs = {id2label[_id]: prob for (label, _id), prob in zip(label2id.items(),label_probs[0])}\n", 560 | "\n", 561 | "# Get word attentions using attenion vector\n", 562 | "token_attention_dic = {}\n", 563 | "max_score = 0.0\n", 564 | "min_score = 0.0\n", 565 | "for token, attention_score in zip(tokenized_sample, attentions[0][-len(tokenized_sample):]):\n", 566 | " token_attention_dic[token] = math.sqrt(attention_score)\n", 567 | "\n", 568 | "\n", 569 | "# VISUALIZATION\n", 570 | "import matplotlib.pyplot as plt; plt.rcdefaults()\n", 571 | "import numpy as np\n", 572 | "import matplotlib.pyplot as plt\n", 573 | "from IPython.core.display import display, HTML\n", 574 | "\n", 575 | "def rgb_to_hex(rgb):\n", 576 | " return '#%02x%02x%02x' % rgb\n", 577 | " \n", 578 | "def attention2color(attention_score):\n", 579 | " r = 255 - int(attention_score * 255)\n", 580 | " color = rgb_to_hex((255, r, r))\n", 581 | " return str(color)\n", 582 | " \n", 583 | "# Build HTML String to viualize attentions\n", 584 | "html_text = \"

Text: \"\n", 585 | "for token, attention in token_attention_dic.items():\n", 586 | " html_text += \"{} \".format(attention2color(attention),\n", 587 | " token)\n", 588 | "html_text += \"

\"\n", 589 | "# Display text enriched with attention scores \n", 590 | "display(HTML(html_text))\n", 591 | "\n", 592 | "# PLOT EMOTION SCORES\n", 593 | "emotions = [label for label, _ in label_probs.items()]\n", 594 | "scores = [score for _, score in label_probs.items()]\n", 595 | "plt.figure(figsize=(5,2))\n", 596 | "plt.bar(np.arange(len(emotions)), scores, align='center', alpha=0.5, color=['black', 'red', 'green', 'blue', 'cyan', \"purple\"])\n", 597 | "plt.xticks(np.arange(len(emotions)), emotions)\n", 598 | "plt.ylabel('Scores')\n", 599 | "plt.show()\n", 600 | "\n" 601 | ] 602 | }, 603 | { 604 | "cell_type": "markdown", 605 | "metadata": { 606 | "_uuid": "dd58f4f2b92b103765af428baee13a53d80eb4e9" 607 | }, 608 | "source": [ 609 | "**We have used an attention mechanism with an LSTM network to recognize emotions in given text.\n", 610 | "We show that attention mechanism can be useful for classification tasks as well as sequence labeling tasks.\n", 611 | "We have illustrated the attentions in order to make model predictions interpretable and look fancy.\n", 612 | "Enjoy attentions mechanism in different applications...**\n", 613 | "\n", 614 | "*All feedbacks are welcome.*\n", 615 | "\n" 616 | ] 617 | } 618 | ], 619 | "metadata": { 620 | "kernelspec": { 621 | "display_name": "Python 3", 622 | "language": "python", 623 | "name": "python3" 624 | }, 625 | "language_info": { 626 | "codemirror_mode": { 627 | "name": "ipython", 628 | "version": 3 629 | }, 630 | "file_extension": ".py", 631 | "mimetype": "text/x-python", 632 | "name": "python", 633 | "nbconvert_exporter": "python", 634 | "pygments_lexer": "ipython3", 635 | "version": "3.6.3" 636 | } 637 | }, 638 | "nbformat": 4, 639 | "nbformat_minor": 1 640 | } 641 | -------------------------------------------------------------------------------- /notebook/word_analysis_local.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "2019-07-04 22:42:21,041 util_log.py[128] INFO Start the program at:LALI2-M-G0MD, 127.0.0.1, with:Load module\n", 13 | "2019-07-04 22:42:21,045 util_pandas.py[19] WARNING \"No such keys(s): 'display.height'\"\n" 14 | ] 15 | }, 16 | { 17 | "name": "stdout", 18 | "output_type": "stream", 19 | "text": [ 20 | "yes\n", 21 | "/Users/lali2/Documents/workspace_py/xf_tag\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "import sys\n", 27 | "import os\n", 28 | "os.chdir('../')\n", 29 | "\n", 30 | "\n", 31 | "import pandas as pd\n", 32 | "import numpy as np\n", 33 | "\n", 34 | "from bokeh.palettes import Category10\n", 35 | "\n", 36 | "\n", 37 | "from tqdm import tqdm\n", 38 | "\n", 39 | "\n", 40 | "from file_cache.utils.util_pandas import *\n", 41 | "from file_cache.cache import file_cache\n", 42 | "from functools import lru_cache\n", 43 | "from glob import glob\n", 44 | "\n", 45 | "%matplotlib inline\n", 46 | "from core.conf import *\n", 47 | "!pwd" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 62, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "(94354, 2)\n" 60 | ] 61 | }, 62 | { 63 | "data": { 64 | "text/html": [ 65 | "
\n", 66 | "\n", 79 | "\n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | "
app_idapp_des
0BB29DA6F8167CFC99E0853741C4EB17B注意]游戏需要在设备上自己的歌曲注意]音乐赛车是一个音乐改编的赛车游戏,你用你自己的音乐比赛...
1BB2A78EA7AD4945EAF6E38997F6139A3定位试衣到家是一款基于地理位置,提供试衣到家专属购物体验的互联网平台。购物流程客户在试衣到家...
2BB2B1604CFA079C289FECF927DFBCE89想念一个人,就说出来。记得要下载安卓锁屏才可正常显示锁屏效果哦~更新内容更稳定、更优质,邀您...
3BB2C7BD0B0623644183DAD08A89E1D90闽通宝手机客户端是基于移动互联网的,以公众出行服务为基础,贯彻绿色出行,低碳生活的理念,为出...
4BB2E1A8F56158E483D7461E930E6332F风靡全球的DIY照片桌面,干净、流畅,启动提速100,瞬间提升手机性能;更是一亿用户的共同选...
\n", 115 | "
" 116 | ], 117 | "text/plain": [ 118 | " app_id \\\n", 119 | "0 BB29DA6F8167CFC99E0853741C4EB17B \n", 120 | "1 BB2A78EA7AD4945EAF6E38997F6139A3 \n", 121 | "2 BB2B1604CFA079C289FECF927DFBCE89 \n", 122 | "3 BB2C7BD0B0623644183DAD08A89E1D90 \n", 123 | "4 BB2E1A8F56158E483D7461E930E6332F \n", 124 | "\n", 125 | " app_des \n", 126 | "0 注意]游戏需要在设备上自己的歌曲注意]音乐赛车是一个音乐改编的赛车游戏,你用你自己的音乐比赛... \n", 127 | "1 定位试衣到家是一款基于地理位置,提供试衣到家专属购物体验的互联网平台。购物流程客户在试衣到家... \n", 128 | "2 想念一个人,就说出来。记得要下载安卓锁屏才可正常显示锁屏效果哦~更新内容更稳定、更优质,邀您... \n", 129 | "3 闽通宝手机客户端是基于移动互联网的,以公众出行服务为基础,贯彻绿色出行,低碳生活的理念,为出... \n", 130 | "4 风靡全球的DIY照片桌面,干净、流畅,启动提速100,瞬间提升手机性能;更是一亿用户的共同选... " 131 | ] 132 | }, 133 | "execution_count": 62, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "app_desc = pd.read_csv(f'{input_dir}/app_desc.dat', delimiter='\\t', header=None, names =['app_id', 'app_des'])\n", 140 | "print(app_desc.shape)\n", 141 | "app_desc.head()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 63, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "(152, 2)\n" 154 | ] 155 | }, 156 | { 157 | "data": { 158 | "text/html": [ 159 | "
\n", 160 | "\n", 173 | "\n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | "
type_idtype_name
01401便捷生活
11402游戏
21403通讯社交
31404阅读
41405工作求职
\n", 209 | "
" 210 | ], 211 | "text/plain": [ 212 | " type_id type_name\n", 213 | "0 1401 便捷生活\n", 214 | "1 1402 游戏\n", 215 | "2 1403 通讯社交\n", 216 | "3 1404 阅读\n", 217 | "4 1405 工作求职" 218 | ] 219 | }, 220 | "execution_count": 63, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "app_type = pd.read_csv(f'{input_dir}/apptype_id_name.txt', delimiter='\\t', names =['type_id', 'type_name'] )\n", 227 | "print(app_type.shape)\n", 228 | "app_type.head()" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 65, 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "name": "stdout", 238 | "output_type": "stream", 239 | "text": [ 240 | "(30000, 3)\n" 241 | ] 242 | }, 243 | { 244 | "data": { 245 | "text/html": [ 246 | "
\n", 247 | "\n", 260 | "\n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | "
app_idtype_idapp_des
000000777CE5B5AA5C1AC94DB8EABE0AC140203《游戏王座》使用说明书成分由怪兽卡、魔法卡、陷阱卡合计数千张卡牌以及刺激性、耐久性玩法组成。...
10000DEC36E15C27DBFC64AB8208C4B37140206更稳定、更优质,邀您一起。
20001791406307B1D1CE2BC64A830B7C7142106《小钱袋》是一款免费网络版记帐软件,适用于个人记帐、家庭记帐、团队记帐,全程帮您安全记录您财...
30002F14825B9CA01653325EEFD69D790142701领先的周易服务平台高人汇,汇聚算命大师、风水大师、占卜大师、手相大师、起名大师、算命先生、面...
4000419D79365331F89399E5F38A91B05140901平行空间是一款极简、免费的黑科技双开助手;您可以在平行空间双开微信微博、陌陌、映客、yy等应...
\n", 302 | "
" 303 | ], 304 | "text/plain": [ 305 | " app_id type_id \\\n", 306 | "0 00000777CE5B5AA5C1AC94DB8EABE0AC 140203 \n", 307 | "1 0000DEC36E15C27DBFC64AB8208C4B37 140206 \n", 308 | "2 0001791406307B1D1CE2BC64A830B7C7 142106 \n", 309 | "3 0002F14825B9CA01653325EEFD69D790 142701 \n", 310 | "4 000419D79365331F89399E5F38A91B05 140901 \n", 311 | "\n", 312 | " app_des \n", 313 | "0 《游戏王座》使用说明书成分由怪兽卡、魔法卡、陷阱卡合计数千张卡牌以及刺激性、耐久性玩法组成。... \n", 314 | "1 更稳定、更优质,邀您一起。 \n", 315 | "2 《小钱袋》是一款免费网络版记帐软件,适用于个人记帐、家庭记帐、团队记帐,全程帮您安全记录您财... \n", 316 | "3 领先的周易服务平台高人汇,汇聚算命大师、风水大师、占卜大师、手相大师、起名大师、算命先生、面... \n", 317 | "4 平行空间是一款极简、免费的黑科技双开助手;您可以在平行空间双开微信微博、陌陌、映客、yy等应... " 318 | ] 319 | }, 320 | "execution_count": 65, 321 | "metadata": {}, 322 | "output_type": "execute_result" 323 | } 324 | ], 325 | "source": [ 326 | "import csv\n", 327 | "apptype_train = pd.read_csv(f'{input_dir}/apptype_train.dat', sep='\\t', \n", 328 | " names =['app_id', 'type_id', 'app_des'] , \n", 329 | " quoting=3\n", 330 | " )\n", 331 | "print(apptype_train.shape)\n", 332 | "apptype_train.head()\n", 333 | "#apptype_train.iloc[2,2]" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 66, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "data": { 343 | "text/html": [ 344 | "
\n", 345 | "\n", 358 | "\n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | "
app_idtype_idapp_des
1594563959834D8FB9D68C03A75C9BB0906EA140206在全新的地图中,你将与戴夫一起面对驾驶飞行器呼啸而来的僵尸军团,肩负起守卫天空之城的重任,同...
1594663961F67B88D3D7D877101F80A53E5CD140901部分小错误,整体。
159476396C70B6383F0BF243EF69927ACF35F140901以太大陆EthMin 以太大陆是一个数字生态世界,帮助个体管理在现实世界中所付出的努力与贡献...
159486396F4C27E1F1D86762B9283D701DB78142501帮助准妈妈在分娩前记录宫缩频率和时长,以判断是否达到就医标准。遇到问题,可1对1在线咨询产科...
1594963997AB7F3E277BC0CB1D42C3D8360F4142103线上线下优势资源整合  不必四处奔波,专属咨询顾问为您服务。  安心快速无抵押  去繁求简,...
15950639B889103E0AFD7D23E8C593DB6A6D1140211更稳定、更优质,邀您一起。
15951639BC48DB51B5806B726B392224F0CA8142102金钱永不眠一个股票账户,一笔钱投资美股/港股/英股/A股;全球资产配置的一站式股票平台享受一...
15952639C08D6CA2142E0CFD60E64DFB7C326140901文字转语音合成免费语音翻译、文本朗读、红包口令、普通话吆喝广告音频合成,一款专业进行文字转语...
15953639C9663BB3CABFA048B3A54ED9B8CC9140401在微博,官方发布新闻,草根爆料八卦;在微博,大V明星发布动态,粉丝狗仔爆料内幕;在微博,海量...
15954639DBC25084151D681F73C1A331B6CBA140210比斗地主麻将更简单、比炸金花牛牛更刺激,全球百姓共同推荐锻炼情商智商,大奖话费信手拈来。客官...
1595563A0262B4C6D416DC2816B15E716C31D142103\"贷款App3分钟下款20000元 通过率高达96.7 贷款借钱急用钱藕丁钱包,帮您闪电周转...
1595663A09CBD3873CE47A42BC285705B8431140901这是一款优客工场官方发布的移动客户端,涵盖了优客工场支持的所有辅助功能,旨在为你和你的团队提...
1595763A0C80FD1F955F8C53AFE69291EC652140107天天快递是一款兼有邮递功能的门对门物流活动的手机客户端。不仅具有下单、查询订单、搜索商品、晒...
1595863A2BCCA93BB9ED948A2892CFEF4CFCE140207喜羊羊快跑是一款跑酷类的游戏。喜羊羊快跑该游戏让你分分钟忆起我们一起看过的喜羊羊以喜羊羊为主...
1595963A9AFD1952D1EF58A3743CA5BD76602140206住在城市下面鳄鱼Swmpy希望过上人类一样的生活。他非常喜欢干净。可鳄鱼Crnky不满Swm...
1596063AA2D5AFFD768100625F947BA030B48142105一款信息查询辅助应用软件,服务广大福彩彩民,为福彩中心的各种促销活动提供平台支持。主要功能纸...
1596163AB66CD8D27C6B269F4960FB530AA76142104国美金融App简介国美金融App是综合性金融服务平台,致力为个人和企业提供定制化财富管理服务...
1596263AD9FA5338921C66943390ADA5DCF23142102华福证券网上开户2.0系统,采用人脸单向识别技术,优化开户排队机制,可绑定老账户,为您节省宝...
1596363AF8C9C9E16F935F8F424533D24FD40140701软件简介学霸君1对1学生端是学霸君旗下一款中小学在线1对1辅导应用软件。基于学霸君超过900...
1596463B0E4D5A2319B7684D8959D3703B7C4140404曹雪芹著的经典文学名著《红楼梦》,特殊的幼儿早教方式,让您的孩子赢在起跑线上,大人和儿童皆宜...
1596563B1568A4BA00BB36247F3FE7E63D046140210大家都想让自己成为一名优秀的象棋手吧那就赶快行动起来,锻炼自己,让自己成为万人瞩目的象棋大师...
1596663B5F7FD3037C633611E405BF76357A6140901流量监控软件是一款功能强大的android流量管理程序。它可以根据不同的android系统版...
1596763B6A2A65E22AB3BEF8E6E3627058005140901三星应用商店GlxyApps是三星官方开发和运营的应用下载平台,拥有数十万款应用、游戏和主题...
1596863B8474AE7D557EB69107C1C8D67293B140212海陆空立体战争手游《抢滩登陆3D》是由美国DIGITALFUSION公司正版授权,对比经典抢...
1596963B89ACCF7E7BB4E8048A4430A61198E140603《音乐达人可可摇滚明星MusicIdolCocoRockStr》是一款休闲娱乐游戏。请想象一...
1597063BA67638DB01DFD3BE2B89A6DA9C632140802乌鲁木齐地铁官方APP,为您提供全新出行方式。地铁购票&mdash;&mdash;乘客可通过...
1597163BA6BE8E50C34BC6C08F39487BF3063140404产品简介免费追书吧,一款专业免费的电子书阅读软件,爱阅读的小伙伴千万不要错过。全本小说免费阅...
1597263BBCDE7DE3AE668D03FAB004E986F4F140301|140604人人是一个火爆校园的高颜值网红美女视频交友直播平台,同城交友约会聊天,明星艺人在线直播,72...
1597363BD79538A92F05644BE6AD23D87B545140603铃声多多手机铃声大全中国移动、中国电信、中国联通3大运营商音乐基地战略合作产品百万铃声,轻松...
1597463BF35D999C3B21BB0E783CD56FD60D0140207《侍灵》是一款日式暗黑系的横版格斗QTE手游,在动作游戏领域中做出了大胆的创新,通过策略搭配...
1597563BFFE1204509BBA9BD9E0E406FB2A38142103风云管家,信用卡管家智能还款神器信用卡高端玩家的选择账单全额自动还款2000万实体商户,餐饮...
1597663C0F5069E829510104C56911CF571D1140207两种汽车的反应机是很有趣的游戏。你的任务是点击需要的图片点击得比对象快。这款游戏很好的训练注...
1597763C5FA30A92F3B99258FA6085EE90D91141201通运先培后付学员端通运先培后付App学员端是针对学车人士开发的一款功能性APP。通运学员端引...
1597863CA760775B2CD3D62995F657568CC8E141001宝宝学加减法,是宝宝巴士专为5+宝贝设计的数学软件,让孩子轻松学习加减法,赢在起跑线~难度贴...
1597963CB103A546C380870C8A3FA53A14208140113长安通APP是一款便民生活服务软件。用户使用长安通APP,可以实现长安通卡NFC充值、查询余...
\n", 580 | "
" 581 | ], 582 | "text/plain": [ 583 | " app_id type_id \\\n", 584 | "15945 63959834D8FB9D68C03A75C9BB0906EA 140206 \n", 585 | "15946 63961F67B88D3D7D877101F80A53E5CD 140901 \n", 586 | "15947 6396C70B6383F0BF243EF69927ACF35F 140901 \n", 587 | "15948 6396F4C27E1F1D86762B9283D701DB78 142501 \n", 588 | "15949 63997AB7F3E277BC0CB1D42C3D8360F4 142103 \n", 589 | "15950 639B889103E0AFD7D23E8C593DB6A6D1 140211 \n", 590 | "15951 639BC48DB51B5806B726B392224F0CA8 142102 \n", 591 | "15952 639C08D6CA2142E0CFD60E64DFB7C326 140901 \n", 592 | "15953 639C9663BB3CABFA048B3A54ED9B8CC9 140401 \n", 593 | "15954 639DBC25084151D681F73C1A331B6CBA 140210 \n", 594 | "15955 63A0262B4C6D416DC2816B15E716C31D 142103 \n", 595 | "15956 63A09CBD3873CE47A42BC285705B8431 140901 \n", 596 | "15957 63A0C80FD1F955F8C53AFE69291EC652 140107 \n", 597 | "15958 63A2BCCA93BB9ED948A2892CFEF4CFCE 140207 \n", 598 | "15959 63A9AFD1952D1EF58A3743CA5BD76602 140206 \n", 599 | "15960 63AA2D5AFFD768100625F947BA030B48 142105 \n", 600 | "15961 63AB66CD8D27C6B269F4960FB530AA76 142104 \n", 601 | "15962 63AD9FA5338921C66943390ADA5DCF23 142102 \n", 602 | "15963 63AF8C9C9E16F935F8F424533D24FD40 140701 \n", 603 | "15964 63B0E4D5A2319B7684D8959D3703B7C4 140404 \n", 604 | "15965 63B1568A4BA00BB36247F3FE7E63D046 140210 \n", 605 | "15966 63B5F7FD3037C633611E405BF76357A6 140901 \n", 606 | "15967 63B6A2A65E22AB3BEF8E6E3627058005 140901 \n", 607 | "15968 63B8474AE7D557EB69107C1C8D67293B 140212 \n", 608 | "15969 63B89ACCF7E7BB4E8048A4430A61198E 140603 \n", 609 | "15970 63BA67638DB01DFD3BE2B89A6DA9C632 140802 \n", 610 | "15971 63BA6BE8E50C34BC6C08F39487BF3063 140404 \n", 611 | "15972 63BBCDE7DE3AE668D03FAB004E986F4F 140301|140604 \n", 612 | "15973 63BD79538A92F05644BE6AD23D87B545 140603 \n", 613 | "15974 63BF35D999C3B21BB0E783CD56FD60D0 140207 \n", 614 | "15975 63BFFE1204509BBA9BD9E0E406FB2A38 142103 \n", 615 | "15976 63C0F5069E829510104C56911CF571D1 140207 \n", 616 | "15977 63C5FA30A92F3B99258FA6085EE90D91 141201 \n", 617 | "15978 63CA760775B2CD3D62995F657568CC8E 141001 \n", 618 | "15979 63CB103A546C380870C8A3FA53A14208 140113 \n", 619 | "\n", 620 | " app_des \n", 621 | "15945 在全新的地图中,你将与戴夫一起面对驾驶飞行器呼啸而来的僵尸军团,肩负起守卫天空之城的重任,同... \n", 622 | "15946 部分小错误,整体。 \n", 623 | "15947 以太大陆EthMin 以太大陆是一个数字生态世界,帮助个体管理在现实世界中所付出的努力与贡献... \n", 624 | "15948 帮助准妈妈在分娩前记录宫缩频率和时长,以判断是否达到就医标准。遇到问题,可1对1在线咨询产科... \n", 625 | "15949 线上线下优势资源整合  不必四处奔波,专属咨询顾问为您服务。  安心快速无抵押  去繁求简,... \n", 626 | "15950 更稳定、更优质,邀您一起。 \n", 627 | "15951 金钱永不眠一个股票账户,一笔钱投资美股/港股/英股/A股;全球资产配置的一站式股票平台享受一... \n", 628 | "15952 文字转语音合成免费语音翻译、文本朗读、红包口令、普通话吆喝广告音频合成,一款专业进行文字转语... \n", 629 | "15953 在微博,官方发布新闻,草根爆料八卦;在微博,大V明星发布动态,粉丝狗仔爆料内幕;在微博,海量... \n", 630 | "15954 比斗地主麻将更简单、比炸金花牛牛更刺激,全球百姓共同推荐锻炼情商智商,大奖话费信手拈来。客官... \n", 631 | "15955 \"贷款App3分钟下款20000元 通过率高达96.7 贷款借钱急用钱藕丁钱包,帮您闪电周转... \n", 632 | "15956 这是一款优客工场官方发布的移动客户端,涵盖了优客工场支持的所有辅助功能,旨在为你和你的团队提... \n", 633 | "15957 天天快递是一款兼有邮递功能的门对门物流活动的手机客户端。不仅具有下单、查询订单、搜索商品、晒... \n", 634 | "15958 喜羊羊快跑是一款跑酷类的游戏。喜羊羊快跑该游戏让你分分钟忆起我们一起看过的喜羊羊以喜羊羊为主... \n", 635 | "15959 住在城市下面鳄鱼Swmpy希望过上人类一样的生活。他非常喜欢干净。可鳄鱼Crnky不满Swm... \n", 636 | "15960 一款信息查询辅助应用软件,服务广大福彩彩民,为福彩中心的各种促销活动提供平台支持。主要功能纸... \n", 637 | "15961 国美金融App简介国美金融App是综合性金融服务平台,致力为个人和企业提供定制化财富管理服务... \n", 638 | "15962 华福证券网上开户2.0系统,采用人脸单向识别技术,优化开户排队机制,可绑定老账户,为您节省宝... \n", 639 | "15963 软件简介学霸君1对1学生端是学霸君旗下一款中小学在线1对1辅导应用软件。基于学霸君超过900... \n", 640 | "15964 曹雪芹著的经典文学名著《红楼梦》,特殊的幼儿早教方式,让您的孩子赢在起跑线上,大人和儿童皆宜... \n", 641 | "15965 大家都想让自己成为一名优秀的象棋手吧那就赶快行动起来,锻炼自己,让自己成为万人瞩目的象棋大师... \n", 642 | "15966 流量监控软件是一款功能强大的android流量管理程序。它可以根据不同的android系统版... 
\n", 643 | "15967 三星应用商店GlxyApps是三星官方开发和运营的应用下载平台,拥有数十万款应用、游戏和主题... \n", 644 | "15968 海陆空立体战争手游《抢滩登陆3D》是由美国DIGITALFUSION公司正版授权,对比经典抢... \n", 645 | "15969 《音乐达人可可摇滚明星MusicIdolCocoRockStr》是一款休闲娱乐游戏。请想象一... \n", 646 | "15970 乌鲁木齐地铁官方APP,为您提供全新出行方式。地铁购票——乘客可通过... \n", 647 | "15971 产品简介免费追书吧,一款专业免费的电子书阅读软件,爱阅读的小伙伴千万不要错过。全本小说免费阅... \n", 648 | "15972 人人是一个火爆校园的高颜值网红美女视频交友直播平台,同城交友约会聊天,明星艺人在线直播,72... \n", 649 | "15973 铃声多多手机铃声大全中国移动、中国电信、中国联通3大运营商音乐基地战略合作产品百万铃声,轻松... \n", 650 | "15974 《侍灵》是一款日式暗黑系的横版格斗QTE手游,在动作游戏领域中做出了大胆的创新,通过策略搭配... \n", 651 | "15975 风云管家,信用卡管家智能还款神器信用卡高端玩家的选择账单全额自动还款2000万实体商户,餐饮... \n", 652 | "15976 两种汽车的反应机是很有趣的游戏。你的任务是点击需要的图片点击得比对象快。这款游戏很好的训练注... \n", 653 | "15977 通运先培后付学员端通运先培后付App学员端是针对学车人士开发的一款功能性APP。通运学员端引... \n", 654 | "15978 宝宝学加减法,是宝宝巴士专为5+宝贝设计的数学软件,让孩子轻松学习加减法,赢在起跑线~难度贴... \n", 655 | "15979 长安通APP是一款便民生活服务软件。用户使用长安通APP,可以实现长安通卡NFC充值、查询余... " 656 | ] 657 | }, 658 | "execution_count": 66, 659 | "metadata": {}, 660 | "output_type": "execute_result" 661 | } 662 | ], 663 | "source": [ 664 | "apptype_train.iloc[15945:15980]" 665 | ] 666 | } 667 | ], 668 | "metadata": { 669 | "kernelspec": { 670 | "display_name": "Python 3", 671 | "language": "python", 672 | "name": "python3" 673 | }, 674 | "language_info": { 675 | "codemirror_mode": { 676 | "name": "ipython", 677 | "version": 3 678 | }, 679 | "file_extension": ".py", 680 | "mimetype": "text/x-python", 681 | "name": "python", 682 | "nbconvert_exporter": "python", 683 | "pygments_lexer": "ipython3", 684 | "version": "3.6.8" 685 | } 686 | }, 687 | "nbformat": 4, 688 | "nbformat_minor": 2 689 | } 690 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | 2 | # 数据的准备 3 | 需要下载bert的预训练模型,然后在conf中修改对应的变量pretrained_path 4 | 5 | 下载地址: https://docs.google.com/uc?export=download&id=1W3WgPJWGVKlU9wpUYsdZuurAIFKvrl_Y 6 | 7 | 8 | # 运行方式 9 | ## 环境准备 10 | * 依赖包的安装 11 | 12 | pip install -r requirements.txt 13 | 14 | * GPU的指定 15 | 16 | 修改bert.py中的如下代码 17 | 18 | os.environ["CUDA_VISIBLE_DEVICES"] = "2" 19 | 20 | * OOF的生成: BERT 21 | 22 | 以5折的方式运行程序,并且每一折都运行3次 23 | 24 | nohup ./bin/main.sh 3 & 25 | 26 | 27 | * 融合OOF: 对生成的OOF合并生成提交文件: 28 | 29 | nohup python -u ./core/ensemble_new.py main >> ensemble_final.log 2>&1 & 30 | 31 | * 生成的提交文件可见于 ./output/sub/ 32 | 33 | 34 | # 数据的爬取 35 | * 百度 36 | * 应用宝 37 | * 豌豆荚 38 | * 小米应用商店 39 | 40 | # 数据清洗 41 | - 异常数据的发现 42 | 43 | - appname 为 #ffababab 44 | ffababab com.hisense.uifac 45 | FACTORY MENU com.hisense.uifac 46 | 工厂菜单 com.hisense.uifac 47 | 48 | - appname 为 WXEntryActivity (138个) 49 | WXEntryActivity com.tencent.ft 50 | 妖精的尾巴:魔导少年 com.tencent.ft 51 | 52 | - 百度手机助手(*) 53 | 百度手机助手(米聊) 54 | 百度手机助手(平安金管家) 55 | 百度手机助手(掌上电力) 56 | 57 | 58 | 59 | - 繁体字 60 | 所有的繁体字转简体字 61 | 62 | 63 | # 数据 64 | - 数据增强(切割) 65 | 66 | 由于bert等模型,都对输入的seq_len有字数限制. 余下的数据只能浪费.数据增强的一点是对浪费的数据进行利用. 67 | 68 | 数据seq_len不是针对字符串的长度,而是bert对输入转ID之后的长度,否则对英文的影响是巨大的. 
69 | 70 | 具体源码可以参考:get_feature_bert(seq_len) 71 | 72 | - 数据增强(不同源) 73 | 74 | 由于第二期,是自行爬取数据,往往对于一条数据可以从多个源爬取数据.如果只选择一个源,则会对数据进行另外的一种浪费 75 | 76 | 对数据进行选择或者融合,需要对同一条数据查询到的多个结果根据下列因素,赋予不同权重 77 | 1) input appname 是否等于 output appname 78 | 2) input appnane 是否包含于 output appname 79 | 3) input pkg 是否等于 output pkg 80 | 4) 返回结果的长度 81 | 5) Appname 是否出现在返回结果中 82 | 6) 数据源的不同, 百度, 豌豆荚, 小米应用商店, 应用宝 83 | 84 | 85 | 86 | 87 | - 过拟合的避免 88 | 89 | 数据增强后, 如果同一源头的数据如果切分到了训练集和验证集,会造成本地分数虚高.所以需要保证同源数据分到同一fold 90 | 91 | 92 | # 算法 93 | - 不同seq_len 94 | 95 | 不同的seq_len 会有不同的表现,可以择优保留,然后融合 96 | 97 | - 不同模型 98 | 99 | bert: 由于Bert有字数限制,最长512,并且太长效果并不是特别好.只能对局部信息进行利用 100 | lstm: LSTM对比Bert可以设置较大的训练窗口,利用大部分数据.和bert进行融合,是一个极大的补充. 101 | 102 | - 克服抖动 103 | 104 | 由于利用GPU进行训练,结果往往会有一定的抖动,可以对同一模型训练多轮,择优保留 105 | 106 | 107 | # 后处理/融合 108 | 109 | - 不同数据源 110 | 111 | 数据增强,不仅仅针对训练集, 对测试集也做了对应的增强. 这样,同一条测试集,会有多个预测结果.这样对这些结果进行加权融合会有比较好的结果. 112 | 113 | 114 | - 不同模型 115 | 116 | 同一条测试集,会有多个模型进行预测,然后进行加权融合. 117 | 118 | - 不同的切割序列 119 | 120 | 不同模型,选择不同的seq_len,得到的不同结果,然后融合. 121 | 122 | # 团队成员 123 | 牛张明 124 | 攻城狮 125 | 周青松 126 | 林智敏 127 | 罗宾理 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /readme2.md: -------------------------------------------------------------------------------- 1 | # Initial 2 | 3 | # 腾讯词向量下载地址 4 | 5 | - https://ai.tencent.com/ailab/nlp/data/Tencent_AILab_ChineseEmbedding.tar.gz 6 | ln -s ../../word_vec/Tencent_AILab_ChineseEmbedding.txt Tencent_AILab_ChineseEmbedding.txt 7 | - https://github.com/Embedding/Chinese-Word-Vectors 8 | Zhihu_QA 知乎问答 9 | 10 | # 去除引号 11 | sed -i 's/"//g' *.* 12 | 13 | 14 | # 停用词 15 | https://github.com/goto456/stopwords 16 | 17 | 18 | # Type_id Cnt: 19 | 152 20 | 21 | # Bert 22 | 23 | wget -q https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip 24 | wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip 25 | the download model will save to folder: input/model/chinese_L-12_H-768_A-12 26 | 27 | 28 | # 输入目录 29 | 30 | input/ 31 | ├── model 32 | │   ├── chinese_L-12_H-768_A-12 33 | │   │   ├── bert_config.json 34 | │   │   ├── bert_model.ckpt.data-00000-of-00001 35 | │   │   ├── bert_model.ckpt.index 36 | │   │   ├── bert_model.ckpt.meta 37 | │   │   └── vocab.txt 38 | │   └── uncased_L-12_H-768_A-12.zip 39 | ├── Tencent_AILab_ChineseEmbedding.txt 40 | └── zip 41 | ├── app_desc.dat 42 | ├── apptype_id_name.txt 43 | ├── apptype_train.dat 44 | └── mini.dat 45 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | keras-bert==0.57.1 2 | jieba 3 | deprecated 4 | urllib3 5 | file_cache==0.1.36 6 | keras==2.2.4 7 | tables 8 | 9 | #tensorflow==1.11.0 10 | #tensorflow-gpu==1.11.0 11 | 12 | tensorflow==1.13.1 13 | tensorflow-gpu==1.13.1 14 | 15 | -------------------------------------------------------------------------------- /spider/gen_file.py: -------------------------------------------------------------------------------- 1 | from spider.mi import * 2 | import re 3 | from core.conf import check_type_list 4 | 5 | if __name__ == '__main__': 6 | import sys 7 | 8 | print(sys.argv) 9 | 10 | #max_len = sys.argv[1] 11 | 12 | sys.argv=['a','b'] 13 | 14 | final = get_final_feature() 15 | final.desc_name = final.desc_name.str[:4000] 16 | 17 | file = './input/zip/apptype_train.dat_p2' 18 | 19 | train = final.loc[final.type_id.str.len() > 0] 20 | train['id'] = 'valxxx' 
+ train['id'].str[6:] 21 | train.loc[:, ['id', 'type_id', 'desc_name']].to_csv(file, sep='\t', header=None, index=None) 22 | print(f'save {len(train)} rows to {file} ') 23 | 24 | file='./input/zip/app_desc.dat' 25 | test = final.loc[final.type_id.str.len()==0]#.loc[final.type_id.str.len()==0] 26 | test = pd.concat([train, test]) 27 | test.loc[:,['id','desc_name']].to_csv(file, sep='\t',header=None, index=None) 28 | print(f'save {len(test)} rows to {file} ') 29 | 30 | 31 | 32 | 33 | 34 | # 35 | # ####################### 36 | # train_list = [] 37 | # for item in check_type_list: 38 | # if item =='stb': 39 | # continue 40 | # final = get_final_feature(item) 41 | # train = final.loc[final.type_id.str.len() > 0] 42 | # train['id'] = train['id'].apply(lambda val: item + 'x'*(6-len(item)) + val[6:]) 43 | # train_list.append(train) 44 | # 45 | # #Stb part 46 | # stb = pd.read_csv('./input/zip/78_app_desc.dat', sep='\t', header=None) 47 | # stb.columns = ['id', 'desc_name'] 48 | # tmp = get_train_ph2_index() 49 | # stb = stb.loc[stb['id'].isin(tmp['id'])] # .shape 50 | # stb.head() 51 | # stb.id = 'stbxxx' + stb.id.str[6:] 52 | # train_list.append(stb) 53 | # #Stb_end 54 | # 55 | # train = pd.concat(train_list) 56 | # file = f'./input/zip/app_desc.dat' 57 | # 58 | # train.loc[:, ['id', 'desc_name']].to_csv(file, sep='\t', header=None, index=None) 59 | # print(f'save to {file} ') 60 | 61 | """ 62 | python spider/gen_file.py 63 | """ 64 | -------------------------------------------------------------------------------- /tip.md: -------------------------------------------------------------------------------- 1 | 1)LDA 2 | 2)分解 tag, find in desc 3 | 3)辅助特征, 当前相似的词在文中出现的次数 4 | 4)tf-idf (基于type_name) 5 | 5)include(total, partial) 6 | 6)app type_name,分大组 7 | 7)从desc 过滤相似词 8 | 8)n-gram 去除? 
9 | 9)手工tocken 10 | 10)解释性 11 | 11)倒序挑选字符串 12 | 13 | 99998 app_desc.dat 14 | 29999 apptype_train.dat 15 | 16 | 1134 records have multiply type_id 17 | 18 | 130097 total 19 | 20 | 21 | 异常app: 22 | 1A23C73F4F3E892E2A9DF3C338B80313 -102 23 | D675211C835694A9F096B3AD3C8A9F79 -102 24 | 59913551A1752422F3B191E0C353309D -102 25 | 6FFCF1564CFA7547DEEEB5DDCC83A24B -102 26 | E2CC4670C695BFD41FC4ABFDE95C7B36 -102 27 | 0C8C840A534F32D8A608F50D67663E83 -102 28 | D396674F43367C4FDEF82CDA78756D4F -102 29 | 1F0AF6FA7424660692173FF4134903CB -102 30 | 31 | #Manual handle 32 | egrep dkplugin input/0823/* 33 | 34 | 35 | no case for 140208 36 | 37 | grep 'com.android.iconnect' ./input/0823/* 38 | 39 | 唐小僧 40 | 41 | WXEntryActivity 42 | 43 | 辽宁和教育 44 | 45 | leak between test and train 46 | 47 | https://android.myapp.com/myapp/detail.htm?apkName=com.kiees.android 48 | vs 49 | https://android.myapp.com/myapp/detail.htm?apkName=com.kiess 50 | 51 | 52 | grep 百度手机助手 ./input/0823/* 53 | 54 | name first place 55 | 56 | name partially match 57 | 58 | 融合,交叉数据源, 去除空格,大小写 一半a, 一半b 59 | 60 | 61 | 62 | #ffababab com.hisense.uifac 63 | FACTORY MENU com.hisense.uifac 64 | 工厂菜单 com.hisense.uifac 65 | 66 | 67 | aiqianjin.jiea 68 | 69 | WXEntryActivity 70 | 71 | 开心躲猫猫 72 | 当妈模拟器 73 | -------------------------------------------------------------------------------- /zhtools/langconv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from copy import deepcopy 5 | import re 6 | 7 | try: 8 | import psyco 9 | psyco.full() 10 | except: 11 | pass 12 | 13 | try: 14 | from zh_wiki import zh2Hant, zh2Hans 15 | except ImportError: 16 | from zhtools.zh_wiki import zh2Hant, zh2Hans 17 | 18 | import sys 19 | py3k = sys.version_info >= (3, 0, 0) 20 | 21 | if py3k: 22 | UEMPTY = '' 23 | else: 24 | _zh2Hant, _zh2Hans = {}, {} 25 | for old, new in ((zh2Hant, _zh2Hant), (zh2Hans, _zh2Hans)): 26 | for k, v in old.items(): 27 | new[k.decode('utf8')] = v.decode('utf8') 28 | zh2Hant = _zh2Hant 29 | zh2Hans = _zh2Hans 30 | UEMPTY = ''.decode('utf8') 31 | 32 | # states 33 | (START, END, FAIL, WAIT_TAIL) = list(range(4)) 34 | # conditions 35 | (TAIL, ERROR, MATCHED_SWITCH, UNMATCHED_SWITCH, CONNECTOR) = list(range(5)) 36 | 37 | MAPS = {} 38 | 39 | class Node(object): 40 | def __init__(self, from_word, to_word=None, is_tail=True, 41 | have_child=False): 42 | self.from_word = from_word 43 | if to_word is None: 44 | self.to_word = from_word 45 | self.data = (is_tail, have_child, from_word) 46 | self.is_original = True 47 | else: 48 | self.to_word = to_word or from_word 49 | self.data = (is_tail, have_child, to_word) 50 | self.is_original = False 51 | self.is_tail = is_tail 52 | self.have_child = have_child 53 | 54 | def is_original_long_word(self): 55 | return self.is_original and len(self.from_word)>1 56 | 57 | def is_follow(self, chars): 58 | return chars != self.from_word[:-1] 59 | 60 | def __str__(self): 61 | return '' % (repr(self.from_word), 62 | repr(self.to_word), self.is_tail, self.have_child) 63 | 64 | __repr__ = __str__ 65 | 66 | class ConvertMap(object): 67 | def __init__(self, name, mapping=None): 68 | self.name = name 69 | self._map = {} 70 | if mapping: 71 | self.set_convert_map(mapping) 72 | 73 | def set_convert_map(self, mapping): 74 | convert_map = {} 75 | have_child = {} 76 | max_key_length = 0 77 | for key in sorted(mapping.keys()): 78 | if len(key)>1: 79 | for i in range(1, len(key)): 80 | parent_key = key[:i] 81 | have_child[parent_key] = 
True 82 | have_child[key] = False 83 | max_key_length = max(max_key_length, len(key)) 84 | for key in sorted(have_child.keys()): 85 | convert_map[key] = (key in mapping, have_child[key], 86 | mapping.get(key, UEMPTY)) 87 | self._map = convert_map 88 | self.max_key_length = max_key_length 89 | 90 | def __getitem__(self, k): 91 | try: 92 | is_tail, have_child, to_word = self._map[k] 93 | return Node(k, to_word, is_tail, have_child) 94 | except: 95 | return Node(k) 96 | 97 | def __contains__(self, k): 98 | return k in self._map 99 | 100 | def __len__(self): 101 | return len(self._map) 102 | 103 | class StatesMachineException(Exception): pass 104 | 105 | class StatesMachine(object): 106 | def __init__(self): 107 | self.state = START 108 | self.final = UEMPTY 109 | self.len = 0 110 | self.pool = UEMPTY 111 | 112 | def clone(self, pool): 113 | new = deepcopy(self) 114 | new.state = WAIT_TAIL 115 | new.pool = pool 116 | return new 117 | 118 | def feed(self, char, map): 119 | node = map[self.pool+char] 120 | 121 | if node.have_child: 122 | if node.is_tail: 123 | if node.is_original: 124 | cond = UNMATCHED_SWITCH 125 | else: 126 | cond = MATCHED_SWITCH 127 | else: 128 | cond = CONNECTOR 129 | else: 130 | if node.is_tail: 131 | cond = TAIL 132 | else: 133 | cond = ERROR 134 | 135 | new = None 136 | if cond == ERROR: 137 | self.state = FAIL 138 | elif cond == TAIL: 139 | if self.state == WAIT_TAIL and node.is_original_long_word(): 140 | self.state = FAIL 141 | else: 142 | self.final += node.to_word 143 | self.len += 1 144 | self.pool = UEMPTY 145 | self.state = END 146 | elif self.state == START or self.state == WAIT_TAIL: 147 | if cond == MATCHED_SWITCH: 148 | new = self.clone(node.from_word) 149 | self.final += node.to_word 150 | self.len += 1 151 | self.state = END 152 | self.pool = UEMPTY 153 | elif cond == UNMATCHED_SWITCH or cond == CONNECTOR: 154 | if self.state == START: 155 | new = self.clone(node.from_word) 156 | self.final += node.to_word 157 | self.len += 1 158 | self.state = END 159 | else: 160 | if node.is_follow(self.pool): 161 | self.state = FAIL 162 | else: 163 | self.pool = node.from_word 164 | elif self.state == END: 165 | # END is a new START 166 | self.state = START 167 | new = self.feed(char, map) 168 | elif self.state == FAIL: 169 | raise StatesMachineException('Translate States Machine ' 170 | 'have error with input data %s' % node) 171 | return new 172 | 173 | def __len__(self): 174 | return self.len + 1 175 | 176 | def __str__(self): 177 | return '' % ( 178 | id(self), self.pool, self.state, self.final) 179 | __repr__ = __str__ 180 | 181 | class Converter(object): 182 | def __init__(self, to_encoding): 183 | self.to_encoding = to_encoding 184 | self.map = MAPS[to_encoding] 185 | self.start() 186 | 187 | def feed(self, char): 188 | branches = [] 189 | for fsm in self.machines: 190 | new = fsm.feed(char, self.map) 191 | if new: 192 | branches.append(new) 193 | if branches: 194 | self.machines.extend(branches) 195 | self.machines = [fsm for fsm in self.machines if fsm.state != FAIL] 196 | all_ok = True 197 | for fsm in self.machines: 198 | if fsm.state != END: 199 | all_ok = False 200 | if all_ok: 201 | self._clean() 202 | return self.get_result() 203 | 204 | def _clean(self): 205 | if len(self.machines): 206 | self.machines.sort(key=lambda x: len(x)) 207 | # self.machines.sort(cmp=lambda x,y: cmp(len(x), len(y))) 208 | self.final += self.machines[0].final 209 | self.machines = [StatesMachine()] 210 | 211 | def start(self): 212 | self.machines = [StatesMachine()] 213 | 
self.final = UEMPTY 214 | 215 | def end(self): 216 | self.machines = [fsm for fsm in self.machines 217 | if fsm.state == FAIL or fsm.state == END] 218 | self._clean() 219 | 220 | def convert(self, string): 221 | self.start() 222 | for char in string: 223 | self.feed(char) 224 | self.end() 225 | return self.get_result() 226 | 227 | def get_result(self): 228 | return self.final 229 | 230 | 231 | def registery(name, mapping): 232 | global MAPS 233 | MAPS[name] = ConvertMap(name, mapping) 234 | 235 | registery('zh-hant', zh2Hant) 236 | registery('zh-hans', zh2Hans) 237 | del zh2Hant, zh2Hans 238 | 239 | 240 | def run(): 241 | import sys 242 | from optparse import OptionParser 243 | parser = OptionParser() 244 | parser.add_option('-e', type='string', dest='encoding', 245 | help='encoding') 246 | parser.add_option('-f', type='string', dest='file_in', 247 | help='input file (- for stdin)') 248 | parser.add_option('-t', type='string', dest='file_out', 249 | help='output file') 250 | (options, args) = parser.parse_args() 251 | if not options.encoding: 252 | parser.error('encoding must be set') 253 | if options.file_in: 254 | if options.file_in == '-': 255 | file_in = sys.stdin 256 | else: 257 | file_in = open(options.file_in) 258 | else: 259 | file_in = sys.stdin 260 | if options.file_out: 261 | if options.file_out == '-': 262 | file_out = sys.stdout 263 | else: 264 | file_out = open(options.file_out, 'wb') 265 | else: 266 | file_out = sys.stdout 267 | 268 | c = Converter(options.encoding) 269 | for line in file_in: 270 | # print >> file_out, c.convert(line.rstrip('\n').decode( 271 | file_out.write(c.convert(line.rstrip('\n').decode( 272 | 'utf8')).encode('utf8')) 273 | 274 | 275 | if __name__ == '__main__': 276 | run() 277 | -------------------------------------------------------------------------------- /zhtools/test.py: -------------------------------------------------------------------------------- 1 | 2 | def Traditional2Simplified(sentence): 3 | ''' 4 | 将sentence中的繁体字转为简体字 5 | :param sentence: 待转换的句子 6 | :return: 将句子中繁体字转换为简体字之后的句子 7 | ''' 8 | from zhtools.langconv import Converter 9 | sentence = Converter('zh-hans').convert(sentence) 10 | return sentence 11 | 12 | if __name__=="__main__": 13 | traditional_sentence = '憂郁的臺灣烏龜,百度地圖' 14 | simplified_sentence = Traditional2Simplified(traditional_sentence) 15 | print(simplified_sentence) 16 | 17 | 18 | -------------------------------------------------------------------------------- /大数据标签-讯飞.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flyfoxs/xf_tag/ee3123f10ff884e46084c5c336b4fa792ad741c1/大数据标签-讯飞.pptx --------------------------------------------------------------------------------
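
For completeness, a hedged sketch of how the zh-hans converter above can be applied to a description column during the cleaning step mentioned in readme.md (the demo DataFrame and the column name are made up for illustration; the repo's own minimal example is zhtools/test.py):

    import pandas as pd
    from zhtools.langconv import Converter

    def to_simplified(text):
        # Convert any Traditional Chinese characters in the text to Simplified Chinese.
        return Converter('zh-hans').convert(str(text))

    demo = pd.DataFrame({'app_des': ['憂郁的臺灣烏龜', '百度地圖']})
    demo['app_des'] = demo['app_des'].map(to_simplified)
    print(demo)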