├── .DS_Store
├── .gitignore
├── bin
    ├── bin_0_batch.log
    ├── check.sh
    ├── check_mini.sh
    ├── clean.sh
    ├── compare.sh
    ├── del.sh
    ├── deploy.sh
    ├── deploy1.sh
    ├── deploy38.sh
    ├── download.sh
    ├── exclude.txt
    ├── init.sh
    ├── kill.sh
    ├── main.sh
    ├── main_manual.sh
    ├── merge.sh
    ├── paras.sh
    ├── paras7.sh
    ├── paras8.sh
    ├── predict.sh
    ├── sync.sh
    └── test.sh
├── core
    ├── bert.py
    ├── conf.py
    ├── del.py
    ├── ensemble.py
    ├── ensemble_new.py
    ├── feature.py
    ├── feature_xlnet.py
    ├── mini.py
    ├── split.py
    └── xlnet.py
├── input
    └── readme.txt
├── notebook
    ├── .ipynb_checkpoints
    │   ├── Untitled-checkpoint.ipynb
    │   └── word_analysis_local-checkpoint.ipynb
    ├── Untitled.ipynb
    ├── Untitled1.ipynb
    ├── Untitled2.ipynb
    ├── lstm_best.ipynb
    ├── train_v2.ipynb
    └── word_analysis_local.ipynb
├── readme.md
├── readme2.md
├── requirements.txt
├── spider
    ├── gen_file.py
    └── mi.py
├── tip.md
├── zhtools
    ├── langconv.py
    ├── test.py
    └── zh_wiki.py
└── 大数据标签-讯飞.pptx


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flyfoxs/xf_tag/ee3123f10ff884e46084c5c336b4fa792ad741c1/.DS_Store


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | input
2 | .DS_Store
3 | .idea
4 | 


--------------------------------------------------------------------------------
/bin/bin_0_batch.log:
--------------------------------------------------------------------------------
1 | python: can't open file './core/bert.py': [Errno 2] No such file or directory
2 | python: can't open file './core/bert.py': [Errno 2] No such file or directory
3 | python: can't open file './core/bert.py': [Errno 2] No such file or directory
4 | python: can't open file './core/bert.py': [Errno 2] No such file or directory
5 | python: can't open file './core/bert.py': [Errno 2] No such file or directory
6 | 


--------------------------------------------------------------------------------
/bin/check.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | PYTHONPATH=/users/hdpsbp/bk/df_jf:/users/hdpsbp/felix/keras:$PYTHONPATH
 4 | 
 5 | PATH=/apps/dslab/anaconda/python3/bin:$PATH
 6 | 
 7 | 
 8 | 
 9 | #for level in 0.9 1 1.1 1.2 1.4 1.5 0.8
10 | #for level in  1.4 1.5 0.8
11 | for level in  0.75 0.85 0.7
12 | do
13 |     #echo python -u  core/train.py train_ex {} [] {0:$level, 3:$level, 4:$level, 6:$level, 9:$level} #> log/search_$level.log 2>&1
14 |     python -u  core/train.py train_ex {} [] \{0:$level,3:$level,4:$level,6:$level,9:$level\}  > log/search2_$level.log 2>&1
15 | done
16 | 


--------------------------------------------------------------------------------
/bin/check_mini.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | PYTHONPATH=/users/hdpsbp/bk/df_jf:/users/hdpsbp/felix/keras:$PYTHONPATH
 4 | 
 5 | PATH=/apps/dslab/anaconda/python3/bin:$PATH
 6 | 
 7 | 
 8 | 
 9 | for bin_count in 8 #20
10 | do
11 |     echo $bin_count
12 |     python ./core/check.py -L   --bin_count $bin_count  --gp_name lr_bin_$bin_count     \
13 |                     > log/bin_"$(hostname)"_$bin_count.log 2>&1
14 | done
15 | # nohup ./bin/check_mini.sh 5 &
16 | 


--------------------------------------------------------------------------------
/bin/clean.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | cd "$(dirname "$0")"
 3 | cd ..
 4 | 
 5 | 
 6 | rm ./cache/merge_feature*
 7 | rm ./cache/get_final_feature*
 8 | python spider/gen_file.py  > gen2.log 2>&1
 9 | cat ./input/zip/apptype_train.dat_p* > ./input/zip/apptype_train.dat
10 | #./bin/test.sh
11 | 
12 | 
13 | 
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/bin/compare.sh:
--------------------------------------------------------------------------------
1 | cd "$(dirname "$0")"
2 | 
3 | cd ..
4 | 
5 | #rm -rf cache/get_feature_target*compare*.h5
6 | nohup python -u code_felix/core/compare.py >> compare_"$(hostname)".log 2>&1 &
7 | 


--------------------------------------------------------------------------------
/bin/del.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | cd "$(dirname "$0")"
 3 | cd ..
 4 | 
 5 | 
 6 | 
 7 | for fold in 2 2 2
 8 | do
 9 |     python -u ./core/bert.py --fold=${fold} train_base  >> fold_${fold}_"$(hostname)".log 2>&1
10 | done
11 | 
12 | 


--------------------------------------------------------------------------------
/bin/deploy.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | cd "$(dirname "$0")"
 3 | 
 4 | #remote_host="aladdin1@dgx"
 5 | remote_host="aladdin1@$1"
 6 | remote_dir="~/felix/"
 7 | 
 8 | cd ..
 9 | 
10 | if [[ -z "$2" ]]; then
11 |     rsync -avz --exclude-from './bin/exclude.txt' $(pwd) $remote_host:$remote_dir
12 | else
13 |     rsync -avz $(pwd) $remote_host:$remote_dir
14 | fi
15 | 
16 | date
17 | 
18 | echo 'upload to:' $remote_host:$remote_dir
19 | echo '===================================='
20 | 
21 | #rsync -av  ./output/0.70180553000.csv hdpsbp@ai-prd-07:/users/hdpsbp/felix/df_jf/output/
22 | 
23 | 
24 | #rsync -av hdpsbp@ai-prd-04:/users/hdpsbp/felix/kdd_bd   /apps/
25 | 
26 | 
27 | #rsync -av  ./input/tmp hdpsbp@ai-prd-07:/users/hdpsbp/felix/kdd_bd/input
28 | 
29 | #rsync -av   ./output/sub/?.csv hdpsbp@ai-prd-07:/users/hdpsbp/felix/kdd_bd/output/sub


--------------------------------------------------------------------------------
/bin/deploy1.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | cd "$(dirname "$0")"
 3 | 
 4 | remote_host="root@vm-docker-1" #hdpsbp@ai-prd-04
 5 | remote_dir="/apps/felix/" #/users/hdpsbp/felix/
 6 | 
 7 | cd ..
 8 | 
 9 | if [[ -z "$1" ]]; then
10 |     rsync -av --exclude-from './bin/exclude.txt' $(pwd) $remote_host:$remote_dir
11 | else
12 |     rsync -av $(pwd) $remote_host:$remote_dir
13 | fi
14 | 
15 | date
16 | 
17 | echo $remote_host:$remote_dir
18 | 
19 | #rsync -av  ./output/0.70180553000.csv hdpsbp@ai-prd-07:/users/hdpsbp/felix/df_jf/output/
20 | 
21 | 
22 | #rsync -av hdpsbp@ai-prd-04:/users/hdpsbp/felix/kdd_bd   /apps/


--------------------------------------------------------------------------------
/bin/deploy38.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | cd "$(dirname "$0")"
 3 | 
 4 | remote_host="aladdin1@10.10.20.38"
 5 | remote_dir="/home/aladdin1/felix"
 6 | 
 7 | cd ..
 8 | 
 9 | if [[ -z "$1" ]]; then
10 |     rsync -avz --exclude-from './bin/exclude.txt' $(pwd) $remote_host:$remote_dir
11 | else
12 |     rsync -avz $(pwd) $remote_host:$remote_dir
13 | fi
14 | 
15 | date
16 | 
17 | echo $remote_host:$remote_dir
18 | 
19 | #rsync -av  ./output/0.70180553000.csv hdpsbp@ai-prd-07:/users/hdpsbp/felix/df_jf/output/
20 | 
21 | 
22 | #rsync -av hdpsbp@ai-prd-04:/users/hdpsbp/felix/kdd_bd   /apps/
23 | 
24 | 
25 | #rsync -av  ./input/tmp hdpsbp@ai-prd-07:/users/hdpsbp/felix/kdd_bd/input
26 | 
27 | #rsync -av   ./output/sub/?.csv hdpsbp@ai-prd-07:/users/hdpsbp/felix/kdd_bd/output/sub
28 | 
29 | 


--------------------------------------------------------------------------------
/bin/download.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | cd "$(dirname "$0")"
 4 | cd ..
 5 | 
 6 | ./bin/deploy.sh $1
 7 | 
 8 | remote_host="aladdin1@$1"
 9 | remote_dir="~/felix/$(basename "$(pwd)")/*"
10 | 
11 | 
12 | if [[ -z "$2" ]]; then
13 |     rsync -avz --exclude-from './bin/exclude.txt' --max-size=1m  $remote_host:$remote_dir  ./
14 | else
15 |     rsync -avz --max-size=1m  $remote_host:$remote_dir  ./
16 | fi
17 | 
18 | date
19 | 
20 | echo 'download from:' $remote_host:$remote_dir
21 | 


--------------------------------------------------------------------------------
/bin/exclude.txt:
--------------------------------------------------------------------------------
 1 | .git
 2 | *.zip
 3 | .idea
 4 | **/*.pyc
 5 | **/*.h5
 6 | **/*.json
 7 | **/__pycache__
 8 | **/*.log
 9 | **/*.out
10 | **/*.h5
11 | **/*.hdf5
12 | **/*.pkl
13 | **/*.hdf5
14 | **/.DS_Store
15 | **/*.doc*
16 | **/.ipynb_checkpoints
17 | **/log
18 | **/logs
19 | **/nohup.out
20 | **/*.log
21 | output
22 | cache
23 | sub
24 | score
25 | *.pickle
26 | 
27 | #input
28 | **/*.csv
29 | **/Tencent*
30 | #**/config*.py
31 | **/*.dat
32 | 
33 | **/*remote.ipynb
34 | 
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/bin/init.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | cd "$(dirname "$0")"
 3 | cd ..
 4 | 
 5 | 
 6 | mkdir -p input
 7 | mkdir -p output/sub
 8 | mkdir -p output/stacking
 9 | mkdir -p output/model
10 | mkdir -p notebook
11 | mkdir -p core
12 | 
13 | 
14 | touch ./core/conf.py
15 | touch ./core/feature.py
16 | 
17 | 
18 | 
19 | 
20 | 
21 | 
22 | 


--------------------------------------------------------------------------------
/bin/kill.sh:
--------------------------------------------------------------------------------
1 | kill $(pidof python)


--------------------------------------------------------------------------------
/bin/main.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | cd "$(dirname "$0")"
 3 | cd ..
 4 | 
 5 | ##for i in {1..10};
 6 | #for i in $(seq 0 $1)
 7 | 
 8 | export PYTHONPATH=./:$PYTHONPATH
 9 | #if not input $1, default value is 100
10 | for i in $(seq 0 ${1:-100})
11 | do
12 |     for fold in {0..4};
13 |     do
14 |         python -u ./core/bert.py --fold=${fold} --batch_size=8 train_base  >> fold_${fold}_"$(hostname)".log 2>&1
15 | 
16 |     done
17 | done
18 | 
19 | 
20 | 
21 | 
22 | 


--------------------------------------------------------------------------------
/bin/main_manual.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | cd "$(dirname "$0")"
 3 | cd ..
 4 | 
 5 | for i in {1..100};
 6 | do
 7 |     echo $i
 8 |     python -u ./core/bert_manual.py train_base  >> manual_batch_bin_0.log 2>&1
 9 | done
10 | 
11 | 
12 | 
13 | 
14 | 


--------------------------------------------------------------------------------
/bin/merge.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #PYTHONPATH=/users/hdpsbp/HadoopDir/felix/df_jf:/users/hdpsbp/felix/keras:$PYTHONPATH
 4 | 
 5 | #PATH=/apps/dslab/anaconda/python3/bin:$PATH
 6 | 
 7 | 
 8 | 
 9 | #rm -rf ./output/blocks/*.csv
10 | 
11 | python ./core/merge.py > merge.log 2>&1
12 | 


--------------------------------------------------------------------------------
/bin/paras.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | cd "$(dirname "$0")"
 3 | cd ..
 4 | 
 5 | if [ $1 = 'v15' ]
 6 | then
 7 |     echo '15'
 8 |     version=v15
 9 |     max_bin=0
10 |     cut_ratio=0.1
11 |     min_len_ratio=0.8
12 |     echo ./cache/${version}*.*
13 |     rm -rf ./cache/${version}*.*
14 |     mv ./output/stacking/${version}*.* ./output/bk_stacking/
15 |     for fold in {0..4};
16 |     do
17 |         python -u ./core/bert.py  --fold=${fold} --version=${version}  --max_bin=${max_bin} --cut_ratio=${cut_ratio} --min_len_ratio=${min_len_ratio} train_base  >> ${version}_fold_${fold}_"$(hostname)".log 2>&1
18 |     done
19 | 
20 | elif [ $1 = 'v17' ]
21 | then
22 |     echo '17'
23 |     version=v17
24 |     max_bin=2
25 |     cut_ratio=0.1
26 |     min_len_ratio=0.9
27 |     echo ./cache/${version}*.*
28 |     rm -rf ./cache/${version}*.*
29 |     mv ./output/stacking/${version}*.* ./output/bk_stacking/
30 |     for fold in {0..4};
31 |     do
32 |         python -u ./core/bert.py  --fold=${fold} --version=${version}  --max_bin=${max_bin} --cut_ratio=${cut_ratio} --min_len_ratio=${min_len_ratio} train_base  >> ${version}_fold_${fold}_"$(hostname)".log 2>&1
33 |     done
34 | 
35 | fi
36 | 
37 | 
38 | 
39 | 
40 | 
41 | 
42 | 
43 | 
44 | 


--------------------------------------------------------------------------------
/bin/paras7.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | cd "$(dirname "$0")"
 3 | cd ..
 4 | 
 5 | version=v3
 6 | max_bin=1
 7 | cut_ratio=0.15
 8 | min_len_ratio=0.8
 9 | echo ./cache/${version}*.*
10 | rm -rf ./cache/${version}*.*
11 | mv ./output/stacking/${version}*.* ./output/bk_stacking/
12 | for fold in {0..4};
13 | do
14 |     python -u ./core/bert.py  --fold=${fold} --version=${version}  --max_bin=${max_bin} --cut_ratio=${cut_ratio} --min_len_ratio=${min_len_ratio} train_base  >> ${version}_fold_${fold}_"$(hostname)".log 2>&1
15 | done
16 | 
17 | version=v4
18 | max_bin=1
19 | cut_ratio=0.2
20 | min_len_ratio=0.8
21 | echo ./cache/${version}*.*
22 | rm -rf ./cache/${version}*.*
23 | mv ./output/stacking/${version}*.* ./output/bk_stacking/
24 | for fold in {0..4};
25 | do
26 |     python -u ./core/bert.py  --fold=${fold} --version=${version}  --max_bin=${max_bin} --cut_ratio=${cut_ratio} --min_len_ratio=${min_len_ratio} train_base  >> ${version}_fold_${fold}_"$(hostname)".log 2>&1
27 | done
28 | 
29 | 
30 | version=v5
31 | max_bin=2
32 | cut_ratio=0.1
33 | min_len_ratio=0.8
34 | echo ./cache/${version}*.*
35 | rm -rf ./cache/${version}*.*
36 | mv ./output/stacking/${version}*.* ./output/bk_stacking/
37 | for fold in {0..4};
38 | do
39 |     python -u ./core/bert.py  --fold=${fold} --version=${version}  --max_bin=${max_bin} --cut_ratio=${cut_ratio} --min_len_ratio=${min_len_ratio} train_base  >> ${version}_fold_${fold}_"$(hostname)".log 2>&1
40 | done
41 | 
42 | 


--------------------------------------------------------------------------------
/bin/paras8.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | cd "$(dirname "$0")"
 3 | cd ..
 4 | 
 5 | version=v6
 6 | max_bin=1
 7 | cut_ratio=0.1
 8 | min_len_ratio=0.7
 9 | echo ./cache/${version}*.*
10 | rm -rf ./cache/${version}*.*
11 | mv ./output/stacking/${version}*.* ./output/bk_stacking/
12 | for fold in {0..4};
13 | do
14 |     python -u ./core/bert.py  --fold=${fold} --version=${version}  --max_bin=${max_bin} --cut_ratio=${cut_ratio} --min_len_ratio=${min_len_ratio} train_base  >> ${version}_fold_${fold}_"$(hostname)".log 2>&1
15 | done
16 | 
17 | version=v7
18 | max_bin=1
19 | cut_ratio=0.1
20 | min_len_ratio=0.9
21 | echo ./cache/${version}*.*
22 | rm -rf ./cache/${version}*.*
23 | mv ./output/stacking/${version}*.* ./output/bk_stacking/
24 | for fold in {0..4};
25 | do
26 |     python -u ./core/bert.py  --fold=${fold} --version=${version}  --max_bin=${max_bin} --cut_ratio=${cut_ratio} --min_len_ratio=${min_len_ratio} train_base  >> ${version}_fold_${fold}_"$(hostname)".log 2>&1
27 | done
28 | 
29 | 
30 | 
31 | version=v8
32 | max_bin=3
33 | cut_ratio=0.1
34 | min_len_ratio=0.8
35 | echo ./cache/${version}*.*
36 | rm -rf ./cache/${version}*.*
37 | mv ./output/stacking/${version}*.* ./output/bk_stacking/
38 | for fold in {0..4};
39 | do
40 |     python -u ./core/bert.py  --fold=${fold} --version=${version}  --max_bin=${max_bin} --cut_ratio=${cut_ratio} --min_len_ratio=${min_len_ratio} train_base  >> ${version}_fold_${fold}_"$(hostname)".log 2>&1
41 | done
42 | 
43 | 


--------------------------------------------------------------------------------
/bin/predict.sh:
--------------------------------------------------------------------------------
 1 | cd "$(dirname "$0")"
 2 | 
 3 | cd ..
 4 | best_arg="./imp/best_arg.h5"
 5 | if [ -f "$best_arg" ]; then
 6 |     echo "Already have best args in $best_arg"
 7 | else
 8 |     #生成当前最优参数,存放于目录 ./imp/
 9 |     echo "Try go take_snapshotf for best args, and save in $best_arg"
10 |     python ./core/check.py  > snap_args.log 2>&1
11 | fi
12 | 
13 | #提前对一些分析数据准备好本地缓存
14 | python ./core/feature.py   > feature_prepare.log 2>&1
15 | 
16 | #根据生成的最优参数,预测缺少数据
17 | python ./core/merge.py  --genfile > predict_block.log 2>&1
18 | 
19 | 


--------------------------------------------------------------------------------
/bin/sync.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | cd "$(dirname "$0")"
 3 | 
 4 | cd ..
 5 | 
 6 | 
 7 | rsync -av ./score/blks/ hdpsbp@ai-prd-05:/users/hdpsbp/felix/df_jf/score/blks
 8 | rsync -av hdpsbp@ai-prd-05:/users/hdpsbp/felix/df_jf/score/blks ./score/blks/
 9 | 
10 | 
11 | rsync -av ./output/blocks/ hdpsbp@ai-prd-05:/users/hdpsbp/felix/df_jf/score/blks
12 | rsync -av hdpsbp@ai-prd-05:/users/hdpsbp/felix/kdd_bd ./output/blocks/
13 | 
14 | 
15 | rsync -av  hdpsbp@ai-prd-05:/users/hdpsbp/felix/df_jf/imp  ./
16 | 
17 | date
18 | 
19 | 


--------------------------------------------------------------------------------
/bin/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | cd "$(dirname "$0")"
 3 | cd ..
 4 | 
 5 | #for xx in {0..2};
 6 | #do
 7 | #    python -u  spider/mi.py bd  > bd.log 2>&1
 8 | #    python -u  spider/mi.py wdj  > wdj.log 2>&1
 9 | #    python -u  spider/mi.py xm  >  xm.log 2>&1
10 | #
11 | #    python -u  spider/mi.py tx_pkg  >  tx_pkg.log 2>&1
12 | #    python -u  spider/mi.py tx_name  >  tx_name.log 2>&1
13 | #
14 | #    python spider/gen_file.py > gen_file.log 2>&1
15 | #done
16 | 
17 | for xx in {0..2};
18 | do
19 |     for fold in 4 3 2 1 0
20 |     do
21 |         python -u ./core/bert.py --fold=${fold} train_base  >> fold_${fold}_"$(hostname)".log 2>&1
22 |     done
23 | 
24 | done
25 | 
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/core/bert.py:
--------------------------------------------------------------------------------
  1 | 
  2 | from multiprocessing import Process
  3 | 
  4 | 
  5 | 
  6 | 
  7 | 
  8 | from core.feature import *
  9 | from core.conf import *
 10 | 
 11 | import os
 12 | 
 13 | #os.environ["CUDA_VISIBLE_DEVICES"] = "2"
 14 | os.environ['TF_KERAS'] = '1'
 15 | 
 16 | oof_prefix = get_args().version
 17 | SEQ_LEN = get_args().seq_len  #randrange(128, 180) #-randrange(0, 5)*8
 18 | BATCH_SIZE = get_args().batch_size
 19 | 
 20 | #Batch size, MAX_len+ex_length, Manual, Manual GP feature cnt, frac
 21 | @lru_cache()
 22 | @timed()
 23 | def get_train_test_bert():
 24 | 
 25 |     frac = get_args().frac
 26 |     max_bin = get_args().max_bin
 27 |     min_len = int(SEQ_LEN*get_args().min_len_ratio)
 28 | 
 29 |     data = get_feature_bert(SEQ_LEN)
 30 | 
 31 |     #Keep all the bin group, if it's test data
 32 |     data = data.loc[(data.bin<=max_bin) | (pd.isna(data.type_id))]
 33 | 
 34 |     with timed_bolck(f'Remove gan data, and len is less then {min_len}'):
 35 |         data = data.loc[ (data.bin == 0) | (data['len_'] >= min_len) ]
 36 |         logger.info(f'Train max_bin:{max_bin},Total Bin distribution:\n{data.bin.value_counts().sort_index()}')
 37 | 
 38 |     data = data.sort_index()
 39 |     logger.info(f'Head of the data:\n, {data.iloc[:3,:3]}')
 40 | 
 41 |     train_data = data.loc[pd.notna(data.type_id)].sample(frac=frac, random_state=2019)
 42 |     labels = train_data.type_id.values.tolist()
 43 |     logger.info(f'Train Bin distribution:\n{train_data.bin.value_counts().sort_index()}')
 44 | 
 45 |     test_data =  data.loc[pd.isna(data.type_id)].sample(frac=1, random_state=2019)
 46 | 
 47 |     trial = get_args().trial
 48 |     logger.info(f'Test Bin distribution#{trial}:\n{test_data.bin.value_counts().sort_index()}')
 49 | 
 50 |     if trial > 0:
 51 |         test_data = test_data.loc[test_data.index.str[-1]=='0']
 52 | 
 53 | 
 54 |     logger.info(f'Train:{train_data.shape} Test#{trial}:{test_data.shape}, frac:{frac}')
 55 | 
 56 |     feature_col = [col for col in data.columns if col.startswith('fea_') or col.startswith('bert_')]
 57 | 
 58 |     label2id, id2label = get_label_id()
 59 |     #word2id = get_word2id()
 60 | 
 61 |     # Encode input words and labels
 62 |     X = train_data.loc[:, feature_col]
 63 |     Y = [label2id[label] for label in labels]
 64 | 
 65 | 
 66 |     X_test = test_data.loc[:, feature_col]
 67 | 
 68 | 
 69 |     return  X, pd.Series(Y, index=train_data.index), X_test
 70 | 
 71 | 
 72 | # X, y, X_test = get_train_test_bert(0.1)
 73 | #
 74 | #
 75 | # train_x, train_y = load_data(train_path)
 76 | # test_x, test_y = load_data(test_path)
 77 | 
 78 | def boost_train(boost=10):
 79 |     for _ in range(boost):
 80 |         p = Process(target=train_base)
 81 |         p.start()
 82 |         p.join()
 83 | 
 84 | 
 85 | @timed()
 86 | def filter_short_desc(X, y):
 87 |     X = X.copy().reset_index()
 88 |     bert_cols = [col for col in X.columns if str(col).startswith('bert_')]
 89 |     bert = X.loc[:, bert_cols]
 90 |     bert_len = bert.where(bert > 0).count(axis=1)
 91 |     old_len = len(bert_len)
 92 |     min_len = int(SEQ_LEN*get_args().min_len_ratio)
 93 |     bert_len = bert_len.loc[bert_len >= min_len]
 94 |     logger.info(f'Filter {old_len - len(bert_len)} records from {old_len} by threshold {min_len}')
 95 | 
 96 |     return X.iloc[bert_len.index], y[bert_len.index]
 97 | 
 98 | 
 99 | @timed()
100 | def train_base():
101 |     args = get_args()
102 |     #frac = args.frac
103 |     fold = args.fold
104 |     EPOCHS = args.epochs
105 | 
106 | 
107 |     LR = 2e-5
108 | 
109 |     with timed_bolck(f'Prepare train data#{BATCH_SIZE}, LR:{LR}'):
110 |         X, y, _ = get_train_test_bert()
111 | 
112 |         ##Begin to define model
113 |         from keras_bert import load_trained_model_from_checkpoint
114 | 
115 |         logger.info(f'Start to train base on checkpoint:{config_path}')
116 |         bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path,  seq_len=SEQ_LEN, )
117 | 
118 |         for l in bert_model.layers:
119 |             l.trainable = True
120 |         from tensorflow.python import keras
121 |         #from keras_bert import  calc_train_steps
122 | 
123 |         x1_in = keras.layers.Input(shape=(None,))
124 |         x2_in = keras.layers.Input(shape=(None,))
125 | 
126 |         x = bert_model([x1_in, x2_in])
127 | 
128 |         x = keras.layers.Lambda(lambda x: x[:, 0])(x)
129 | 
130 |         p = keras.layers.Dense(num_classes, activation='sigmoid')(x)
131 | 
132 |         #from keras import Model
133 |         model = keras.models.Model([x1_in, x2_in], p)
134 | 
135 | 
136 |         model.compile(
137 |             optimizer=keras.optimizers.Adam(lr=LR), # AdamWarmup(decay_steps=decay_steps, warmup_steps=warmup_steps, lr=LR),
138 |             loss='categorical_crossentropy',
139 |             metrics=['accuracy'],
140 |         )
141 |         model.summary()
142 |         ##End to define model
143 | 
144 |         input1_col = [col for col in X.columns if str(col).startswith('bert_')]
145 |         input2_col = [col for col in X.columns if str(col).startswith('fea_')]
146 |         #max_words = len(input1_col)
147 |         model #= get_model(max_words)
148 | 
149 |         #get_feature_manual.cache_clear()
150 |         Y_cat = keras.utils.to_categorical(y, num_classes=num_classes)
151 |         #folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
152 | 
153 |     with timed_bolck(f'Training#{fold}'):
154 |         from core.split import split_df_by_index
155 |         train_idx, test_idx = split_df_by_index(X,fold)
156 | 
157 |         logger.info(f'Shape train_x.loc[:, input1_col].iloc[:,0]: {X.loc[:, input1_col].iloc[:,0].shape}')
158 |         train_x, train_y, val_x, val_y = \
159 |             X.iloc[train_idx], Y_cat[train_idx], X.iloc[test_idx], Y_cat[test_idx]
160 | 
161 |         logger.info(f'get_train_test output: train_x:{train_x.shape}, train_y:{train_y.shape}, val_x:{val_x.shape} ')
162 | 
163 |         #train_x, train_y = filter_short_desc(train_x, train_y)
164 | 
165 |         input1 = train_x.loc[:, input1_col]#.astype(np.float32)
166 |         input2 = np.zeros_like(input1)#.astype(np.int8)
167 | 
168 |         logger.info(f'NN train_x:{train_x[:3]}')
169 |         min_len_ratio = get_args().min_len_ratio
170 |         max_bin = get_args().max_bin
171 |         logger.info(f'NN Input1:{input1.shape}, Input2:{input2.shape}, SEQ_LEN:{SEQ_LEN}, min_len_ratio:{min_len_ratio}, bin:{max_bin} ')
172 | 
173 |         from keras_bert import get_custom_objects
174 |         import tensorflow as tf
175 |         with tf.keras.utils.custom_object_scope(get_custom_objects()):
176 |             his = model.fit([input1, input2], train_y,
177 |                             validation_data = ([val_x.loc[:, input1_col], np.zeros_like(val_x.loc[:, input1_col])], val_y),
178 |                             epochs=EPOCHS,  shuffle=True, batch_size=BATCH_SIZE,
179 |                             callbacks=[Cal_acc( val_x, y.iloc[test_idx] )]
180 |                       #steps_per_epoch=1000, validation_steps=10
181 |                       )
182 | 
183 | 
184 | 
185 |             #gen_sub(model, X_test, sn)
186 | 
187 |     return his
188 | 
189 | from tensorflow.python.keras.callbacks import Callback
190 | class Cal_acc(Callback):
191 | 
192 |     def __init__(self, val_x, y):
193 |         super(Cal_acc, self).__init__()
194 |         self.val_x , self.y = val_x, y
195 |         self.min_len = int(SEQ_LEN*get_args().min_len_ratio)
196 |         self.max_bin = get_args().max_bin
197 |         self.fold = get_args().fold
198 |         self.threshold = 0
199 |         self.feature_len = self.val_x.shape[1]
200 |         self.cur_epoch = 0
201 |         self.version = get_args().version
202 |         self.trial = get_args().trial
203 | 
204 |         self.max_score = 0
205 | 
206 |         self.score_list = np.zeros(get_args().epochs)
207 |         self.gen_file = False
208 | 
209 |         import time, os
210 |         self.batch_id = round(time.time())
211 |         self.model_folder = f'./output/model/{self.batch_id}/'
212 | 
213 |         os.makedirs(self.model_folder)
214 | 
215 | 
216 |         #logger.info(f'Cal_acc base on X:{self.X.shape}, Y:{self.y.shape}')
217 | 
218 |     #@timed()
219 |     def cal_acc(self):
220 |         input1_col = [col for col in self.val_x.columns if str(col).startswith('bert_')]
221 |         #input2_col = [col for col in self.val_x.columns if str(col).startswith('fea_')]
222 |         #model = self.model
223 |         tmp_val = self.val_x.loc[:,input1_col]
224 |         tmp_y = self.y
225 |         val = self.model.predict([tmp_val, np.zeros_like(tmp_val)])
226 | 
227 |         label2id, id2label = get_label_id()
228 |         val = pd.DataFrame(val, columns=label2id.keys(), index=tmp_val.index)
229 |         val['label'] = tmp_y.astype(int).replace(id2label).astype(int)
230 |         val['bin'] = pd.Series(val.index).str[-1].values.astype(int)
231 |         #logger.info(f'Head val#label:\n{val.label.head()}')
232 |         res_val = val.copy()
233 |         # res_val.to_pickle(f'./output/tmp_res_val.pkl')
234 |         # logger.info(f'Debug file: save to ./output/tmp_res_val.pkl')
235 | 
236 |         num_labels = 10
237 |         df_score = val.loc[val.bin==0]
238 |         score_list = accuracy(df_score, num_labels, f'no{self.cur_epoch},b{self.max_bin},{self.version}')
239 | 
240 |         logger.info(f'{len(df_score)}/{len(res_val)}, fold:{self.fold}, score for label1-f{num_labels}:{score_list}')
241 | 
242 |         return score_list,res_val
243 | 
244 |     @timed()
245 |     def cal_acc_ex(self):
246 |         input1_col = [col for col in self.val_x.columns if str(col).startswith('bert_')]
247 | 
248 |         if self.trial==0:
249 |             check_type_list =['val']
250 |         for type_ in tqdm(check_type_list,desc='cal_acc_ex'):
251 |             tmp_val ,tmp_y = self.get_tmp_val_test(type_)
252 |             tmp_val = tmp_val.loc[:, input1_col]
253 | 
254 |             val = self.model.predict([tmp_val, np.zeros_like(tmp_val)])
255 | 
256 |             label2id, id2label = get_label_id()
257 |             val = pd.DataFrame(val, columns=label2id.keys(), index=tmp_val.index)
258 |             val['label'] = tmp_y.astype(int).replace(id2label).astype(int)
259 |             val['bin'] = pd.Series(val.index).str[-1].values.astype(int)
260 |             # logger.info(f'Head val#label:\n{val.label.head()}')
261 |             res_val = val.copy()
262 |             # res_val.to_pickle(f'./output/tmp_res_val.pkl')
263 |             # logger.info(f'Debug file: save to ./output/tmp_res_val.pkl')
264 | 
265 |             num_labels = 10
266 |             df_score = val.loc[val.bin == 0]
267 |             score_list = accuracy(df_score, num_labels, f'ex{self.cur_epoch},{self.version},b{self.max_bin},{type_}')
268 | 
269 |             logger.info(f'===cal_acc_ex{self.cur_epoch}:{type_}==={len(df_score)}/{len(res_val)}, fold:{self.fold}, score for label1-f{num_labels}:{score_list}')
270 | 
271 |         return score_list, res_val
272 | 
273 | 
274 |     @lru_cache()
275 |     @timed()
276 |     def get_tmp_val_test(self, type_):
277 |         _, _, test_all = get_train_test_bert()
278 | 
279 |         test = test_all.loc[pd.Series(test_all.index).str.startswith(type_).values]
280 | 
281 |         test = test.loc[(pd.Series(test.index).str[-1]=='0').values]
282 | 
283 |         logger.info(f'Split {type_}, {len(test)} rows from {len(test_all)}')
284 | 
285 |         test=test.copy()
286 |         type_ = 'x'*6 + pd.Series(test.index).str[:6]
287 |         test.index = 'x'*6 + pd.Series(test.index).str[6:]
288 | 
289 |         from spider.mi import get_train_ph2_index
290 |         train_ph2 =  get_train_ph2_index()
291 |         #final = final.loc[final.type_id.str.len() >= 1]
292 |         train_ph2.index = 'x'*6 + train_ph2['id'].str[6:]
293 |         #Align label with input test
294 |         index_old = test.index.copy()
295 |         test.index = pd.Series(test.index).apply(lambda val: val[:32])
296 | 
297 |         label = train_ph2.type_id.loc[test.index.values].str[:6] #type_id len is 6
298 | 
299 |         #Rollback index change
300 |         test.index = index_old
301 |         label.index = index_old
302 | 
303 |         test = test.loc[pd.notna(label).values]
304 |         label = label.dropna()
305 |         print('test, label, type_', test.shape, label.shape, type_.shape)
306 |         return test, label#, type_
307 | 
308 | 
309 |     def on_train_end(self, logs=None):
310 |         grow= max(self.score_list) - self.threshold
311 |         cut_ratio = get_args().cut_ratio
312 |         logger.info(f'Train END: Fold:{self.fold}, max:{max(self.score_list):7.6f}/{grow:+6.5f}, at {np.argmax(self.score_list)}/{len(self.score_list)-1}, his:{self.score_list}, max_bin:{self.max_bin}, cut:{cut_ratio}, min_len:{self.min_len:03}, SEQ_LEN:{SEQ_LEN:03}, threshold:{self.threshold:7.6f}, gen_file:{self.gen_file}')
313 |         logger.info(f'Input args:{get_args()}')
314 | 
315 |     def on_epoch_end(self, epoch, logs=None):
316 |         self.cur_epoch = epoch
317 |         print('\n')
318 |         _, _ = self.cal_acc_ex()
319 | 
320 |         if self.trial > 0:
321 |             return 0
322 |         else:
323 |             score_list, val = self.cal_acc()
324 |             total = score_list[1]
325 | 
326 |             self.score_list[epoch] = round(total, 6)
327 |             #threshold_map = {0:0.785, 1:0.77, 2:0.77, 3:0.77, 4:0.78}
328 |             top_cnt =2
329 |             top_score = self._get_top_score(self.fold)[:top_cnt]
330 |             self.threshold = top_score[0] if len(top_score) >  0 else 0
331 |             logger.info(f'The top#{top_cnt} score for max_bin:{get_args().max_bin}, epoch:{epoch}, oof:{oof_prefix}, fold#{self.fold} is:{top_score}, cur_score:{total}, threshold:{self.threshold}')
332 |             if ( round(total,4) > round(self.threshold,4)
333 |                  and (epoch>=3 or self.threshold > 0 )
334 |                  and total > self.max_score
335 |                 ) :
336 |                 #logger.info(f'Try to gen sub file for local score:{total}, and save to:{model_path}')
337 |                 self.gen_file=True
338 |                 grow = max(self.score_list) - self.threshold
339 |                 logger.info(f'Fold:{self.fold}, epoch:{epoch}, MAX:{max(self.score_list):7.6f}/{grow:+6.5f}, threshold:{self.threshold}, score_list:{self.score_list}' )
340 |                 test = self.gen_sub(self.model, f'{self.feature_len}_{total:7.6f}_{epoch}_f{self.fold}')
341 |                 len_raw_val = len(val.loc[val.bin == 0])
342 |                 min_len_ratio = get_args().min_len_ratio
343 |                 oof_file = f'./output/stacking/{oof_prefix}_{self.fold}_{total:7.6f}_{len_raw_val}_{len(val):05}_b{get_args().max_bin}_e{epoch}_{self.batch_id}_m{min_len_ratio:2.1f}_L{SEQ_LEN:03}.h5'
344 |                 self.save_stack_feature(val, test, oof_file)
345 |             else:
346 |                 logger.info(f'Epoch:{epoch}, only gen sub file if the local score >{self.threshold}, current score:{total}, threshold:{self.threshold}, max_score:{self.max_score}')
347 | 
348 |             self.max_score = max(self.max_score, total, 0.82)
349 | 
350 |             logger.info(f'Epoch#{epoch} END,max_bin:{get_args().max_bin}, oof:{oof_prefix}, max:{self.max_score:6.5f}, score:{score_list}, Fold:{self.fold},')
351 | 
352 |             print('\n')
353 | 
354 |             return round(total, 5)
355 | 
356 |     @staticmethod
357 |     @timed()
358 |     def save_stack_feature(train: pd.DataFrame, test: pd.DataFrame, file_path):
359 |         train.bin = train.bin.astype(int)
360 |         test.bin = test.bin.astype(int)
361 |         train.to_hdf(file_path, 'train', mode='a')
362 |         test.to_hdf(file_path, 'test', mode='a')
363 |         logger.info(f'OOF file save to :{file_path}')
364 |         return train, test
365 | 
366 | 
367 |     @timed()
368 |     #./output/model/1562899782/model_6114_0.65403_2.h5
369 |     def gen_sub(self, model , info='bert_' , partition_len = 5000):
370 | 
371 |         #frac = get_args().frac
372 |         _, _, test = get_train_test_bert()
373 | 
374 |         label2id, id2label = get_label_id()
375 |         input1_col = [col for col in test.columns if str(col).startswith('bert_')]
376 |         input3_col = [col for col in test.columns if str(col).startswith('fea_')]
377 | 
378 |         logger.info(f'Input input1_col:{len(input1_col)}, input3_col:{len(input3_col)}')
379 |         res_list = []
380 |         for sn in tqdm(range(1+ len(test)//partition_len), desc=f'{info}:sub:total:{len(test)},partition_len:{partition_len}'):
381 |             tmp = test.iloc[sn*partition_len: (sn+1)*partition_len]
382 |             #print('\nbegin tmp\n', tmp.iloc[:3,:3].head())
383 |             res = model.predict([ tmp.loc[:,input1_col], np.zeros_like(tmp.loc[:,input1_col]) ])
384 |             res = pd.DataFrame(res, columns=label2id.keys(), index=tmp.index)
385 |             #print('\nend tmp\n', res.iloc[:3, :3].head())
386 |             res_list.append(res)
387 | 
388 |         res = pd.concat(res_list)
389 |         res['bin'] = res.index.str[-1].values.astype(int)
390 |         raw_predict = res.copy()
391 | 
392 |         with timed_bolck(f'Try to gen sub file for fold#{self.fold}'):
393 |             #print('\nafter concat\n', res.iloc[:3, :3].head())
394 |             res['id'] = res.index
395 |             res.index.name = 'id'
396 |             # res.to_pickle(f'./output/tmp_sub.pkl')
397 | 
398 | 
399 |             #print('\nend res\n', res.iloc[:3, :3].head())
400 | 
401 | 
402 | 
403 |             res_mean = res.copy(deep=True)
404 |             res_mean['id'] = res_mean.id.apply(lambda val: val.split('_')[0])
405 |             res_mean.index.name = 'index'
406 |             res_select = res_mean.groupby('id')['bin'].agg({'bin_max': 'max'})
407 |             res_select.head()
408 |             res_select = res_select.loc[res_select.bin_max == 3]
409 |             res_mean = res_mean.loc[(res_mean.bin == 0)
410 |                                     | ((res_mean.bin == 1) & (res_mean.id.isin(res_select.index)))
411 |                                     ]
412 |             logger.info(f'Try to cal avg for res_mean:\n{res_mean.bin.value_counts()}')
413 |             res_mean_len = len(res_mean)
414 |             res_mean = res_mean.groupby('id').mean().sort_index()
415 |             del res_mean['bin']
416 | 
417 | 
418 |             res_0 = res.copy(deep=True)
419 |             res_0 = res_0.loc[res_0.bin == 0]
420 |             res_0.index  = res_0.id.apply(lambda val: val.split('_')[0])
421 |             #print('\nres_0\n', res_0.loc[:, ['id', 'bin']].head(3))
422 |             res_0 = res_0.sort_index()
423 |             res_0 = res_0.drop(columns=['id','bin'], axis=1, errors='ignore')
424 | 
425 |             for name, res in [('single',res_0), (f'mean_{res_mean_len}', res_mean)]:
426 |                 res = res.copy()
427 |                 #logger.info(f'{name} Check:\n{res.iloc[:3,:num_classes].sum(axis=1)}')
428 | 
429 |                 res['label1'] = res.iloc[:, :num_classes].idxmax(axis=1)
430 | 
431 |                 # Exclude top#1
432 |                 for index, col in res.label1.items():
433 |                     res.loc[index, col] = np.nan
434 | 
435 |                 res['label2'] = res.iloc[:, :num_classes].idxmax(axis=1)
436 | 
437 | 
438 |                 for col in ['label1','label2']:
439 |                     res[col] = res[col].replace(id2label)
440 | 
441 |                 # info = info.replace('.','')
442 |                 # sub_file = f'./output/sub/v19_{info}_{name}.csv'
443 |                 # res[['label1', 'label2']].to_csv(sub_file)
444 |                 # logger.info(f'Sub file save to :{sub_file}')
445 | 
446 |             #logger.info(f'res_0 Check:\n{res_0.iloc[:3, :num_classes].sum(axis=1)}')
447 | 
448 |         return raw_predict #res.drop(columns=['id','bin'], axis=1, errors='ignore')
449 | 
450 |     @staticmethod
451 |     def _get_top_score(fold):
452 |         from glob import glob
453 |         file_list = sorted(glob(f'./output/stacking/{oof_prefix}_{fold}_*.h5'), reverse=True)
454 |         score_list = [float(file.split('_')[2].replace('.h5', '')) for file in file_list]
455 |         logger.info(f'Score list for fold#{fold} is {score_list}')
456 |         return score_list if score_list else [0]
457 | 
458 | if __name__ == '__main__':
459 |     FUNCTION_MAP = {'train_base': train_base,
460 |                     }
461 | 
462 |     args = get_args()
463 | 
464 |     func = FUNCTION_MAP[args.command]
465 |     func()
466 | 
467 | """
468 | 
469 | nohup python -u ./core/bert.py --frac=0.1  train_base  > test.log 2>&1 &
470 | 
471 | nohup python -u ./core/bert.py --fold=4 --max_bin=2 train_base  > test_4.log 2>&1 &
472 | 
473 | python -u ./core/bert.py --max_bin=2 train_base 
474 | 
475 | nohup python -u ./core/bert.py train_base  > test.log 2>&1 &
476 | 
477 | nohup python -u ./core/bert.py train_base  > extend_bert_mean_bin_1.log 2>&1 &
478 | 
479 | nohup python -u ./core/bert.py boost_train 10 >> boost_1.log 2>&1 &
480 | 
481 | """


--------------------------------------------------------------------------------
/core/conf.py:
--------------------------------------------------------------------------------
 1 | from random import randrange
 2 | 
 3 | input_dir = './input/zip/'
 4 | 
 5 | type_dict = {
 6 |     'type_id': 'str',
 7 | 
 8 | }
 9 | 
10 | word2vec_tx, vector_size = './input/Tencent_AILab_ChineseEmbedding.txt',  200
11 | 
12 | word2vec_tx_mini = './input/mini_tx.kv'
13 | 
14 | num_classes = 126  #get_label_id()
15 | 
16 | 
17 | bert_wv = "./input/bert.kv"
18 | ####Bert Config
19 | import os
20 | pretrained_path = '/users/hdpsbp/HadoopDir/felix/xf_tag/input/roebert' #'./input/model/chinese_L-12_H-768_A-12'
21 | 
22 | if not os.path.exists(pretrained_path):
23 |     pretrained_path =  '/home/aladdin1/felix/robert'
24 | 
25 | #pretrained_path = './input/model/chinese_wwm_ext_L-12_H-768_A-12'
26 | config_path = os.path.join(pretrained_path, 'bert_config_large.json')
27 | checkpoint_path = os.path.join(pretrained_path, 'roberta_zh_large_model.ckpt')
28 | vocab_path = os.path.join(pretrained_path, 'vocab.txt')
29 | 
30 | check_type_list = ['stb', '50', '100', '200', '300','1000',
31 |                    #'a2', 'a3', 'bd',
32 |                     ]
33 | 
34 | #######
35 | 
36 | xlnet_path='/users/hdpsbp/HadoopDir/felix/xlnet'


--------------------------------------------------------------------------------
/core/del.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | 
 4 | import numpy as np
 5 | 
 6 | from keras_xlnet import Tokenizer, load_trained_model_from_checkpoint, ATTENTION_TYPE_BI
 7 | 
 8 | 
 9 | 
10 | '''Can be found at https://github.com/ymcui/Chinese-PreTrained-XLNet'''
11 | checkpoint_path = '/users/hdpsbp/HadoopDir/felix/xlnet'
12 | vocab_path = os.path.join(checkpoint_path, 'spiece.model')
13 | config_path = os.path.join(checkpoint_path, 'xlnet_config.json')
14 | model_path = os.path.join(checkpoint_path, 'xlnet_model.ckpt')
15 | 
16 | # Tokenize inputs
17 | tokenizer = Tokenizer(vocab_path)
18 | text = "给岁月以文明"
19 | tokens = tokenizer.encode(text)
20 | 
21 | 


--------------------------------------------------------------------------------
/core/ensemble.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | 
  3 | from core.conf import *
  4 | from core.feature import *
  5 | 
  6 | static_list = [
  7 | # './output/stacking/v6_0_0.804024_6006_10605_b1_e1_m50.h5',
  8 | # './output/stacking/v6_0_0.804532_6006_10605_b2_e1_m50.h5',
  9 | # './output/stacking/v6_1_0.789411_5918_10451_b2_e1_m50.h5',
 10 | # './output/stacking/v6_1_0.792143_5918_10451_b1_e1_m50.h5',
 11 | # './output/stacking/v6_2_0.790876_6237_11068_b1_e1_m50.h5',
 12 | # './output/stacking/v6_2_0.791542_6237_11068_b1_e1_m50.h5',
 13 | # './output/stacking/v6_3_0.799421_5996_10562_b1_e1_m50.h5',
 14 | # './output/stacking/v6_3_0.801635_5996_10562_b1_e1_m50.h5',
 15 | # './output/stacking/v6_4_0.765271_6977_12388_b4_e1_m20.h5',
 16 | # './output/stacking/v6_4_0.766215_6977_06977_b0_e1_m20.h5',
 17 | ]
 18 | @lru_cache()
 19 | def get_top_file(fold,version):
 20 |     from glob import glob
 21 |     file_list = sorted(glob(f'./output/stacking/{version}_{fold}_*.h5'), reverse=True)
 22 | 
 23 |     if static_list:
 24 |         file_list = [ file for file in file_list if file in static_list]
 25 |     return file_list
 26 | 
 27 | @lru_cache()
 28 | def get_file_list(version, top=2,):
 29 |     file_list  = []
 30 |     for fold in range(5):
 31 |         tmp = get_top_file(fold, version)
 32 |         if len(tmp) < top:
 33 |             logger.warning(f'At least need {top} files for fold:{fold}')
 34 |         file_list = file_list + tmp[:top]
 35 |     return tuple(file_list)
 36 | 
 37 | @lru_cache()
 38 | @timed()
 39 | def get_feature_oof(file_list, weight=1,base_train=True):
 40 | 
 41 |     train_list = []
 42 |     test_list = []
 43 | 
 44 |     for file in tqdm(file_list,f'gen oof from {len(file_list)} files'):
 45 |         cur_weight = weight if weight > 0 else get_best_weight(file, base_train=base_train)
 46 | 
 47 |         #Train begin
 48 |         tmp = pd.read_hdf(file, 'train')
 49 |         col_list = tmp.columns[:num_classes]
 50 |         tmp['app_id'] = tmp.index.str[:32].values
 51 |         tmp['bin'] = tmp.index.str[-1].values.astype(int)
 52 |         tmp = tmp.sort_values(['app_id', 'bin', 'label'])
 53 |         tmp = tmp.drop_duplicates(['app_id', 'bin'])
 54 | 
 55 |         tmp.loc[tmp.bin == 0, col_list] = tmp.loc[tmp.bin == 0, col_list] * cur_weight
 56 |         tmp.loc[tmp.bin == 1, col_list] = tmp.loc[tmp.bin == 1, col_list] * (1 - cur_weight)
 57 |         tmp.label = tmp.label.astype(int)
 58 |         tmp = tmp.loc[tmp.bin.isin([0, 1])].groupby('app_id').mean()
 59 | 
 60 |         train_list.append(tmp)
 61 | 
 62 |         #Test begin
 63 |         tmp = pd.read_hdf(file, 'test')
 64 |         tmp['app_id'] = tmp.index.str[:32].values
 65 |         tmp['bin'] = tmp.index.str[-1].values.astype(int)
 66 |         tmp.loc[tmp.bin == 0, col_list] = tmp.loc[tmp.bin == 0, col_list] * cur_weight
 67 |         tmp.loc[tmp.bin == 1, col_list] = tmp.loc[tmp.bin == 1, col_list] * (1 - cur_weight)
 68 |         tmp = tmp.loc[tmp.bin.isin([0, 1])].groupby('app_id').mean()
 69 |         test_list.append(tmp)
 70 | 
 71 |     train = pd.concat(train_list)
 72 | 
 73 |     test = pd.concat(test_list)
 74 | 
 75 |     oof = pd.concat([train, test])
 76 |     print('oof, before=', oof.shape)
 77 |     oof = oof.groupby(oof.index).mean()
 78 |     print('oof, after=', oof.shape)
 79 |     del oof['bin']
 80 |     oof.label = oof.label.fillna(0).astype(int).astype(str)
 81 |     return oof
 82 | 
 83 | @timed()
 84 | def gen_sub_file(res, file_name, topn=2):
 85 |     res = res.copy()
 86 |     res_raw = res.copy()
 87 | 
 88 |     for i in tqdm(range(1, 1+topn), desc=f'Cal label#1-{topn} value for res:{res.shape}'):
 89 |         res.loc[:, f'label{i}'] = res.iloc[:, :num_classes].idxmax(axis=1)
 90 |         res_raw.loc[:, f'label{i}'] = res.loc[:, f'label{i}']
 91 | 
 92 |         for index, col in res[f'label{i}'].items():
 93 |             res.loc[index, col] = np.nan
 94 | 
 95 | 
 96 |     if file_name:
 97 |         from spider.mi import get_train_ph2_index
 98 |         train_ph2 = get_train_ph2_index()
 99 | 
100 |         res_bk = res.copy().loc[~res.index.str[6:].isin(train_ph2.id.str[6:].values)]
101 |         for res in [res, res_bk]:
102 |             res.index.name = 'id'
103 |             sub_file = f'./output/sub/{len(res)}_{file_name}'
104 |             res[['label1', 'label2']].to_csv(sub_file)
105 |             logger.info(f'Sub file save to :{sub_file}')
106 | 
107 |     return res_raw
108 | 
109 | 
110 | 
111 | @timed()
112 | def get_best_weight(file, base_train):
113 |     import pandas as pd
114 |     if base_train:
115 |         df = pd.read_hdf(file, 'train')
116 |     else:
117 |         df = pd.read_hdf(file, 'test')
118 | 
119 |         from spider.mi import get_train_ph2_index
120 |         ph2_train = get_train_ph2_index()
121 |         ph2_train = ph2_train.set_index('id')
122 |         df = df.loc[pd.Series(df.index).str[:32].isin(ph2_train.index).values]
123 |         df['label'] = ph2_train.loc[pd.Series(df.index).str[:32]].type_id.str[:6].values.astype(int)
124 | 
125 | 
126 |     df['bin'] = df.index.str[-1].astype(int)
127 | 
128 |     col_list = df.columns[:num_classes]
129 |     #print(col_list)
130 |     df['bin'] = df.index.str[-1].astype(int)
131 |     df['app_id'] = df.index.str[:32]
132 | 
133 |     if len(df.loc[df.bin==1]) ==0 :
134 |         return 1
135 | 
136 |     print(df.bin.value_counts())
137 |     df = df.sort_values(['app_id', 'bin', 'label'])
138 |     df = df.drop_duplicates(['app_id', 'bin'])
139 | 
140 |     score ={}
141 | 
142 |     for weight in tqdm(np.arange(0.7, 1.01, 0.05), desc=f'Cal best for {file}'):
143 |         weight = round(weight, 2)
144 |         tmp = df.copy()
145 |         # print(tmp.label.head(3))
146 |         tmp.loc[tmp.bin == 0, col_list] = tmp.loc[tmp.bin == 0, col_list] * weight
147 |         tmp.loc[tmp.bin == 1, col_list] = tmp.loc[tmp.bin == 1, col_list] * (1 - weight)
148 | 
149 |         # tmp = tmp.loc[tmp.bin==0]
150 |         tmp = tmp.loc[tmp.bin.isin([0, 1])]
151 |         #print(tmp.bin.value_counts())
152 |         tmp = tmp.groupby('app_id').mean()
153 | 
154 |         # print(tmp.shape)
155 |         # print(tmp.label.head(3))
156 |         tmp.label = tmp.label.astype(int)
157 |         # print(tmp.shape)
158 |         score_list = accuracy(tmp)
159 |         logger.info(f'weight:{weight}, score_list:{score_list}. base_train:{base_train}, File:{file}')
160 |         total = score_list[1]
161 |         score[weight] = total
162 | 
163 |     logger.info(f'Score list for file:{file}\n{score}')
164 | 
165 |     base_score = list(score.values())[-1]
166 | 
167 |     score = sorted(score.items(), key=lambda kv: kv[1])
168 |     best_score = score[-1][-1]
169 |     best_weight = score[-1][0]
170 |     grow = best_score-base_score
171 | 
172 |     logger.info(f'====best_weight:{best_weight:3.2}, best_score:{best_score:6.5f}/{grow:6.5f},base_train:{base_train},File:{file}')
173 |     return best_weight
174 | 
175 | 
176 | def compare(file='./output/sub/80000_v36_07_bt_True_mean_top2_000_865700.csv'):
177 |     df = pd.read_csv(file)
178 | 
179 |     from spider.mi import get_final_feature
180 |     final = get_final_feature()
181 | 
182 |     df = pd.merge(final, df, how='left', on='id')
183 | 
184 |     def check(row):
185 |         if len(str(row.type_id)) == 0:
186 |             return None
187 | 
188 |         label_list = row.type_id.split('|')
189 | 
190 |         return str(row.label1) in label_list or str(row.label2) in label_list
191 | 
192 |     df['is_corr'] = df.apply(lambda row: check(row), axis=1)
193 | 
194 |     print(df.shape, '\n', df.is_corr.value_counts())
195 |     df = df.loc[df.is_corr == False]
196 | 
197 |     type_name = get_app_type()
198 |     type_name = type_name.set_index('type_id')
199 |     type_name.index = type_name.index.astype(str)
200 | 
201 |     df.label1 = df.label1.astype(str).replace(type_name.to_dict()['type_name'])
202 |     df.label2 = df.label2.astype(str).replace(type_name.to_dict()['type_name'])
203 |     df.type_id = df.type_id.astype(str).replace(type_name.to_dict()['type_name'])
204 | 
205 |     print(df['from'].value_counts())
206 | 
207 |     return df
208 | 
209 | @timed()
210 | def main():
211 |     for top in [4]:
212 |         for weight in [0]:
213 |             version = get_args().version
214 |             with timed_bolck(f'Cal sub for top:{top}, weight:{weight:3.2f}, version:{version}'):
215 |                 for base_train in [True]:
216 | 
217 |                     file_list_1 = get_file_list('v36', top)
218 |                     file_list_2 = get_file_list('v43', top)
219 |                     file_list_3 = get_file_list('v72', top)
220 |                     file_list_4 = get_file_list('v73', top)
221 | 
222 | 
223 |                     file_list = file_list_1 +  file_list_2 + file_list_3 + file_list_4
224 |                     logger.info(f'File List:{file_list}')
225 | 
226 |                     res = get_feature_oof(file_list, weight, base_train)
227 | 
228 |                     train = res.loc[res.label != '0']
229 |                     score_list = accuracy(train)
230 |                     total = score_list[1]
231 | 
232 |                     res.to_csv(f'./output/{version}_bt_{base_train}_ex_change_file_top{top}_w{weight}_{int(total * 10 ** 6):06}.csv')
233 |                     file_name = f'{version}_{len(file_list):02}_bt_{base_train}_mean_top{top}_{int(weight * 100):03}_{int(total * 10 ** 6):06}.csv'
234 |                     res = gen_sub_file(res.loc[res.label == '0'], file_name)
235 |                     # logger.info(f'Sub file save to:{file_name}')
236 | 
237 | 
238 | if __name__== '__main__':
239 |     FUNCTION_MAP = {'main': main,  }
240 | 
241 |     args = get_args()
242 | 
243 |     func = FUNCTION_MAP[args.command]
244 |     func()
245 | 
246 | 
247 | 
248 | """
249 | nohup python -u ./core/ensemble.py  main  >> ensemble.log 2>&1 &
250 | """
251 | 
252 | 
253 | 
254 | 


--------------------------------------------------------------------------------
/core/ensemble_new.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | 
  3 | from core.conf import *
  4 | from core.feature import *
  5 | 
  6 | static_list = [
  7 | # './output/stacking/v6_0_0.804024_6006_10605_b1_e1_m50.h5',
  8 | # './output/stacking/v6_0_0.804532_6006_10605_b2_e1_m50.h5',
  9 | # './output/stacking/v6_1_0.789411_5918_10451_b2_e1_m50.h5',
 10 | # './output/stacking/v6_1_0.792143_5918_10451_b1_e1_m50.h5',
 11 | # './output/stacking/v6_2_0.790876_6237_11068_b1_e1_m50.h5',
 12 | # './output/stacking/v6_2_0.791542_6237_11068_b1_e1_m50.h5',
 13 | # './output/stacking/v6_3_0.799421_5996_10562_b1_e1_m50.h5',
 14 | # './output/stacking/v6_3_0.801635_5996_10562_b1_e1_m50.h5',
 15 | # './output/stacking/v6_4_0.765271_6977_12388_b4_e1_m20.h5',
 16 | # './output/stacking/v6_4_0.766215_6977_06977_b0_e1_m20.h5',
 17 | ]
 18 | @lru_cache()
 19 | def get_top_file(fold,version):
 20 |     from glob import glob
 21 |     file_list = sorted(glob(f'./output/stacking/{version}_{fold}_*.h5'), reverse=True)
 22 | 
 23 |     if static_list:
 24 |         file_list = [ file for file in file_list if file in static_list]
 25 |     return file_list
 26 | 
 27 | @lru_cache()
 28 | def get_file_list(version, top=2,):
 29 |     file_list  = []
 30 |     for fold in range(5):
 31 |         tmp = get_top_file(fold, version)
 32 |         if len(tmp) < top:
 33 |             logger.warning(f'At least need {top} files for fold:{fold}')
 34 |         file_list = file_list + tmp[:top]
 35 |     return tuple(file_list)
 36 | 
 37 | @lru_cache()
 38 | @timed()
 39 | def get_feature_oof(file_list, weight=1,base_train=True):
 40 | 
 41 |     train_list = []
 42 |     test_list = []
 43 | 
 44 |     for file in tqdm(file_list,f'gen oof from {len(file_list)} files'):
 45 |         cur_weight = weight if weight > 0 else get_best_weight(file, base_train=base_train)
 46 | 
 47 |         #Train begin
 48 |         tmp = pd.read_hdf(file, 'train')
 49 |         col_list = tmp.columns[:num_classes]
 50 |         tmp['app_id'] = tmp.index.str[:32].values
 51 |         tmp['bin'] = tmp.index.str[-1].values.astype(int)
 52 |         tmp = tmp.sort_values(['app_id', 'bin', 'label'])
 53 |         tmp = tmp.drop_duplicates(['app_id', 'bin'])
 54 | 
 55 |         tmp.loc[tmp.bin == 0, col_list] = tmp.loc[tmp.bin == 0, col_list] * cur_weight
 56 |         tmp.loc[tmp.bin == 1, col_list] = tmp.loc[tmp.bin == 1, col_list] * (1 - cur_weight)
 57 |         tmp.label = tmp.label.astype(int)
 58 |         tmp = tmp.loc[tmp.bin.isin([0, 1])].groupby('app_id').mean()
 59 | 
 60 |         train_list.append(tmp)
 61 | 
 62 |         #Test begin
 63 |         tmp = pd.read_hdf(file, 'test')
 64 |         tmp['app_id'] = tmp.index.str[:32].values
 65 |         tmp['bin'] = tmp.index.str[-1].values.astype(int)
 66 |         tmp.loc[tmp.bin == 0, col_list] = tmp.loc[tmp.bin == 0, col_list] * cur_weight
 67 |         tmp.loc[tmp.bin == 1, col_list] = tmp.loc[tmp.bin == 1, col_list] * (1 - cur_weight)
 68 |         tmp = tmp.loc[tmp.bin.isin([0, 1])].groupby('app_id').mean()
 69 |         test_list.append(tmp)
 70 | 
 71 |     train = pd.concat(train_list)
 72 | 
 73 |     test = pd.concat(test_list)
 74 | 
 75 |     oof = pd.concat([train, test])
 76 |     print('oof, before=', oof.shape)
 77 |     oof = oof.groupby(oof.index).mean()
 78 |     print('oof, after=', oof.shape)
 79 |     del oof['bin']
 80 |     oof.label = oof.label.fillna(0).astype(int).astype(str)
 81 |     return oof
 82 | 
 83 | @timed()
 84 | def gen_sub_file(res, file_name, topn=2):
 85 |     res = res.copy()
 86 |     res_raw = res.copy()
 87 | 
 88 |     for i in tqdm(range(1, 1+topn), desc=f'Cal label#1-{topn} value for res:{res.shape}'):
 89 |         res.loc[:, f'label{i}'] = res.iloc[:, :num_classes].idxmax(axis=1)
 90 |         res_raw.loc[:, f'label{i}'] = res.loc[:, f'label{i}']
 91 | 
 92 |         for index, col in res[f'label{i}'].items():
 93 |             res.loc[index, col] = np.nan
 94 | 
 95 | 
 96 |     if file_name:
 97 |         from spider.mi import get_train_ph2_index
 98 |         train_ph2 = get_train_ph2_index()
 99 | 
100 |         res_bk = res.copy().loc[~res.index.str[6:].isin(train_ph2.id.str[6:].values)]
101 |         for res in [res, res_bk]:
102 |             res.index.name = 'id'
103 |             sub_file = f'./output/sub/{len(res)}_{file_name}'
104 |             res[['label1', 'label2']].to_csv(sub_file)
105 |             logger.info(f'Sub file save to :{sub_file}')
106 | 
107 |     return res_raw
108 | 
109 | 
110 | 
111 | @timed()
112 | def get_best_weight(file, base_train):
113 |     import pandas as pd
114 |     if base_train:
115 |         df = pd.read_hdf(file, 'train')
116 |     else:
117 |         df = pd.read_hdf(file, 'test')
118 | 
119 |         from spider.mi import get_train_ph2_index
120 |         ph2_train = get_train_ph2_index()
121 |         ph2_train = ph2_train.set_index('id')
122 |         df = df.loc[pd.Series(df.index).str[:32].isin(ph2_train.index).values]
123 |         df['label'] = ph2_train.loc[pd.Series(df.index).str[:32]].type_id.str[:6].values.astype(int)
124 | 
125 | 
126 |     df['bin'] = df.index.str[-1].astype(int)
127 | 
128 |     col_list = df.columns[:num_classes]
129 |     #print(col_list)
130 |     df['bin'] = df.index.str[-1].astype(int)
131 |     df['app_id'] = df.index.str[:32]
132 | 
133 |     if len(df.loc[df.bin==1]) ==0 :
134 |         return 1
135 | 
136 |     print(df.bin.value_counts())
137 |     df = df.sort_values(['app_id', 'bin', 'label'])
138 |     df = df.drop_duplicates(['app_id', 'bin'])
139 | 
140 |     score ={}
141 | 
142 |     for weight in tqdm(np.arange(0.7, 1.01, 0.05), desc=f'Cal best for {file}'):
143 |         weight = round(weight, 2)
144 |         tmp = df.copy()
145 |         # print(tmp.label.head(3))
146 |         tmp.loc[tmp.bin == 0, col_list] = tmp.loc[tmp.bin == 0, col_list] * weight
147 |         tmp.loc[tmp.bin == 1, col_list] = tmp.loc[tmp.bin == 1, col_list] * (1 - weight)
148 | 
149 |         # tmp = tmp.loc[tmp.bin==0]
150 |         tmp = tmp.loc[tmp.bin.isin([0, 1])]
151 |         #print(tmp.bin.value_counts())
152 |         tmp = tmp.groupby('app_id').mean()
153 | 
154 |         # print(tmp.shape)
155 |         # print(tmp.label.head(3))
156 |         tmp.label = tmp.label.astype(int)
157 |         # print(tmp.shape)
158 |         score_list = accuracy(tmp)
159 |         logger.info(f'weight:{weight}, score_list:{score_list}. base_train:{base_train}, File:{file}')
160 |         total = score_list[1]
161 |         score[weight] = total
162 | 
163 |     logger.info(f'Score list for file:{file}\n{score}')
164 | 
165 |     base_score = list(score.values())[-1]
166 | 
167 |     score = sorted(score.items(), key=lambda kv: kv[1])
168 |     best_score = score[-1][-1]
169 |     best_weight = score[-1][0]
170 |     grow = best_score-base_score
171 | 
172 |     logger.info(f'====best_weight:{best_weight:3.2}, best_score:{best_score:6.5f}/{grow:6.5f},base_train:{base_train},File:{file}')
173 |     return best_weight
174 | 
175 | 
176 | def compare(file='./output/sub/80000_v36_07_bt_True_mean_top2_000_865700.csv'):
177 |     df = pd.read_csv(file)
178 | 
179 |     from spider.mi import get_final_feature
180 |     final = get_final_feature()
181 | 
182 |     df = pd.merge(final, df, how='left', on='id')
183 | 
184 |     def check(row):
185 |         if len(str(row.type_id)) == 0:
186 |             return None
187 | 
188 |         label_list = row.type_id.split('|')
189 | 
190 |         return str(row.label1) in label_list or str(row.label2) in label_list
191 | 
192 |     df['is_corr'] = df.apply(lambda row: check(row), axis=1)
193 | 
194 |     print(df.shape, '\n', df.is_corr.value_counts())
195 |     df = df.loc[df.is_corr == False]
196 | 
197 |     type_name = get_app_type()
198 |     type_name = type_name.set_index('type_id')
199 |     type_name.index = type_name.index.astype(str)
200 | 
201 |     df.label1 = df.label1.astype(str).replace(type_name.to_dict()['type_name'])
202 |     df.label2 = df.label2.astype(str).replace(type_name.to_dict()['type_name'])
203 |     df.type_id = df.type_id.astype(str).replace(type_name.to_dict()['type_name'])
204 | 
205 |     print(df['from'].value_counts())
206 | 
207 |     return df
208 | 
209 | @timed()
210 | @file_cache()
211 | def get_oof_version(version, top, weight):
212 | 
213 | 
214 |     with timed_bolck(f'Cal sub for top:{top}, weight:{weight:3.2f}, version:{version}'):
215 |         for base_train in [True]:
216 |             file_list = get_file_list(version, top)
217 | 
218 |             # file_list = file_list_1 +  file_list_2 + file_list_3 + file_list_4
219 |             logger.info(f'File List {version} :{file_list}')
220 | 
221 |             res = get_feature_oof(file_list, weight, base_train)
222 | 
223 |     return res#, len(file_list)
224 | 
225 | @timed()
226 | def main():
227 |     oof_list  = []
228 |     file_cnt = 0
229 |     top = 4
230 |     weight = 0
231 |     oof_weight_list = []
232 |     for version, w in zip (['v36', 'v43','v72','v73','v74','v75'], [0.7 ,0.7,0.8,1,1,1 ] ):
233 |         #version = get_args().version
234 | 
235 |         res = get_oof_version(version, top, weight)
236 |         #Align all the porbiblity to 1
237 | 
238 | 
239 |         train = res.loc[res.label != '0']
240 |         # score_list = accuracy(train)
241 |         # #oof_weight_list.append((score_list[1]))
242 |         # logger.info(f'Score for train{train.shape}/{res.shape}:{version}:{score_list}')
243 | 
244 |         res.iloc[:, :-1] =  w * res.iloc[:, :-1].apply(lambda row: row / row.sum(), axis=1)
245 | 
246 |         oof_list.append(res)
247 | 
248 | 
249 |     oof = pd.concat(oof_list)
250 | 
251 |     oof.to_pickle(f'./output/{file_cnt:02}_{len(oof_list)}_tmp_res_val.pkl')
252 | 
253 |     label_raw = oof_list[0].label#.drop_duplicates()
254 |     print('oof, final before=', oof.shape)
255 |     oof = oof.groupby(oof.index).mean()
256 |     #oof.iloc[:, :-1] = oof.iloc[:, :-1].apply(lambda row: row / row.sum(), axis=1)
257 | 
258 |     print('oof, final after=', oof.shape)
259 |     oof['label'] = label_raw
260 | 
261 |     oof['label'] = oof['label'].fillna('0')
262 | 
263 |     res =  oof
264 |     train = res.loc[res.label != '0']
265 | 
266 |     score_list = accuracy(train)
267 |     total = score_list[1]
268 |     logger.info(f'get the final score:{total} base on train:{train.shape}')
269 | 
270 |     ex_file = f'./output/{version}_bt_change_file_top{top}_w{weight}_{int(total * 10 ** 6):06}.csv'
271 |     res.to_csv(ex_file)
272 |     logger.info(f'Exchange file save to:{ex_file}')
273 |     file_name = f'{version}_{file_cnt:02}_new_mean_top{top}_{int(weight * 100):03}_{int(total * 10 ** 6):06}.csv'
274 |     res = gen_sub_file(res.loc[res.label == '0'], file_name)
275 | 
276 | 
277 | 
278 | if __name__== '__main__':
279 |     FUNCTION_MAP = {'main': main,  }
280 | 
281 |     args = get_args()
282 | 
283 |     func = FUNCTION_MAP[args.command]
284 |     func()
285 | 
286 | 
287 | 
288 | """
289 | nohup python -u ./core/ensemble_new.py  main  >> ensemble_final.log 2>&1 &
290 | """
291 | 
292 | 
293 | 
294 | 


--------------------------------------------------------------------------------
/core/mini.py:
--------------------------------------------------------------------------------
  1 | 
  2 | from file_cache.utils.util_log import *
  3 | from core.conf import *
  4 | 
  5 | vector_size = 200
  6 | 
  7 | def gen_mini_embedding(wv_from_text, word_list):
  8 |     from multiprocessing.dummy import Pool
  9 | 
 10 |     from functools import partial
 11 | 
 12 |     partition_num = 8
 13 |     import math
 14 |     partition_length = math.ceil(len(word_list)/partition_num)
 15 | 
 16 |     partition_list = [ word_list[i:i+partition_length]  for i in range(0, len(word_list), partition_length )]
 17 |     logger.debug(f'The word list split to {len(partition_list)} partitions:{[ len(partition) for partition in partition_list]}')
 18 |     thread_pool = Pool(processes=partition_num)
 19 |     process = partial(gen_mini_partition,wv_from_text=wv_from_text )
 20 | 
 21 |     wv_list = thread_pool.map(process, partition_list)
 22 |     thread_pool.close(); thread_pool.join()
 23 | 
 24 |     del wv_from_text
 25 | 
 26 |     return pd.concat(wv_list)
 27 | 
 28 | 
 29 | def compute_ngrams(word, min_n, max_n):
 30 |     # BOW, EOW = ('<', '>')  # Used by FastText to attach to all words as prefix and suffix
 31 |     extended_word = word
 32 |     ngrams = []
 33 |     for ngram_length in range(min_n, min(len(extended_word), max_n) + 1):
 34 |         for i in range(0, len(extended_word) - ngram_length + 1):
 35 |             ngrams.append(extended_word[i:i + ngram_length])
 36 |     res =  list(set(ngrams))
 37 |     return res
 38 | 
 39 | def wordVec(word,wv_from_text:dict,min_n = 1, max_n = 3):
 40 |     '''
 41 |     ngrams_single/ngrams_more,主要是为了当出现oov的情况下,最好先不考虑单字词向量
 42 |     '''
 43 | 
 44 |     # 如果在词典之中，直接返回词向量
 45 |     if word in wv_from_text.index:
 46 |         return wv_from_text.loc[word]
 47 |     else:
 48 |         logger.warning(f'Cannot find this word directly:{word}')
 49 |         word_size = vector_size
 50 |         # 计算word的ngrams词组
 51 |         ngrams = compute_ngrams(word,min_n = min_n, max_n = max_n)
 52 |         # 不在词典的情况下
 53 |         word_vec = np.zeros(word_size, dtype=np.float32)
 54 |         ngrams_found = 0
 55 |         ngrams_single = [ng for ng in ngrams if len(ng) == 1]
 56 |         ngrams_more = [ng for ng in ngrams if len(ng) > 1]
 57 |         # 先只接受2个单词长度以上的词向量
 58 |         for ngram in ngrams_more:
 59 |             if ngram in wv_from_text.index:
 60 |                 word_vec += wv_from_text.loc[ngram]
 61 |                 ngrams_found += 1
 62 |                 #print(ngram)
 63 |         # 如果，没有匹配到，那么最后是考虑单个词向量
 64 |         if ngrams_found == 0:
 65 |             for ngram in ngrams_single:
 66 |                 if ngram in wv_from_text.index:
 67 |                     word_vec += wv_from_text.loc[ngram]
 68 |                     ngrams_found += 1
 69 |                 elif ngram.lower() in wv_from_text.index:
 70 |                     word_vec += wv_from_text.loc[ngram.lower()]
 71 |                     ngrams_found += 1
 72 |                 else:
 73 |                     logger.warning(f'Can not find {ngram} in wv')
 74 |         if ngrams_found > 0:
 75 |             return word_vec / max(1, ngrams_found)
 76 |         else:
 77 |             logger.error('all ngrams for word "%s" absent from model' % word)
 78 |             return None
 79 | 
 80 | @timed()
 81 | def gen_mini_partition(word_set, wv_from_text):
 82 | 
 83 |     mini = pd.DataFrame(np.zeros((len(word_set), vector_size)), index=word_set, )
 84 |     # for i in tqdm(range(len(word_set))):
 85 |     for i in range(len(word_set)):
 86 |         word = word_set[i]
 87 |         vector = wordVec(word, wv_from_text, 1, 3)
 88 |         if vector is not None:
 89 |             mini.loc[word] = vector
 90 |         else:
 91 |             logger.debug(f'Can not find vec for:{len(word)},{word}')
 92 |             mini.loc[word] = np.zeros(vector_size)
 93 | 
 94 |     return mini
 95 | 
 96 | @timed()
 97 | def gen_tx_mini():
 98 |     #word2vec_tx, vector_size = './input/Tencent_AILab_ChineseEmbedding.txt', 200
 99 | 
100 |     from core.feature import load_embedding, get_word_cnt
101 | 
102 |     embed = load_embedding(word2vec_tx, type='txt')
103 |     word_list = get_word_cnt()
104 |     logger.info(word_list[:5])
105 |     data = gen_mini_embedding(embed, word_list.word.values)
106 | 
107 |     logger.debug(f'The length of the vector is {data.shape}')
108 | 
109 |     fname = "./input/mini_tx.kv"
110 |     np.savetxt(fname, data.reset_index().values,
111 |                delimiter=" ",
112 |                header="{} {}".format(len(data), len(data.columns)),
113 |                comments="",
114 |                fmt=["%s"] + ["%.6f"] * len(data.columns))
115 | 
116 |     logger.info(f'Mini dict save to {fname}')
117 | 
118 | if __name__ == '__main__':
119 |    from fire import Fire
120 |    Fire()
121 | 
122 |    """
123 |    nohup python -u core/mini.py gen_tx_mini > mini.log 2>&1 &
124 |    """


--------------------------------------------------------------------------------
/core/split.py:
--------------------------------------------------------------------------------
 1 | from sklearn.model_selection import StratifiedKFold
 2 | 
 3 | from core.feature import *
 4 | 
 5 | 
 6 | def get_split_group(random_state=2019):
 7 |     apptype_train = pd.read_csv(f'{input_dir}/apptype_train.dat', sep='\t',
 8 |                                 names=['app_id', 'type_id', 'app_des'],
 9 |                                 quoting=3,
10 |                                 )
11 | 
12 |     apptype_train = apptype_train.sort_values('app_id')
13 |     folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
14 | 
15 |     gp_list = list(folds.split(apptype_train, apptype_train.type_id.astype('category').cat.codes))
16 | 
17 |     train_list = [apptype_train.iloc[gp, 0].values for gp, _ in gp_list]
18 | 
19 |     val_list = [apptype_train.iloc[gp, 0].values for _, gp in gp_list]
20 | 
21 |     return train_list, val_list
22 | 
23 | 
24 | @timed()
25 | def split_df_by_index_no_bin(df, fold):
26 |     #
27 |     # sn = pd.Series(df.index).str[-1].astype(int)
28 |     df = pd.Series(df.index).str[:32]
29 | 
30 | 
31 | 
32 |     train_list, val_list = get_split_group()
33 |     train_gp = train_list[fold]
34 |     val_gp = val_list[fold]
35 | 
36 |     return df.loc[(df.isin(train_gp))].index.values, \
37 |            df.loc[(df.isin(val_gp)) ].index.values
38 | 
39 | 
40 | def split_df_by_index(df, fold):
41 |     index = df.index
42 |     app_id = pd.Series(index).apply(lambda val: val.split('_')[0])
43 |     bin   = pd.Series(index).apply(lambda val: val.split('_')[-1]).astype(int)
44 |     df = pd.concat([app_id,bin], axis=1)
45 |     df.columns = ['app_id', 'bin']
46 | 
47 |     #print(df.shape, df.head)
48 |     train_list, val_list = get_split_group()
49 |     train_gp = train_list[fold]
50 |     val_gp = val_list[fold]
51 | 
52 |     train_bin = list(range(get_args().max_bin+1))
53 | 
54 |     val_bin= train_bin #[0,1]
55 | 
56 |     logger.info(f'split base on: train_bin:{train_bin}, val_bin:{val_bin}')
57 |     logger.info(f'The original bin_id distribution in train data set:\n {df.loc[(df.app_id.isin(train_bin))].bin.value_counts()}')
58 | 
59 |     logger.info(f'The original bin_id distribution in val data set:\n{ df.loc[(df.app_id.isin(val_gp))].bin.value_counts() } ')
60 | 
61 |     return df.loc[(df.app_id.isin(train_gp)) & (df.bin.isin(train_bin))].index.values, \
62 |            df.loc[(df.app_id.isin(val_gp)) &   (df.bin.isin(val_bin))].index.values
63 | 
64 | 
65 | if __name__ == '__main__':
66 |     for random in range(2019, 2099):
67 |         train_list, val_list = get_split_group(random)
68 |         gp = [len(val)  for val  in val_list]
69 |         print(np.array(gp).std(), gp, random)
70 | 


--------------------------------------------------------------------------------
/core/xlnet.py:
--------------------------------------------------------------------------------
  1 | 
  2 | from multiprocessing import Process
  3 | 
  4 | 
  5 | 
  6 | 
  7 | 
  8 | from core.feature_xlnet import *
  9 | from core.conf import *
 10 | 
 11 | import os
 12 | 
 13 | 
 14 | os.environ['TF_KERAS'] = '1'
 15 | 
 16 | oof_prefix = get_args().version
 17 | SEQ_LEN = get_args().seq_len  #randrange(128, 180) #-randrange(0, 5)*8
 18 | BATCH_SIZE = get_args().batch_size
 19 | 
 20 | #Batch size, MAX_len+ex_length, Manual, Manual GP feature cnt, frac
 21 | @lru_cache()
 22 | @timed()
 23 | def get_train_test_bert():
 24 | 
 25 |     frac = get_args().frac
 26 |     max_bin = get_args().max_bin
 27 |     min_len = int(SEQ_LEN*get_args().min_len_ratio)
 28 | 
 29 |     data = get_feature_xlnet(SEQ_LEN)
 30 | 
 31 |     #Keep all the bin group, if it's test data
 32 |     data = data.loc[(data.bin<=max_bin) | (pd.isna(data.type_id))]
 33 | 
 34 |     with timed_bolck(f'Remove gan data, and len is less then {min_len}'):
 35 |         data = data.loc[ (data.bin == 0) | (data['len_'] >= min_len) ]
 36 |         logger.info(f'Train max_bin:{max_bin},Total Bin distribution:\n{data.bin.value_counts().sort_index()}')
 37 | 
 38 |     data = data.sort_index()
 39 |     logger.info(f'Head of the data:\n, {data.iloc[:3,:3]}')
 40 | 
 41 |     train_data = data.loc[pd.notna(data.type_id)].sample(frac=frac, random_state=2019)
 42 |     labels = train_data.type_id.values.tolist()
 43 |     logger.info(f'Train Bin distribution:\n{train_data.bin.value_counts().sort_index()}')
 44 | 
 45 |     test_data =  data.loc[pd.isna(data.type_id)].sample(frac=1, random_state=2019)
 46 | 
 47 |     trial = get_args().trial
 48 |     logger.info(f'Test Bin distribution#{trial}:\n{test_data.bin.value_counts().sort_index()}')
 49 | 
 50 |     if trial > 0:
 51 |         test_data = test_data.loc[test_data.index.str[-1]=='0']
 52 | 
 53 | 
 54 |     logger.info(f'Train:{train_data.shape} Test#{trial}:{test_data.shape}, frac:{frac}')
 55 | 
 56 |     feature_col = [col for col in data.columns if col.startswith('fea_') or col.startswith('bert_')]
 57 | 
 58 |     label2id, id2label = get_label_id()
 59 |     #word2id = get_word2id()
 60 | 
 61 |     # Encode input words and labels
 62 |     X = train_data.loc[:, feature_col]
 63 |     Y = [label2id[label] for label in labels]
 64 | 
 65 | 
 66 |     X_test = test_data.loc[:, feature_col]
 67 | 
 68 | 
 69 |     return  X, pd.Series(Y, index=train_data.index), X_test
 70 | 
 71 | 
 72 | # X, y, X_test = get_train_test_bert(0.1)
 73 | #
 74 | #
 75 | # train_x, train_y = load_data(train_path)
 76 | # test_x, test_y = load_data(test_path)
 77 | 
 78 | def boost_train(boost=10):
 79 |     for _ in range(boost):
 80 |         p = Process(target=train_base)
 81 |         p.start()
 82 |         p.join()
 83 | 
 84 | 
 85 | @timed()
 86 | def filter_short_desc(X, y):
 87 |     X = X.copy().reset_index()
 88 |     bert_cols = [col for col in X.columns if str(col).startswith('bert_')]
 89 |     bert = X.loc[:, bert_cols]
 90 |     bert_len = bert.where(bert > 0).count(axis=1)
 91 |     old_len = len(bert_len)
 92 |     min_len = int(SEQ_LEN*get_args().min_len_ratio)
 93 |     bert_len = bert_len.loc[bert_len >= min_len]
 94 |     logger.info(f'Filter {old_len - len(bert_len)} records from {old_len} by threshold {min_len}')
 95 | 
 96 |     return X.iloc[bert_len.index], y[bert_len.index]
 97 | 
 98 | 
 99 | @timed()
100 | def train_base():
101 |     args = get_args()
102 |     #frac = args.frac
103 |     fold = args.fold
104 |     EPOCHS = args.epochs
105 | 
106 | 
107 |     LR = 2e-5
108 | 
109 |     BATCH_SIZE = get_args().batch_size
110 |     with timed_bolck(f'Prepare train data#{BATCH_SIZE}, LR:{LR}'):
111 |         X, y, _ = get_train_test_bert()
112 | 
113 |         ##Begin to define model
114 |         from keras_bert import load_trained_model_from_checkpoint
115 | 
116 | 
117 |         from keras_xlnet.backend import keras
118 |         from keras_bert.layers import Extract
119 |         from keras_xlnet import PretrainedList, get_pretrained_paths
120 |         from keras_xlnet import Tokenizer, load_trained_model_from_checkpoint, ATTENTION_TYPE_BI
121 | 
122 |         checkpoint_path = xlnet_path
123 |         logger.info(f'Start to train base on checkpoint:{checkpoint_path}')
124 | 
125 |         # EPOCH = 10
126 |         # BATCH_SIZE = 64
127 |         # SEQ_LEN = 50
128 |         # MODEL_NAME = 'SST-2.h5'
129 |         #
130 |         # current_path = os.path.dirname(os.path.abspath(__file__))
131 |         # train_path = os.path.join(current_path, 'train.tsv')
132 |         # dev_path = os.path.join(current_path, 'dev.tsv')
133 | 
134 |         # Load pretrained model
135 | 
136 |         #vocab_path = os.path.join(checkpoint_path, 'spiece.model')
137 |         config_path = os.path.join(checkpoint_path, 'xlnet_config.json')
138 |         model_path = os.path.join(checkpoint_path, 'xlnet_model.ckpt')
139 | 
140 |         #tokenizer = Tokenizer(paths.vocab)
141 |         model = load_trained_model_from_checkpoint(
142 |             config_path=config_path,
143 |             checkpoint_path=model_path,
144 |             batch_size=BATCH_SIZE,
145 |             memory_len=0,
146 |             target_len=SEQ_LEN,
147 |             in_train_phase=False,
148 |             attention_type=ATTENTION_TYPE_BI,
149 |         )
150 | 
151 |         # Build classification model
152 |         last = Extract(index=-1, name='Extract')(model.output)
153 |         dense = keras.layers.Dense(units=768, activation='tanh', name='Dense')(last)
154 |         dropout = keras.layers.Dropout(rate=0.1, name='Dropout')(dense)
155 |         output = keras.layers.Dense(units=num_classes, activation='softmax', name='Softmax')(dropout)
156 |         model = keras.models.Model(inputs=model.inputs, outputs=output)
157 |         model.summary()
158 | 
159 |         model.compile(
160 |             optimizer=keras.optimizers.Adam(lr=LR),
161 |             loss='categorical_crossentropy',
162 |             metrics=['accuracy'],
163 |         )
164 | 
165 |         ##End to define model
166 | 
167 |         input1_col = [col for col in X.columns if str(col).startswith('bert_')]
168 |         #input2_col = [col for col in X.columns if str(col).startswith('fea_')]
169 |         #max_words = len(input1_col)
170 |         model #= get_model(max_words)
171 | 
172 |         #get_feature_manual.cache_clear()
173 |         Y_cat = keras.utils.to_categorical(y, num_classes=num_classes)
174 |         #folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
175 | 
176 |     with timed_bolck(f'Training#{fold}'):
177 |         from core.split import split_df_by_index
178 |         train_idx, test_idx = split_df_by_index(X,fold)
179 | 
180 |         trunc_len_tran = 64*(len(train_idx)//64)
181 |         trunc_len_val = 64 * (len(test_idx) // 64)
182 | 
183 |         train_idx = train_idx[:trunc_len_tran]
184 |         test_idx = test_idx[:trunc_len_val]
185 | 
186 |         logger.info(f'Shape train_x.loc[:, input1_col].iloc[:,0]: {X.loc[:, input1_col].iloc[:,0].shape}')
187 |         train_x, train_y, val_x, val_y = \
188 |             X.iloc[train_idx], Y_cat[train_idx], X.iloc[test_idx], Y_cat[test_idx]
189 | 
190 |         logger.info(f'get_train_test output: train_x:{train_x.shape}, train_y:{train_y.shape}, val_x:{val_x.shape} ')
191 | 
192 |         #train_x, train_y = filter_short_desc(train_x, train_y)
193 | 
194 |         input1 = train_x.loc[:, input1_col]#.astype(np.float32)
195 |         input2 = np.zeros_like(input1)#.astype(np.int8)
196 |         input3  =  np.zeros_like(input1.iloc[:, :1])
197 | 
198 |         val1 = val_x.loc[:, input1_col]
199 |         val2 = np.zeros_like(val1)
200 |         val3 = np.zeros_like(val1.iloc[:,:1])
201 | 
202 |         logger.info(f'NN train_x:{train_x[:3]}')
203 |         min_len_ratio = get_args().min_len_ratio
204 |         max_bin = get_args().max_bin
205 |         logger.info(f'NN Input1:{input1.shape}, Input2:{input2.shape}, SEQ_LEN:{SEQ_LEN}, min_len_ratio:{min_len_ratio}, bin:{max_bin} ')
206 | 
207 |         from keras_bert import get_custom_objects
208 |         from tensorflow.python.keras.callbacks import EarlyStopping
209 |         import tensorflow as tf
210 | 
211 |         es = EarlyStopping(monitor='val_acc',patience=2, verbose=1)
212 |         with tf.keras.utils.custom_object_scope(get_custom_objects()):
213 |             his = model.fit([input1, input2, input3], train_y,
214 |                             validation_data = ([val1, val2, val3 ],
215 |                                                val_y),
216 |                             epochs=EPOCHS,  shuffle=True, batch_size=BATCH_SIZE,
217 |                             callbacks=[Cal_acc( val_x, y.iloc[test_idx] ), es]
218 |                       #steps_per_epoch=1000, validation_steps=10
219 |                       )
220 | 
221 | 
222 | 
223 |             #gen_sub(model, X_test, sn)
224 | 
225 |     return his
226 | 
227 | from tensorflow.python.keras.callbacks import Callback
228 | 
229 | 
230 | class Cal_acc(Callback):
231 | 
232 |     def __init__(self, val_x, y):
233 |         super(Cal_acc, self).__init__()
234 |         self.val_x , self.y = val_x, y
235 |         self.min_len = int(SEQ_LEN*get_args().min_len_ratio)
236 |         self.max_bin = get_args().max_bin
237 |         self.fold = get_args().fold
238 |         self.threshold = 0
239 |         self.feature_len = self.val_x.shape[1]
240 |         self.cur_epoch = 0
241 |         self.version = get_args().version
242 |         self.trial = get_args().trial
243 | 
244 |         self.max_score = 0
245 | 
246 |         self.score_list = np.zeros(get_args().epochs)
247 |         self.gen_file = False
248 | 
249 |         import time, os
250 |         self.batch_id = round(time.time())
251 |         self.model_folder = f'./output/model/{self.batch_id}/'
252 | 
253 |         os.makedirs(self.model_folder)
254 | 
255 | 
256 |         #logger.info(f'Cal_acc base on X:{self.X.shape}, Y:{self.y.shape}')
257 | 
258 |     #@timed()
259 |     def cal_acc(self):
260 |         input1_col = [col for col in self.val_x.columns if str(col).startswith('bert_')]
261 |         #input2_col = [col for col in self.val_x.columns if str(col).startswith('fea_')]
262 |         #model = self.model
263 |         #tmp_val =
264 |         tmp_y = self.y
265 | 
266 |         input1 = self.val_x.loc[:,input1_col]  # .astype(np.float32)
267 |         input2 = np.zeros_like(input1)  # .astype(np.int8)
268 |         input3 = np.zeros_like(input1.iloc[:, :1])
269 | 
270 |         val = self.model.predict([input1, input2, input3])
271 | 
272 |         label2id, id2label = get_label_id()
273 |         val = pd.DataFrame(val, columns=label2id.keys(), index=input1.index)
274 |         val['label'] = tmp_y.astype(int).replace(id2label).astype(int)
275 |         val['bin'] = pd.Series(val.index).str[-1].values.astype(int)
276 |         #logger.info(f'Head val#label:\n{val.label.head()}')
277 |         res_val = val.copy()
278 |         # res_val.to_pickle(f'./output/tmp_res_val.pkl')
279 |         # logger.info(f'Debug file: save to ./output/tmp_res_val.pkl')
280 | 
281 |         num_labels = 10
282 |         df_score = val.loc[val.bin==0]
283 |         score_list = accuracy(df_score, num_labels, f'no{self.cur_epoch},b{self.max_bin},{self.version}')
284 | 
285 |         logger.info(f'{len(df_score)}/{len(res_val)}, fold:{self.fold}, score for label1-f{num_labels}:{score_list}')
286 | 
287 |         return score_list,res_val
288 | 
289 |     @timed()
290 |     def cal_acc_ex(self):
291 |         input1_col = [col for col in self.val_x.columns if str(col).startswith('bert_')]
292 | 
293 |         if self.trial==0:
294 |             check_type_list =['val']
295 |         for type_ in tqdm(check_type_list,desc='cal_acc_ex'):
296 |             tmp_val ,tmp_y = self.get_tmp_val_test(type_)
297 |             #tmp_val = tmp_val.loc[:, input1_col]
298 | 
299 |             input1 = tmp_val.loc[:, input1_col]  # .astype(np.float32)
300 |             input2 = np.zeros_like(input1)  # .astype(np.int8)
301 |             input3 = np.zeros_like(input1.iloc[:, :1])
302 |             logger.info(f'{input1.shape},{input2.shape},{input3.shape}')
303 |             logger.info(input1[:3])
304 |             # logger.info(input2[:3])
305 |             # logger.info(input3[:3])
306 |             val = self.model.predict([input1,input2,input3])
307 | 
308 |             label2id, id2label = get_label_id()
309 |             val = pd.DataFrame(val, columns=label2id.keys(), index=tmp_val.index)
310 |             val['label'] = tmp_y.astype(int).replace(id2label).astype(int)
311 |             val['bin'] = pd.Series(val.index).str[-1].values.astype(int)
312 |             # logger.info(f'Head val#label:\n{val.label.head()}')
313 |             res_val = val.copy()
314 |             # res_val.to_pickle(f'./output/tmp_res_val.pkl')
315 |             # logger.info(f'Debug file: save to ./output/tmp_res_val.pkl')
316 | 
317 |             num_labels = 10
318 |             df_score = val.loc[val.bin == 0]
319 |             score_list = accuracy(df_score, num_labels, f'ex{self.cur_epoch},{self.version},b{self.max_bin},{type_}')
320 | 
321 |             logger.info(f'===cal_acc_ex{self.cur_epoch}:{type_}==={len(df_score)}/{len(res_val)}, fold:{self.fold}, score for label1-f{num_labels}:{score_list}')
322 | 
323 |         return score_list, res_val
324 | 
325 | 
326 |     @lru_cache()
327 |     @timed()
328 |     def get_tmp_val_test(self, type_):
329 |         _, _, test_all = get_train_test_bert()
330 | 
331 |         test = test_all.loc[pd.Series(test_all.index).str.startswith(type_).values]
332 | 
333 |         test = test.loc[(pd.Series(test.index).str[-1]=='0').values]
334 | 
335 |         logger.info(f'Split {type_}, {len(test)} rows from {len(test_all)}')
336 | 
337 |         test=test.copy()
338 |         type_ = 'x'*6 + pd.Series(test.index).str[:6]
339 |         test.index = 'x'*6 + pd.Series(test.index).str[6:]
340 | 
341 |         from spider.mi import get_train_ph2_index
342 |         train_ph2 =  get_train_ph2_index()
343 |         #final = final.loc[final.type_id.str.len() >= 1]
344 |         train_ph2.index = 'x'*6 + train_ph2['id'].str[6:]
345 |         #Align label with input test
346 |         index_old = test.index.copy()
347 |         test.index = pd.Series(test.index).apply(lambda val: val[:32])
348 | 
349 |         label = train_ph2.type_id.loc[test.index.values].str[:6] #type_id len is 6
350 | 
351 |         #Rollback index change
352 |         test.index = index_old
353 |         label.index = index_old
354 | 
355 |         test = test.loc[pd.notna(label).values]
356 |         label = label.dropna()
357 |         print('test, label, type_', test.shape, label.shape, type_.shape)
358 |         return test, label#, type_
359 | 
360 | 
361 |     def on_train_end(self, logs=None):
362 |         grow= max(self.score_list) - self.threshold
363 |         cut_ratio = get_args().cut_ratio
364 |         logger.info(f'Train END: Fold:{self.fold}, max:{max(self.score_list):7.6f}/{grow:+6.5f}, at {np.argmax(self.score_list)}/{len(self.score_list)-1}, his:{self.score_list}, max_bin:{self.max_bin}, cut:{cut_ratio}, min_len:{self.min_len:03}, SEQ_LEN:{SEQ_LEN:03}, threshold:{self.threshold:7.6f}, gen_file:{self.gen_file}')
365 |         logger.info(f'Input args:{get_args()}')
366 | 
367 |     def on_epoch_end(self, epoch, logs=None):
368 |         self.cur_epoch = epoch
369 |         print('\n')
370 |         _, _ = self.cal_acc_ex()
371 | 
372 |         if self.trial > 0:
373 |             return 0
374 |         else:
375 |             score_list, val = self.cal_acc()
376 |             total = score_list[1]
377 | 
378 |             self.score_list[epoch] = round(total, 6)
379 |             #threshold_map = {0:0.785, 1:0.77, 2:0.77, 3:0.77, 4:0.78}
380 |             top_cnt =2
381 |             top_score = self._get_top_score(self.fold)[:top_cnt]
382 |             self.threshold = top_score[-1] if len(top_score) == top_cnt else 0
383 |             logger.info(f'The top#{top_cnt} score for max_bin:{get_args().max_bin}, epoch:{epoch}, oof:{oof_prefix}, fold#{self.fold} is:{top_score}, cur_score:{total}, threshold:{self.threshold}')
384 |             if ( round(total,4) > round(self.threshold,4)
385 |                  and (epoch>=3 or self.threshold > 0 or total>0.83 )
386 |                  and total > max(self.max_score, 0.83)
387 |                 ) :
388 |                 #logger.info(f'Try to gen sub file for local score:{total}, and save to:{model_path}')
389 |                 self.gen_file=True
390 |                 grow = max(self.score_list) - self.threshold
391 |                 logger.info(f'Fold:{self.fold}, epoch:{epoch}, MAX:{max(self.score_list):7.6f}/{grow:+6.5f}, threshold:{self.threshold}, score_list:{self.score_list}' )
392 |                 test = self.gen_sub(self.model, f'{self.feature_len}_{total:7.6f}_{epoch}_f{self.fold}')
393 |                 len_raw_val = len(val.loc[val.bin == 0])
394 |                 min_len_ratio = get_args().min_len_ratio
395 |                 oof_file = f'./output/stacking/{oof_prefix}_{self.fold}_{total:7.6f}_{len_raw_val}_{len(val):05}_b{get_args().max_bin}_e{epoch}_{self.batch_id}_m{min_len_ratio:2.1f}_L{SEQ_LEN:03}_XL.h5'
396 |                 self.save_stack_feature(val, test, oof_file)
397 |             else:
398 |                 logger.info(f'Epoch:{epoch}, only gen sub file if the local score >{self.threshold}, current score:{total}, threshold:{self.threshold}, max_score:{self.max_score}')
399 | 
400 |             self.max_score = max(self.max_score, total)
401 | 
402 |             logger.info(f'Epoch#{epoch} END,max_bin:{get_args().max_bin}, oof:{oof_prefix}, max:{self.max_score:6.5f}, score:{score_list}, Fold:{self.fold},')
403 | 
404 |             print('\n')
405 | 
406 |             return round(total, 5)
407 | 
408 |     @staticmethod
409 |     @timed()
410 |     def save_stack_feature(train: pd.DataFrame, test: pd.DataFrame, file_path):
411 |         train.bin = train.bin.astype(int)
412 |         test.bin = test.bin.astype(int)
413 |         train.to_hdf(file_path, 'train', mode='a')
414 |         test.to_hdf(file_path, 'test', mode='a')
415 |         logger.info(f'OOF file save to :{file_path}')
416 |         return train, test
417 | 
418 | 
419 |     @timed()
420 |     #./output/model/1562899782/model_6114_0.65403_2.h5
421 |     def gen_sub(self, model , info='bert_' , partition_len = 5000):
422 | 
423 |         #frac = get_args().frac
424 |         _, _, test = get_train_test_bert()
425 | 
426 |         label2id, id2label = get_label_id()
427 |         input1_col = [col for col in test.columns if str(col).startswith('bert_')]
428 |         input3_col = [col for col in test.columns if str(col).startswith('fea_')]
429 | 
430 |         logger.info(f'Input input1_col:{len(input1_col)}, input3_col:{len(input3_col)}')
431 |         res_list = []
432 |         for sn in tqdm(range(1+ len(test)//partition_len), desc=f'{info}:sub:total:{len(test)},partition_len:{partition_len}'):
433 |             tmp = test.iloc[sn*partition_len: (sn+1)*partition_len]
434 |             #print('\nbegin tmp\n', tmp.iloc[:3,:3].head())
435 |             input1 = tmp.loc[:,input1_col]
436 |             input2 = np.zeros_like(input1)  # .astype(np.int8)
437 |             input3 = np.zeros_like(input1.iloc[:, :1])
438 |             res = model.predict([ input1, input2, input3])
439 |             res = pd.DataFrame(res, columns=label2id.keys(), index=tmp.index)
440 |             #print('\nend tmp\n', res.iloc[:3, :3].head())
441 |             res_list.append(res)
442 | 
443 |         res = pd.concat(res_list)
444 |         res['bin'] = res.index.str[-1].values.astype(int)
445 |         raw_predict = res.copy()
446 | 
447 |         with timed_bolck(f'Try to gen sub file for fold#{self.fold}'):
448 |             #print('\nafter concat\n', res.iloc[:3, :3].head())
449 |             res['id'] = res.index
450 |             res.index.name = 'id'
451 |             # res.to_pickle(f'./output/tmp_sub.pkl')
452 | 
453 | 
454 |             #print('\nend res\n', res.iloc[:3, :3].head())
455 | 
456 | 
457 | 
458 |             res_mean = res.copy(deep=True)
459 |             res_mean['id'] = res_mean.id.apply(lambda val: val.split('_')[0])
460 |             res_mean.index.name = 'index'
461 |             res_select = res_mean.groupby('id')['bin'].agg({'bin_max': 'max'})
462 |             res_select.head()
463 |             res_select = res_select.loc[res_select.bin_max == 3]
464 |             res_mean = res_mean.loc[(res_mean.bin == 0)
465 |                                     | ((res_mean.bin == 1) & (res_mean.id.isin(res_select.index)))
466 |                                     ]
467 |             logger.info(f'Try to cal avg for res_mean:\n{res_mean.bin.value_counts()}')
468 |             res_mean_len = len(res_mean)
469 |             res_mean = res_mean.groupby('id').mean().sort_index()
470 |             del res_mean['bin']
471 | 
472 | 
473 |             res_0 = res.copy(deep=True)
474 |             res_0 = res_0.loc[res_0.bin == 0]
475 |             res_0.index  = res_0.id.apply(lambda val: val.split('_')[0])
476 |             #print('\nres_0\n', res_0.loc[:, ['id', 'bin']].head(3))
477 |             res_0 = res_0.sort_index()
478 |             res_0 = res_0.drop(columns=['id','bin'], axis=1, errors='ignore')
479 | 
480 |             for name, res in [('single',res_0), (f'mean_{res_mean_len}', res_mean)]:
481 |                 res = res.copy()
482 |                 #logger.info(f'{name} Check:\n{res.iloc[:3,:num_classes].sum(axis=1)}')
483 | 
484 |                 res['label1'] = res.iloc[:, :num_classes].idxmax(axis=1)
485 | 
486 |                 # Exclude top#1
487 |                 for index, col in res.label1.items():
488 |                     res.loc[index, col] = np.nan
489 | 
490 |                 res['label2'] = res.iloc[:, :num_classes].idxmax(axis=1)
491 | 
492 | 
493 |                 for col in ['label1','label2']:
494 |                     res[col] = res[col].replace(id2label)
495 | 
496 |                 # info = info.replace('.','')
497 |                 # sub_file = f'./output/sub/v19_{info}_{name}.csv'
498 |                 # res[['label1', 'label2']].to_csv(sub_file)
499 |                 # logger.info(f'Sub file save to :{sub_file}')
500 | 
501 |             #logger.info(f'res_0 Check:\n{res_0.iloc[:3, :num_classes].sum(axis=1)}')
502 | 
503 |         return raw_predict #res.drop(columns=['id','bin'], axis=1, errors='ignore')
504 | 
505 |     @staticmethod
506 |     def _get_top_score(fold):
507 |         from glob import glob
508 |         file_list = sorted(glob(f'./output/stacking/{oof_prefix}_{fold}_*.h5'), reverse=True)
509 |         score_list = [float(file.split('_')[2].replace('.h5', '')) for file in file_list]
510 |         logger.info(f'Score list for fold#{fold} is {score_list}')
511 |         return score_list if score_list else [0]
512 | 
513 | if __name__ == '__main__':
514 |     FUNCTION_MAP = {'train_base': train_base,
515 |                     }
516 | 
517 |     args = get_args()
518 | 
519 |     func = FUNCTION_MAP[args.command]
520 |     func()
521 | 
522 | """
523 | 
524 | nohup python -u ./core/bert.py --frac=0.1  train_base  > test.log 2>&1 &
525 | 
526 | nohup python -u ./core/bert.py --fold=4 --max_bin=2 train_base  > test_4.log 2>&1 &
527 | 
528 | python -u ./core/bert.py --max_bin=2 train_base 
529 | 
530 | nohup python -u ./core/bert.py train_base  > test.log 2>&1 &
531 | 
532 | nohup python -u ./core/bert.py train_base  > extend_bert_mean_bin_1.log 2>&1 &
533 | 
534 | nohup python -u ./core/bert.py boost_train 10 >> boost_1.log 2>&1 &
535 | 
536 | """


--------------------------------------------------------------------------------
/input/readme.txt:
--------------------------------------------------------------------------------
 1 | 如果此文件夹是空,需要运行下列命令生成input文件
 2 | 
 3 | # 爬取数据
 4 | nohup python -u  spider/mi.py bd  > bd.log 2>&1&
 5 | nohup python -u  spider/mi.py wdj  > wdj.log 2>&1&
 6 | nohup python -u  spider/mi.py xm  >  xm.log 2>&1&
 7 | nohup python -u  spider/mi.py 360  >  360.log 2>&1&
 8 | 
 9 | nohup python -u  spider/mi.py tx_pkg  >  tx_pkg.log 2>&1&
10 | nohup python -u  spider/mi.py tx_name  >  tx_name.log 2>&1&
11 | 
12 | nohup python -u  spider/mi.py bdsj  >  bdsj.log 2>&1&
13 | 
14 | 
15 | # 生成input数据
16 | ./bin/clean.sh
17 | 


--------------------------------------------------------------------------------
/notebook/.ipynb_checkpoints/word_analysis_local-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stderr",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "2019-07-04 22:42:21,041 util_log.py[128] INFO Start the program at:LALI2-M-G0MD, 127.0.0.1, with:Load module\n",
 13 |       "2019-07-04 22:42:21,045 util_pandas.py[19] WARNING \"No such keys(s): 'display.height'\"\n"
 14 |      ]
 15 |     },
 16 |     {
 17 |      "name": "stdout",
 18 |      "output_type": "stream",
 19 |      "text": [
 20 |       "yes\n",
 21 |       "/Users/lali2/Documents/workspace_py/xf_tag\n"
 22 |      ]
 23 |     }
 24 |    ],
 25 |    "source": [
 26 |     "import sys\n",
 27 |     "import os\n",
 28 |     "os.chdir('../')\n",
 29 |     "\n",
 30 |     "\n",
 31 |     "import pandas as pd\n",
 32 |     "import numpy as np\n",
 33 |     "\n",
 34 |     "from bokeh.palettes import Category10\n",
 35 |     "\n",
 36 |     "\n",
 37 |     "from tqdm import tqdm\n",
 38 |     "\n",
 39 |     "\n",
 40 |     "from file_cache.utils.util_pandas import *\n",
 41 |     "from file_cache.cache import file_cache\n",
 42 |     "from functools import lru_cache\n",
 43 |     "from glob import glob\n",
 44 |     "\n",
 45 |     "%matplotlib inline\n",
 46 |     "from core.conf import *\n",
 47 |     "!pwd"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 62,
 53 |    "metadata": {},
 54 |    "outputs": [
 55 |     {
 56 |      "name": "stdout",
 57 |      "output_type": "stream",
 58 |      "text": [
 59 |       "(94354, 2)\n"
 60 |      ]
 61 |     },
 62 |     {
 63 |      "data": {
 64 |       "text/html": [
 65 |        "<div>\n",
 66 |        "<style scoped>\n",
 67 |        "    .dataframe tbody tr th:only-of-type {\n",
 68 |        "        vertical-align: middle;\n",
 69 |        "    }\n",
 70 |        "\n",
 71 |        "    .dataframe tbody tr th {\n",
 72 |        "        vertical-align: top;\n",
 73 |        "    }\n",
 74 |        "\n",
 75 |        "    .dataframe thead th {\n",
 76 |        "        text-align: right;\n",
 77 |        "    }\n",
 78 |        "</style>\n",
 79 |        "<table border=\"1\" class=\"dataframe\">\n",
 80 |        "  <thead>\n",
 81 |        "    <tr style=\"text-align: right;\">\n",
 82 |        "      <th></th>\n",
 83 |        "      <th>app_id</th>\n",
 84 |        "      <th>app_des</th>\n",
 85 |        "    </tr>\n",
 86 |        "  </thead>\n",
 87 |        "  <tbody>\n",
 88 |        "    <tr>\n",
 89 |        "      <th>0</th>\n",
 90 |        "      <td>BB29DA6F8167CFC99E0853741C4EB17B</td>\n",
 91 |        "      <td>注意]游戏需要在设备上自己的歌曲注意]音乐赛车是一个音乐改编的赛车游戏，你用你自己的音乐比赛...</td>\n",
 92 |        "    </tr>\n",
 93 |        "    <tr>\n",
 94 |        "      <th>1</th>\n",
 95 |        "      <td>BB2A78EA7AD4945EAF6E38997F6139A3</td>\n",
 96 |        "      <td>定位试衣到家是一款基于地理位置，提供试衣到家专属购物体验的互联网平台。购物流程客户在试衣到家...</td>\n",
 97 |        "    </tr>\n",
 98 |        "    <tr>\n",
 99 |        "      <th>2</th>\n",
100 |        "      <td>BB2B1604CFA079C289FECF927DFBCE89</td>\n",
101 |        "      <td>想念一个人,就说出来。记得要下载安卓锁屏才可正常显示锁屏效果哦~更新内容更稳定、更优质，邀您...</td>\n",
102 |        "    </tr>\n",
103 |        "    <tr>\n",
104 |        "      <th>3</th>\n",
105 |        "      <td>BB2C7BD0B0623644183DAD08A89E1D90</td>\n",
106 |        "      <td>闽通宝手机客户端是基于移动互联网的，以公众出行服务为基础，贯彻绿色出行，低碳生活的理念，为出...</td>\n",
107 |        "    </tr>\n",
108 |        "    <tr>\n",
109 |        "      <th>4</th>\n",
110 |        "      <td>BB2E1A8F56158E483D7461E930E6332F</td>\n",
111 |        "      <td>风靡全球的DIY照片桌面，干净、流畅，启动提速100，瞬间提升手机性能；更是一亿用户的共同选...</td>\n",
112 |        "    </tr>\n",
113 |        "  </tbody>\n",
114 |        "</table>\n",
115 |        "</div>"
116 |       ],
117 |       "text/plain": [
118 |        "                             app_id  \\\n",
119 |        "0  BB29DA6F8167CFC99E0853741C4EB17B   \n",
120 |        "1  BB2A78EA7AD4945EAF6E38997F6139A3   \n",
121 |        "2  BB2B1604CFA079C289FECF927DFBCE89   \n",
122 |        "3  BB2C7BD0B0623644183DAD08A89E1D90   \n",
123 |        "4  BB2E1A8F56158E483D7461E930E6332F   \n",
124 |        "\n",
125 |        "                                             app_des  \n",
126 |        "0  注意]游戏需要在设备上自己的歌曲注意]音乐赛车是一个音乐改编的赛车游戏，你用你自己的音乐比赛...  \n",
127 |        "1  定位试衣到家是一款基于地理位置，提供试衣到家专属购物体验的互联网平台。购物流程客户在试衣到家...  \n",
128 |        "2  想念一个人,就说出来。记得要下载安卓锁屏才可正常显示锁屏效果哦~更新内容更稳定、更优质，邀您...  \n",
129 |        "3  闽通宝手机客户端是基于移动互联网的，以公众出行服务为基础，贯彻绿色出行，低碳生活的理念，为出...  \n",
130 |        "4  风靡全球的DIY照片桌面，干净、流畅，启动提速100，瞬间提升手机性能；更是一亿用户的共同选...  "
131 |       ]
132 |      },
133 |      "execution_count": 62,
134 |      "metadata": {},
135 |      "output_type": "execute_result"
136 |     }
137 |    ],
138 |    "source": [
139 |     "get_word_cnt"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": 63,
145 |    "metadata": {},
146 |    "outputs": [
147 |     {
148 |      "name": "stdout",
149 |      "output_type": "stream",
150 |      "text": [
151 |       "(152, 2)\n"
152 |      ]
153 |     },
154 |     {
155 |      "data": {
156 |       "text/html": [
157 |        "<div>\n",
158 |        "<style scoped>\n",
159 |        "    .dataframe tbody tr th:only-of-type {\n",
160 |        "        vertical-align: middle;\n",
161 |        "    }\n",
162 |        "\n",
163 |        "    .dataframe tbody tr th {\n",
164 |        "        vertical-align: top;\n",
165 |        "    }\n",
166 |        "\n",
167 |        "    .dataframe thead th {\n",
168 |        "        text-align: right;\n",
169 |        "    }\n",
170 |        "</style>\n",
171 |        "<table border=\"1\" class=\"dataframe\">\n",
172 |        "  <thead>\n",
173 |        "    <tr style=\"text-align: right;\">\n",
174 |        "      <th></th>\n",
175 |        "      <th>type_id</th>\n",
176 |        "      <th>type_name</th>\n",
177 |        "    </tr>\n",
178 |        "  </thead>\n",
179 |        "  <tbody>\n",
180 |        "    <tr>\n",
181 |        "      <th>0</th>\n",
182 |        "      <td>1401</td>\n",
183 |        "      <td>便捷生活</td>\n",
184 |        "    </tr>\n",
185 |        "    <tr>\n",
186 |        "      <th>1</th>\n",
187 |        "      <td>1402</td>\n",
188 |        "      <td>游戏</td>\n",
189 |        "    </tr>\n",
190 |        "    <tr>\n",
191 |        "      <th>2</th>\n",
192 |        "      <td>1403</td>\n",
193 |        "      <td>通讯社交</td>\n",
194 |        "    </tr>\n",
195 |        "    <tr>\n",
196 |        "      <th>3</th>\n",
197 |        "      <td>1404</td>\n",
198 |        "      <td>阅读</td>\n",
199 |        "    </tr>\n",
200 |        "    <tr>\n",
201 |        "      <th>4</th>\n",
202 |        "      <td>1405</td>\n",
203 |        "      <td>工作求职</td>\n",
204 |        "    </tr>\n",
205 |        "  </tbody>\n",
206 |        "</table>\n",
207 |        "</div>"
208 |       ],
209 |       "text/plain": [
210 |        "   type_id type_name\n",
211 |        "0     1401      便捷生活\n",
212 |        "1     1402        游戏\n",
213 |        "2     1403      通讯社交\n",
214 |        "3     1404        阅读\n",
215 |        "4     1405      工作求职"
216 |       ]
217 |      },
218 |      "execution_count": 63,
219 |      "metadata": {},
220 |      "output_type": "execute_result"
221 |     }
222 |    ],
223 |    "source": [
224 |     "app_type = pd.read_csv(f'{input_dir}/apptype_id_name.txt', delimiter='\\t', names =['type_id', 'type_name'] )\n",
225 |     "print(app_type.shape)\n",
226 |     "app_type.head()"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 65,
232 |    "metadata": {},
233 |    "outputs": [
234 |     {
235 |      "name": "stdout",
236 |      "output_type": "stream",
237 |      "text": [
238 |       "(30000, 3)\n"
239 |      ]
240 |     },
241 |     {
242 |      "data": {
243 |       "text/html": [
244 |        "<div>\n",
245 |        "<style scoped>\n",
246 |        "    .dataframe tbody tr th:only-of-type {\n",
247 |        "        vertical-align: middle;\n",
248 |        "    }\n",
249 |        "\n",
250 |        "    .dataframe tbody tr th {\n",
251 |        "        vertical-align: top;\n",
252 |        "    }\n",
253 |        "\n",
254 |        "    .dataframe thead th {\n",
255 |        "        text-align: right;\n",
256 |        "    }\n",
257 |        "</style>\n",
258 |        "<table border=\"1\" class=\"dataframe\">\n",
259 |        "  <thead>\n",
260 |        "    <tr style=\"text-align: right;\">\n",
261 |        "      <th></th>\n",
262 |        "      <th>app_id</th>\n",
263 |        "      <th>type_id</th>\n",
264 |        "      <th>app_des</th>\n",
265 |        "    </tr>\n",
266 |        "  </thead>\n",
267 |        "  <tbody>\n",
268 |        "    <tr>\n",
269 |        "      <th>0</th>\n",
270 |        "      <td>00000777CE5B5AA5C1AC94DB8EABE0AC</td>\n",
271 |        "      <td>140203</td>\n",
272 |        "      <td>《游戏王座》使用说明书成分由怪兽卡、魔法卡、陷阱卡合计数千张卡牌以及刺激性、耐久性玩法组成。...</td>\n",
273 |        "    </tr>\n",
274 |        "    <tr>\n",
275 |        "      <th>1</th>\n",
276 |        "      <td>0000DEC36E15C27DBFC64AB8208C4B37</td>\n",
277 |        "      <td>140206</td>\n",
278 |        "      <td>更稳定、更优质，邀您一起。</td>\n",
279 |        "    </tr>\n",
280 |        "    <tr>\n",
281 |        "      <th>2</th>\n",
282 |        "      <td>0001791406307B1D1CE2BC64A830B7C7</td>\n",
283 |        "      <td>142106</td>\n",
284 |        "      <td>《小钱袋》是一款免费网络版记帐软件，适用于个人记帐、家庭记帐、团队记帐，全程帮您安全记录您财...</td>\n",
285 |        "    </tr>\n",
286 |        "    <tr>\n",
287 |        "      <th>3</th>\n",
288 |        "      <td>0002F14825B9CA01653325EEFD69D790</td>\n",
289 |        "      <td>142701</td>\n",
290 |        "      <td>领先的周易服务平台高人汇，汇聚算命大师、风水大师、占卜大师、手相大师、起名大师、算命先生、面...</td>\n",
291 |        "    </tr>\n",
292 |        "    <tr>\n",
293 |        "      <th>4</th>\n",
294 |        "      <td>000419D79365331F89399E5F38A91B05</td>\n",
295 |        "      <td>140901</td>\n",
296 |        "      <td>平行空间是一款极简、免费的黑科技双开助手；您可以在平行空间双开微信微博、陌陌、映客、yy等应...</td>\n",
297 |        "    </tr>\n",
298 |        "  </tbody>\n",
299 |        "</table>\n",
300 |        "</div>"
301 |       ],
302 |       "text/plain": [
303 |        "                             app_id type_id  \\\n",
304 |        "0  00000777CE5B5AA5C1AC94DB8EABE0AC  140203   \n",
305 |        "1  0000DEC36E15C27DBFC64AB8208C4B37  140206   \n",
306 |        "2  0001791406307B1D1CE2BC64A830B7C7  142106   \n",
307 |        "3  0002F14825B9CA01653325EEFD69D790  142701   \n",
308 |        "4  000419D79365331F89399E5F38A91B05  140901   \n",
309 |        "\n",
310 |        "                                             app_des  \n",
311 |        "0  《游戏王座》使用说明书成分由怪兽卡、魔法卡、陷阱卡合计数千张卡牌以及刺激性、耐久性玩法组成。...  \n",
312 |        "1                                      更稳定、更优质，邀您一起。  \n",
313 |        "2  《小钱袋》是一款免费网络版记帐软件，适用于个人记帐、家庭记帐、团队记帐，全程帮您安全记录您财...  \n",
314 |        "3  领先的周易服务平台高人汇，汇聚算命大师、风水大师、占卜大师、手相大师、起名大师、算命先生、面...  \n",
315 |        "4  平行空间是一款极简、免费的黑科技双开助手；您可以在平行空间双开微信微博、陌陌、映客、yy等应...  "
316 |       ]
317 |      },
318 |      "execution_count": 65,
319 |      "metadata": {},
320 |      "output_type": "execute_result"
321 |     }
322 |    ],
323 |    "source": [
324 |     "import csv\n",
325 |     "apptype_train  = pd.read_csv(f'{input_dir}/apptype_train.dat', sep='\\t', \n",
326 |     "                             names =['app_id', 'type_id', 'app_des'] , \n",
327 |     "                             quoting=3\n",
328 |     "                            )\n",
329 |     "print(apptype_train.shape)\n",
330 |     "apptype_train.head()\n",
331 |     "#apptype_train.iloc[2,2]"
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "code",
336 |    "execution_count": 66,
337 |    "metadata": {},
338 |    "outputs": [
339 |     {
340 |      "data": {
341 |       "text/html": [
342 |        "<div>\n",
343 |        "<style scoped>\n",
344 |        "    .dataframe tbody tr th:only-of-type {\n",
345 |        "        vertical-align: middle;\n",
346 |        "    }\n",
347 |        "\n",
348 |        "    .dataframe tbody tr th {\n",
349 |        "        vertical-align: top;\n",
350 |        "    }\n",
351 |        "\n",
352 |        "    .dataframe thead th {\n",
353 |        "        text-align: right;\n",
354 |        "    }\n",
355 |        "</style>\n",
356 |        "<table border=\"1\" class=\"dataframe\">\n",
357 |        "  <thead>\n",
358 |        "    <tr style=\"text-align: right;\">\n",
359 |        "      <th></th>\n",
360 |        "      <th>app_id</th>\n",
361 |        "      <th>type_id</th>\n",
362 |        "      <th>app_des</th>\n",
363 |        "    </tr>\n",
364 |        "  </thead>\n",
365 |        "  <tbody>\n",
366 |        "    <tr>\n",
367 |        "      <th>15945</th>\n",
368 |        "      <td>63959834D8FB9D68C03A75C9BB0906EA</td>\n",
369 |        "      <td>140206</td>\n",
370 |        "      <td>在全新的地图中，你将与戴夫一起面对驾驶飞行器呼啸而来的僵尸军团，肩负起守卫天空之城的重任，同...</td>\n",
371 |        "    </tr>\n",
372 |        "    <tr>\n",
373 |        "      <th>15946</th>\n",
374 |        "      <td>63961F67B88D3D7D877101F80A53E5CD</td>\n",
375 |        "      <td>140901</td>\n",
376 |        "      <td>部分小错误，整体。</td>\n",
377 |        "    </tr>\n",
378 |        "    <tr>\n",
379 |        "      <th>15947</th>\n",
380 |        "      <td>6396C70B6383F0BF243EF69927ACF35F</td>\n",
381 |        "      <td>140901</td>\n",
382 |        "      <td>以太大陆EthMin 以太大陆是一个数字生态世界，帮助个体管理在现实世界中所付出的努力与贡献...</td>\n",
383 |        "    </tr>\n",
384 |        "    <tr>\n",
385 |        "      <th>15948</th>\n",
386 |        "      <td>6396F4C27E1F1D86762B9283D701DB78</td>\n",
387 |        "      <td>142501</td>\n",
388 |        "      <td>帮助准妈妈在分娩前记录宫缩频率和时长，以判断是否达到就医标准。遇到问题，可1对1在线咨询产科...</td>\n",
389 |        "    </tr>\n",
390 |        "    <tr>\n",
391 |        "      <th>15949</th>\n",
392 |        "      <td>63997AB7F3E277BC0CB1D42C3D8360F4</td>\n",
393 |        "      <td>142103</td>\n",
394 |        "      <td>线上线下优势资源整合  不必四处奔波，专属咨询顾问为您服务。  安心快速无抵押  去繁求简，...</td>\n",
395 |        "    </tr>\n",
396 |        "    <tr>\n",
397 |        "      <th>15950</th>\n",
398 |        "      <td>639B889103E0AFD7D23E8C593DB6A6D1</td>\n",
399 |        "      <td>140211</td>\n",
400 |        "      <td>更稳定、更优质，邀您一起。</td>\n",
401 |        "    </tr>\n",
402 |        "    <tr>\n",
403 |        "      <th>15951</th>\n",
404 |        "      <td>639BC48DB51B5806B726B392224F0CA8</td>\n",
405 |        "      <td>142102</td>\n",
406 |        "      <td>金钱永不眠一个股票账户，一笔钱投资美股/港股/英股/A股；全球资产配置的一站式股票平台享受一...</td>\n",
407 |        "    </tr>\n",
408 |        "    <tr>\n",
409 |        "      <th>15952</th>\n",
410 |        "      <td>639C08D6CA2142E0CFD60E64DFB7C326</td>\n",
411 |        "      <td>140901</td>\n",
412 |        "      <td>文字转语音合成免费语音翻译、文本朗读、红包口令、普通话吆喝广告音频合成，一款专业进行文字转语...</td>\n",
413 |        "    </tr>\n",
414 |        "    <tr>\n",
415 |        "      <th>15953</th>\n",
416 |        "      <td>639C9663BB3CABFA048B3A54ED9B8CC9</td>\n",
417 |        "      <td>140401</td>\n",
418 |        "      <td>在微博，官方发布新闻，草根爆料八卦；在微博，大V明星发布动态，粉丝狗仔爆料内幕；在微博，海量...</td>\n",
419 |        "    </tr>\n",
420 |        "    <tr>\n",
421 |        "      <th>15954</th>\n",
422 |        "      <td>639DBC25084151D681F73C1A331B6CBA</td>\n",
423 |        "      <td>140210</td>\n",
424 |        "      <td>比斗地主麻将更简单、比炸金花牛牛更刺激，全球百姓共同推荐锻炼情商智商，大奖话费信手拈来。客官...</td>\n",
425 |        "    </tr>\n",
426 |        "    <tr>\n",
427 |        "      <th>15955</th>\n",
428 |        "      <td>63A0262B4C6D416DC2816B15E716C31D</td>\n",
429 |        "      <td>142103</td>\n",
430 |        "      <td>\"贷款App3分钟下款20000元 通过率高达96.7 贷款借钱急用钱藕丁钱包，帮您闪电周转...</td>\n",
431 |        "    </tr>\n",
432 |        "    <tr>\n",
433 |        "      <th>15956</th>\n",
434 |        "      <td>63A09CBD3873CE47A42BC285705B8431</td>\n",
435 |        "      <td>140901</td>\n",
436 |        "      <td>这是一款优客工场官方发布的移动客户端，涵盖了优客工场支持的所有辅助功能，旨在为你和你的团队提...</td>\n",
437 |        "    </tr>\n",
438 |        "    <tr>\n",
439 |        "      <th>15957</th>\n",
440 |        "      <td>63A0C80FD1F955F8C53AFE69291EC652</td>\n",
441 |        "      <td>140107</td>\n",
442 |        "      <td>天天快递是一款兼有邮递功能的门对门物流活动的手机客户端。不仅具有下单、查询订单、搜索商品、晒...</td>\n",
443 |        "    </tr>\n",
444 |        "    <tr>\n",
445 |        "      <th>15958</th>\n",
446 |        "      <td>63A2BCCA93BB9ED948A2892CFEF4CFCE</td>\n",
447 |        "      <td>140207</td>\n",
448 |        "      <td>喜羊羊快跑是一款跑酷类的游戏。喜羊羊快跑该游戏让你分分钟忆起我们一起看过的喜羊羊以喜羊羊为主...</td>\n",
449 |        "    </tr>\n",
450 |        "    <tr>\n",
451 |        "      <th>15959</th>\n",
452 |        "      <td>63A9AFD1952D1EF58A3743CA5BD76602</td>\n",
453 |        "      <td>140206</td>\n",
454 |        "      <td>住在城市下面鳄鱼Swmpy希望过上人类一样的生活。他非常喜欢干净。可鳄鱼Crnky不满Swm...</td>\n",
455 |        "    </tr>\n",
456 |        "    <tr>\n",
457 |        "      <th>15960</th>\n",
458 |        "      <td>63AA2D5AFFD768100625F947BA030B48</td>\n",
459 |        "      <td>142105</td>\n",
460 |        "      <td>一款信息查询辅助应用软件，服务广大福彩彩民，为福彩中心的各种促销活动提供平台支持。主要功能纸...</td>\n",
461 |        "    </tr>\n",
462 |        "    <tr>\n",
463 |        "      <th>15961</th>\n",
464 |        "      <td>63AB66CD8D27C6B269F4960FB530AA76</td>\n",
465 |        "      <td>142104</td>\n",
466 |        "      <td>国美金融App简介国美金融App是综合性金融服务平台，致力为个人和企业提供定制化财富管理服务...</td>\n",
467 |        "    </tr>\n",
468 |        "    <tr>\n",
469 |        "      <th>15962</th>\n",
470 |        "      <td>63AD9FA5338921C66943390ADA5DCF23</td>\n",
471 |        "      <td>142102</td>\n",
472 |        "      <td>华福证券网上开户2.0系统，采用人脸单向识别技术，优化开户排队机制，可绑定老账户，为您节省宝...</td>\n",
473 |        "    </tr>\n",
474 |        "    <tr>\n",
475 |        "      <th>15963</th>\n",
476 |        "      <td>63AF8C9C9E16F935F8F424533D24FD40</td>\n",
477 |        "      <td>140701</td>\n",
478 |        "      <td>软件简介学霸君1对1学生端是学霸君旗下一款中小学在线1对1辅导应用软件。基于学霸君超过900...</td>\n",
479 |        "    </tr>\n",
480 |        "    <tr>\n",
481 |        "      <th>15964</th>\n",
482 |        "      <td>63B0E4D5A2319B7684D8959D3703B7C4</td>\n",
483 |        "      <td>140404</td>\n",
484 |        "      <td>曹雪芹著的经典文学名著《红楼梦》，特殊的幼儿早教方式，让您的孩子赢在起跑线上，大人和儿童皆宜...</td>\n",
485 |        "    </tr>\n",
486 |        "    <tr>\n",
487 |        "      <th>15965</th>\n",
488 |        "      <td>63B1568A4BA00BB36247F3FE7E63D046</td>\n",
489 |        "      <td>140210</td>\n",
490 |        "      <td>大家都想让自己成为一名优秀的象棋手吧那就赶快行动起来，锻炼自己，让自己成为万人瞩目的象棋大师...</td>\n",
491 |        "    </tr>\n",
492 |        "    <tr>\n",
493 |        "      <th>15966</th>\n",
494 |        "      <td>63B5F7FD3037C633611E405BF76357A6</td>\n",
495 |        "      <td>140901</td>\n",
496 |        "      <td>流量监控软件是一款功能强大的android流量管理程序。它可以根据不同的android系统版...</td>\n",
497 |        "    </tr>\n",
498 |        "    <tr>\n",
499 |        "      <th>15967</th>\n",
500 |        "      <td>63B6A2A65E22AB3BEF8E6E3627058005</td>\n",
501 |        "      <td>140901</td>\n",
502 |        "      <td>三星应用商店GlxyApps是三星官方开发和运营的应用下载平台，拥有数十万款应用、游戏和主题...</td>\n",
503 |        "    </tr>\n",
504 |        "    <tr>\n",
505 |        "      <th>15968</th>\n",
506 |        "      <td>63B8474AE7D557EB69107C1C8D67293B</td>\n",
507 |        "      <td>140212</td>\n",
508 |        "      <td>海陆空立体战争手游《抢滩登陆3D》是由美国DIGITALFUSION公司正版授权，对比经典抢...</td>\n",
509 |        "    </tr>\n",
510 |        "    <tr>\n",
511 |        "      <th>15969</th>\n",
512 |        "      <td>63B89ACCF7E7BB4E8048A4430A61198E</td>\n",
513 |        "      <td>140603</td>\n",
514 |        "      <td>《音乐达人可可摇滚明星MusicIdolCocoRockStr》是一款休闲娱乐游戏。请想象一...</td>\n",
515 |        "    </tr>\n",
516 |        "    <tr>\n",
517 |        "      <th>15970</th>\n",
518 |        "      <td>63BA67638DB01DFD3BE2B89A6DA9C632</td>\n",
519 |        "      <td>140802</td>\n",
520 |        "      <td>乌鲁木齐地铁官方APP，为您提供全新出行方式。地铁购票&amp;mdash;&amp;mdash;乘客可通过...</td>\n",
521 |        "    </tr>\n",
522 |        "    <tr>\n",
523 |        "      <th>15971</th>\n",
524 |        "      <td>63BA6BE8E50C34BC6C08F39487BF3063</td>\n",
525 |        "      <td>140404</td>\n",
526 |        "      <td>产品简介免费追书吧，一款专业免费的电子书阅读软件，爱阅读的小伙伴千万不要错过。全本小说免费阅...</td>\n",
527 |        "    </tr>\n",
528 |        "    <tr>\n",
529 |        "      <th>15972</th>\n",
530 |        "      <td>63BBCDE7DE3AE668D03FAB004E986F4F</td>\n",
531 |        "      <td>140301|140604</td>\n",
532 |        "      <td>人人是一个火爆校园的高颜值网红美女视频交友直播平台，同城交友约会聊天，明星艺人在线直播，72...</td>\n",
533 |        "    </tr>\n",
534 |        "    <tr>\n",
535 |        "      <th>15973</th>\n",
536 |        "      <td>63BD79538A92F05644BE6AD23D87B545</td>\n",
537 |        "      <td>140603</td>\n",
538 |        "      <td>铃声多多手机铃声大全中国移动、中国电信、中国联通3大运营商音乐基地战略合作产品百万铃声，轻松...</td>\n",
539 |        "    </tr>\n",
540 |        "    <tr>\n",
541 |        "      <th>15974</th>\n",
542 |        "      <td>63BF35D999C3B21BB0E783CD56FD60D0</td>\n",
543 |        "      <td>140207</td>\n",
544 |        "      <td>《侍灵》是一款日式暗黑系的横版格斗QTE手游，在动作游戏领域中做出了大胆的创新，通过策略搭配...</td>\n",
545 |        "    </tr>\n",
546 |        "    <tr>\n",
547 |        "      <th>15975</th>\n",
548 |        "      <td>63BFFE1204509BBA9BD9E0E406FB2A38</td>\n",
549 |        "      <td>142103</td>\n",
550 |        "      <td>风云管家，信用卡管家智能还款神器信用卡高端玩家的选择账单全额自动还款2000万实体商户,餐饮...</td>\n",
551 |        "    </tr>\n",
552 |        "    <tr>\n",
553 |        "      <th>15976</th>\n",
554 |        "      <td>63C0F5069E829510104C56911CF571D1</td>\n",
555 |        "      <td>140207</td>\n",
556 |        "      <td>两种汽车的反应机是很有趣的游戏。你的任务是点击需要的图片点击得比对象快。这款游戏很好的训练注...</td>\n",
557 |        "    </tr>\n",
558 |        "    <tr>\n",
559 |        "      <th>15977</th>\n",
560 |        "      <td>63C5FA30A92F3B99258FA6085EE90D91</td>\n",
561 |        "      <td>141201</td>\n",
562 |        "      <td>通运先培后付学员端通运先培后付App学员端是针对学车人士开发的一款功能性APP。通运学员端引...</td>\n",
563 |        "    </tr>\n",
564 |        "    <tr>\n",
565 |        "      <th>15978</th>\n",
566 |        "      <td>63CA760775B2CD3D62995F657568CC8E</td>\n",
567 |        "      <td>141001</td>\n",
568 |        "      <td>宝宝学加减法，是宝宝巴士专为5+宝贝设计的数学软件，让孩子轻松学习加减法，赢在起跑线~难度贴...</td>\n",
569 |        "    </tr>\n",
570 |        "    <tr>\n",
571 |        "      <th>15979</th>\n",
572 |        "      <td>63CB103A546C380870C8A3FA53A14208</td>\n",
573 |        "      <td>140113</td>\n",
574 |        "      <td>长安通APP是一款便民生活服务软件。用户使用长安通APP，可以实现长安通卡NFC充值、查询余...</td>\n",
575 |        "    </tr>\n",
576 |        "  </tbody>\n",
577 |        "</table>\n",
578 |        "</div>"
579 |       ],
580 |       "text/plain": [
581 |        "                                 app_id        type_id  \\\n",
582 |        "15945  63959834D8FB9D68C03A75C9BB0906EA         140206   \n",
583 |        "15946  63961F67B88D3D7D877101F80A53E5CD         140901   \n",
584 |        "15947  6396C70B6383F0BF243EF69927ACF35F         140901   \n",
585 |        "15948  6396F4C27E1F1D86762B9283D701DB78         142501   \n",
586 |        "15949  63997AB7F3E277BC0CB1D42C3D8360F4         142103   \n",
587 |        "15950  639B889103E0AFD7D23E8C593DB6A6D1         140211   \n",
588 |        "15951  639BC48DB51B5806B726B392224F0CA8         142102   \n",
589 |        "15952  639C08D6CA2142E0CFD60E64DFB7C326         140901   \n",
590 |        "15953  639C9663BB3CABFA048B3A54ED9B8CC9         140401   \n",
591 |        "15954  639DBC25084151D681F73C1A331B6CBA         140210   \n",
592 |        "15955  63A0262B4C6D416DC2816B15E716C31D         142103   \n",
593 |        "15956  63A09CBD3873CE47A42BC285705B8431         140901   \n",
594 |        "15957  63A0C80FD1F955F8C53AFE69291EC652         140107   \n",
595 |        "15958  63A2BCCA93BB9ED948A2892CFEF4CFCE         140207   \n",
596 |        "15959  63A9AFD1952D1EF58A3743CA5BD76602         140206   \n",
597 |        "15960  63AA2D5AFFD768100625F947BA030B48         142105   \n",
598 |        "15961  63AB66CD8D27C6B269F4960FB530AA76         142104   \n",
599 |        "15962  63AD9FA5338921C66943390ADA5DCF23         142102   \n",
600 |        "15963  63AF8C9C9E16F935F8F424533D24FD40         140701   \n",
601 |        "15964  63B0E4D5A2319B7684D8959D3703B7C4         140404   \n",
602 |        "15965  63B1568A4BA00BB36247F3FE7E63D046         140210   \n",
603 |        "15966  63B5F7FD3037C633611E405BF76357A6         140901   \n",
604 |        "15967  63B6A2A65E22AB3BEF8E6E3627058005         140901   \n",
605 |        "15968  63B8474AE7D557EB69107C1C8D67293B         140212   \n",
606 |        "15969  63B89ACCF7E7BB4E8048A4430A61198E         140603   \n",
607 |        "15970  63BA67638DB01DFD3BE2B89A6DA9C632         140802   \n",
608 |        "15971  63BA6BE8E50C34BC6C08F39487BF3063         140404   \n",
609 |        "15972  63BBCDE7DE3AE668D03FAB004E986F4F  140301|140604   \n",
610 |        "15973  63BD79538A92F05644BE6AD23D87B545         140603   \n",
611 |        "15974  63BF35D999C3B21BB0E783CD56FD60D0         140207   \n",
612 |        "15975  63BFFE1204509BBA9BD9E0E406FB2A38         142103   \n",
613 |        "15976  63C0F5069E829510104C56911CF571D1         140207   \n",
614 |        "15977  63C5FA30A92F3B99258FA6085EE90D91         141201   \n",
615 |        "15978  63CA760775B2CD3D62995F657568CC8E         141001   \n",
616 |        "15979  63CB103A546C380870C8A3FA53A14208         140113   \n",
617 |        "\n",
618 |        "                                                 app_des  \n",
619 |        "15945  在全新的地图中，你将与戴夫一起面对驾驶飞行器呼啸而来的僵尸军团，肩负起守卫天空之城的重任，同...  \n",
620 |        "15946                                          部分小错误，整体。  \n",
621 |        "15947  以太大陆EthMin 以太大陆是一个数字生态世界，帮助个体管理在现实世界中所付出的努力与贡献...  \n",
622 |        "15948  帮助准妈妈在分娩前记录宫缩频率和时长，以判断是否达到就医标准。遇到问题，可1对1在线咨询产科...  \n",
623 |        "15949  线上线下优势资源整合  不必四处奔波，专属咨询顾问为您服务。  安心快速无抵押  去繁求简，...  \n",
624 |        "15950                                      更稳定、更优质，邀您一起。  \n",
625 |        "15951  金钱永不眠一个股票账户，一笔钱投资美股/港股/英股/A股；全球资产配置的一站式股票平台享受一...  \n",
626 |        "15952  文字转语音合成免费语音翻译、文本朗读、红包口令、普通话吆喝广告音频合成，一款专业进行文字转语...  \n",
627 |        "15953  在微博，官方发布新闻，草根爆料八卦；在微博，大V明星发布动态，粉丝狗仔爆料内幕；在微博，海量...  \n",
628 |        "15954  比斗地主麻将更简单、比炸金花牛牛更刺激，全球百姓共同推荐锻炼情商智商，大奖话费信手拈来。客官...  \n",
629 |        "15955  \"贷款App3分钟下款20000元 通过率高达96.7 贷款借钱急用钱藕丁钱包，帮您闪电周转...  \n",
630 |        "15956  这是一款优客工场官方发布的移动客户端，涵盖了优客工场支持的所有辅助功能，旨在为你和你的团队提...  \n",
631 |        "15957  天天快递是一款兼有邮递功能的门对门物流活动的手机客户端。不仅具有下单、查询订单、搜索商品、晒...  \n",
632 |        "15958  喜羊羊快跑是一款跑酷类的游戏。喜羊羊快跑该游戏让你分分钟忆起我们一起看过的喜羊羊以喜羊羊为主...  \n",
633 |        "15959  住在城市下面鳄鱼Swmpy希望过上人类一样的生活。他非常喜欢干净。可鳄鱼Crnky不满Swm...  \n",
634 |        "15960  一款信息查询辅助应用软件，服务广大福彩彩民，为福彩中心的各种促销活动提供平台支持。主要功能纸...  \n",
635 |        "15961  国美金融App简介国美金融App是综合性金融服务平台，致力为个人和企业提供定制化财富管理服务...  \n",
636 |        "15962  华福证券网上开户2.0系统，采用人脸单向识别技术，优化开户排队机制，可绑定老账户，为您节省宝...  \n",
637 |        "15963  软件简介学霸君1对1学生端是学霸君旗下一款中小学在线1对1辅导应用软件。基于学霸君超过900...  \n",
638 |        "15964  曹雪芹著的经典文学名著《红楼梦》，特殊的幼儿早教方式，让您的孩子赢在起跑线上，大人和儿童皆宜...  \n",
639 |        "15965  大家都想让自己成为一名优秀的象棋手吧那就赶快行动起来，锻炼自己，让自己成为万人瞩目的象棋大师...  \n",
640 |        "15966  流量监控软件是一款功能强大的android流量管理程序。它可以根据不同的android系统版...  \n",
641 |        "15967  三星应用商店GlxyApps是三星官方开发和运营的应用下载平台，拥有数十万款应用、游戏和主题...  \n",
642 |        "15968  海陆空立体战争手游《抢滩登陆3D》是由美国DIGITALFUSION公司正版授权，对比经典抢...  \n",
643 |        "15969  《音乐达人可可摇滚明星MusicIdolCocoRockStr》是一款休闲娱乐游戏。请想象一...  \n",
644 |        "15970  乌鲁木齐地铁官方APP，为您提供全新出行方式。地铁购票&mdash;&mdash;乘客可通过...  \n",
645 |        "15971  产品简介免费追书吧，一款专业免费的电子书阅读软件，爱阅读的小伙伴千万不要错过。全本小说免费阅...  \n",
646 |        "15972  人人是一个火爆校园的高颜值网红美女视频交友直播平台，同城交友约会聊天，明星艺人在线直播，72...  \n",
647 |        "15973  铃声多多手机铃声大全中国移动、中国电信、中国联通3大运营商音乐基地战略合作产品百万铃声，轻松...  \n",
648 |        "15974  《侍灵》是一款日式暗黑系的横版格斗QTE手游，在动作游戏领域中做出了大胆的创新，通过策略搭配...  \n",
649 |        "15975  风云管家，信用卡管家智能还款神器信用卡高端玩家的选择账单全额自动还款2000万实体商户,餐饮...  \n",
650 |        "15976  两种汽车的反应机是很有趣的游戏。你的任务是点击需要的图片点击得比对象快。这款游戏很好的训练注...  \n",
651 |        "15977  通运先培后付学员端通运先培后付App学员端是针对学车人士开发的一款功能性APP。通运学员端引...  \n",
652 |        "15978  宝宝学加减法，是宝宝巴士专为5+宝贝设计的数学软件，让孩子轻松学习加减法，赢在起跑线~难度贴...  \n",
653 |        "15979  长安通APP是一款便民生活服务软件。用户使用长安通APP，可以实现长安通卡NFC充值、查询余...  "
654 |       ]
655 |      },
656 |      "execution_count": 66,
657 |      "metadata": {},
658 |      "output_type": "execute_result"
659 |     }
660 |    ],
661 |    "source": [
662 |     "apptype_train.iloc[15945:15980]"
663 |    ]
664 |   }
665 |  ],
666 |  "metadata": {
667 |   "kernelspec": {
668 |    "display_name": "Python 3",
669 |    "language": "python",
670 |    "name": "python3"
671 |   },
672 |   "language_info": {
673 |    "codemirror_mode": {
674 |     "name": "ipython",
675 |     "version": 3
676 |    },
677 |    "file_extension": ".py",
678 |    "mimetype": "text/x-python",
679 |    "name": "python",
680 |    "nbconvert_exporter": "python",
681 |    "pygments_lexer": "ipython3",
682 |    "version": "3.6.8"
683 |   }
684 |  },
685 |  "nbformat": 4,
686 |  "nbformat_minor": 2
687 | }
688 | 


--------------------------------------------------------------------------------
/notebook/Untitled1.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stderr",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "/apps/dslab/anaconda/python36new/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
 13 |       "  from ._conv import register_converters as _register_converters\n",
 14 |       "Using TensorFlow backend.\n"
 15 |      ]
 16 |     },
 17 |     {
 18 |      "ename": "ImportError",
 19 |      "evalue": "Traceback (most recent call last):\n  File \"/users/hdpsbp/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow.py\", line 58, in <module>\n    from tensorflow.python.pywrap_tensorflow_internal import *\n  File \"/users/hdpsbp/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow_internal.py\", line 28, in <module>\n    _pywrap_tensorflow_internal = swig_import_helper()\n  File \"/users/hdpsbp/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow_internal.py\", line 24, in swig_import_helper\n    _mod = imp.load_module('_pywrap_tensorflow_internal', fp, pathname, description)\n  File \"/apps/dslab/anaconda/python36new/lib/python3.6/imp.py\", line 243, in load_module\n    return load_dynamic(name, filename, file)\n  File \"/apps/dslab/anaconda/python36new/lib/python3.6/imp.py\", line 343, in load_dynamic\n    return _load(spec)\nImportError: libcublas.so.9.0: cannot open shared object file: No such file or directory\n\n\nFailed to load the native TensorFlow runtime.\n\nSee https://www.tensorflow.org/install/errors\n\nfor some common reasons and solutions.  Include the entire stack trace\nabove this error message when asking for help.",
 20 |      "output_type": "error",
 21 |      "traceback": [
 22 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
 23 |       "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
 24 |       "\u001b[0;32m~/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     57\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m   \u001b[0;32mfrom\u001b[0m \u001b[0mtensorflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpython\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpywrap_tensorflow_internal\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     59\u001b[0m   \u001b[0;32mfrom\u001b[0m \u001b[0mtensorflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpython\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpywrap_tensorflow_internal\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0m__version__\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 25 |       "\u001b[0;32m~/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow_internal.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     27\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0m_mod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m     \u001b[0m_pywrap_tensorflow_internal\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mswig_import_helper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     29\u001b[0m     \u001b[0;32mdel\u001b[0m \u001b[0mswig_import_helper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 26 |       "\u001b[0;32m~/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow_internal.py\u001b[0m in \u001b[0;36mswig_import_helper\u001b[0;34m()\u001b[0m\n\u001b[1;32m     23\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 24\u001b[0;31m                 \u001b[0m_mod\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mimp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_module\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'_pywrap_tensorflow_internal'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpathname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdescription\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     25\u001b[0m             \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 27 |       "\u001b[0;32m/apps/dslab/anaconda/python36new/lib/python3.6/imp.py\u001b[0m in \u001b[0;36mload_module\u001b[0;34m(name, file, filename, details)\u001b[0m\n\u001b[1;32m    242\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 243\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mload_dynamic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    244\u001b[0m     \u001b[0;32melif\u001b[0m \u001b[0mtype_\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mPKG_DIRECTORY\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 28 |       "\u001b[0;32m/apps/dslab/anaconda/python36new/lib/python3.6/imp.py\u001b[0m in \u001b[0;36mload_dynamic\u001b[0;34m(name, path, file)\u001b[0m\n\u001b[1;32m    342\u001b[0m             name=name, loader=loader, origin=path)\n\u001b[0;32m--> 343\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0m_load\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    344\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
 29 |       "\u001b[0;31mImportError\u001b[0m: libcublas.so.9.0: cannot open shared object file: No such file or directory",
 30 |       "\nDuring handling of the above exception, another exception occurred:\n",
 31 |       "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
 32 |       "\u001b[0;32m<ipython-input-1-ee283b920485>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mkeras\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayers\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mDense\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mkeras\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodels\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mload_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSequential\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mkeras\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutils\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mCustomObjectScope\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtensorflow\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mkeras\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mkeras\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 33 |       "\u001b[0;32m/apps/dslab/anaconda/python36new/lib/python3.6/site-packages/keras/__init__.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0m__future__\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mabsolute_import\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mactivations\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mapplications\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 34 |       "\u001b[0;32m/apps/dslab/anaconda/python36new/lib/python3.6/site-packages/keras/utils/__init__.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdata_utils\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mio_utils\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mconv_utils\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;31m# Globally-importable utils.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 35 |       "\u001b[0;32m/apps/dslab/anaconda/python36new/lib/python3.6/site-packages/keras/utils/conv_utils.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msix\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmoves\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mbackend\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mK\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
 36 |       "\u001b[0;32m/apps/dslab/anaconda/python36new/lib/python3.6/site-packages/keras/backend/__init__.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     87\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0m_BACKEND\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'tensorflow'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     88\u001b[0m     \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Using TensorFlow backend.\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 89\u001b[0;31m     \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mtensorflow_backend\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     90\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     91\u001b[0m     \u001b[0;31m# Try and load external backend.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 37 |       "\u001b[0;32m/apps/dslab/anaconda/python36new/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0m__future__\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mprint_function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mtensorflow\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      6\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mtensorflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpython\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframework\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mops\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf_ops\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mtensorflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpython\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtraining\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmoving_averages\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 38 |       "\u001b[0;32m~/.local/lib/python3.6/site-packages/tensorflow/__init__.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     22\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     23\u001b[0m \u001b[0;31m# pylint: disable=g-bad-import-order\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 24\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mtensorflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpython\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpywrap_tensorflow\u001b[0m  \u001b[0;31m# pylint: disable=unused-import\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     26\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 39 |       "\u001b[0;32m~/.local/lib/python3.6/site-packages/tensorflow/python/__init__.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     47\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     48\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 49\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mtensorflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpython\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpywrap_tensorflow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     50\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     51\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mtensorflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpython\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtools\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcomponent_api_helper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 40 |       "\u001b[0;32m~/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     72\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msome\u001b[0m \u001b[0mcommon\u001b[0m \u001b[0mreasons\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0msolutions\u001b[0m\u001b[0;34m.\u001b[0m  \u001b[0mInclude\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mentire\u001b[0m \u001b[0mstack\u001b[0m \u001b[0mtrace\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     73\u001b[0m above this error message when asking for help.\"\"\" % traceback.format_exc()\n\u001b[0;32m---> 74\u001b[0;31m   \u001b[0;32mraise\u001b[0m \u001b[0mImportError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     76\u001b[0m \u001b[0;31m# pylint: enable=wildcard-import,g-import-not-at-top,unused-import,line-too-long\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 41 |       "\u001b[0;31mImportError\u001b[0m: Traceback (most recent call last):\n  File \"/users/hdpsbp/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow.py\", line 58, in <module>\n    from tensorflow.python.pywrap_tensorflow_internal import *\n  File \"/users/hdpsbp/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow_internal.py\", line 28, in <module>\n    _pywrap_tensorflow_internal = swig_import_helper()\n  File \"/users/hdpsbp/.local/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow_internal.py\", line 24, in swig_import_helper\n    _mod = imp.load_module('_pywrap_tensorflow_internal', fp, pathname, description)\n  File \"/apps/dslab/anaconda/python36new/lib/python3.6/imp.py\", line 243, in load_module\n    return load_dynamic(name, filename, file)\n  File \"/apps/dslab/anaconda/python36new/lib/python3.6/imp.py\", line 343, in load_dynamic\n    return _load(spec)\nImportError: libcublas.so.9.0: cannot open shared object file: No such file or directory\n\n\nFailed to load the native TensorFlow runtime.\n\nSee https://www.tensorflow.org/install/errors\n\nfor some common reasons and solutions.  Include the entire stack trace\nabove this error message when asking for help."
 42 |      ]
 43 |     }
 44 |    ],
 45 |    "source": [
 46 |     "from keras.layers import Dense\n",
 47 |     "from keras.models import load_model, Sequential\n",
 48 |     "from keras.utils import CustomObjectScope\n",
 49 |     "import tensorflow as tf\n",
 50 |     "import keras as keras\n",
 51 |     "from keras import backend\n",
 52 |     "from keras.models import Model\n",
 53 |     "from keras.optimizers import Adam\n",
 54 |     "from keras import losses \n",
 55 |     "\n",
 56 |     "def get_test_mode():\n",
 57 |     "    model = Sequential()\n",
 58 |     "    from keras.layers import Flatten\n",
 59 |     "    input = keras.Input(shape=(10,10), dtype='float', name='raw_image_left')\n",
 60 |     "    test = keras.layers.Dense(units=64, activation='relu', name='middle', )(input)\n",
 61 |     "    test = keras.layers.Flatten()(test)\n",
 62 |     "    #Comments this line will be correct, which is only  incorrect in CPU\n",
 63 |     "    test = keras.layers.Lambda(lambda x: keras.backend.sqrt(x), name='error_point')(test)\n",
 64 |     "    output = keras.layers.Dense(units=1, activation='sigmoid', name='output', )(test)\n",
 65 |     "    model = Model(input, output)\n",
 66 |     "    return model\n",
 67 |     "\n",
 68 |     "model = get_test_mode()\n",
 69 |     "model.summary()\n",
 70 |     "\n",
 71 |     "opt = Adam()\n",
 72 |     "model.compile(optimizer=opt, loss=losses.binary_crossentropy )\n",
 73 |     "\n",
 74 |     "import numpy as np\n",
 75 |     "X = np.random.randint(255, size=(100, 10, 10))\n",
 76 |     "y = np.random.randint(2, size=(100))\n",
 77 |     " \n",
 78 |     "    \n",
 79 |     "model.fit(X, y, epochs=3, batch_size=10)"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": 2,
 85 |    "metadata": {},
 86 |    "outputs": [
 87 |     {
 88 |      "name": "stdout",
 89 |      "output_type": "stream",
 90 |      "text": [
 91 |       "ai-prd-05\r\n"
 92 |      ]
 93 |     }
 94 |    ],
 95 |    "source": [
 96 |     "!hostname"
 97 |    ]
 98 |   }
 99 |  ],
100 |  "metadata": {
101 |   "kernelspec": {
102 |    "display_name": "Python [conda env:python36new]",
103 |    "language": "python",
104 |    "name": "conda-env-python36new-py"
105 |   },
106 |   "language_info": {
107 |    "codemirror_mode": {
108 |     "name": "ipython",
109 |     "version": 3
110 |    },
111 |    "file_extension": ".py",
112 |    "mimetype": "text/x-python",
113 |    "name": "python",
114 |    "nbconvert_exporter": "python",
115 |    "pygments_lexer": "ipython3",
116 |    "version": "3.6.7"
117 |   }
118 |  },
119 |  "nbformat": 4,
120 |  "nbformat_minor": 2
121 | }
122 | 


--------------------------------------------------------------------------------
/notebook/Untitled2.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 1,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": []
 9 |   },
10 |   {
11 |    "cell_type": "code",
12 |    "execution_count": 2,
13 |    "metadata": {},
14 |    "outputs": [],
15 |    "source": [
16 |     "\n"
17 |    ]
18 |   },
19 |   {
20 |    "cell_type": "code",
21 |    "execution_count": 5,
22 |    "metadata": {},
23 |    "outputs": [
24 |     {
25 |      "name": "stdout",
26 |      "output_type": "stream",
27 |      "text": [
28 |       "./cache/get_oof_version=v75,4,0=.pickle\n",
29 |       "./cache/get_oof_version=v36,4,0=.pickle\n",
30 |       "./cache/get_oof_version=v72,4,0=.pickle\n",
31 |       "./cache/get_oof_version=v74,4,0=.pickle\n",
32 |       "./cache/get_oof_version=v43,4,0=.pickle\n",
33 |       "./cache/get_oof_version=v73,4,0=.pickle\n"
34 |      ]
35 |     }
36 |    ],
37 |    "source": []
38 |   },
39 |   {
40 |    "cell_type": "code",
41 |    "execution_count": 7,
42 |    "metadata": {},
43 |    "outputs": [
44 |     {
45 |      "data": {
46 |       "text/plain": [
47 |        "(130000, 127)"
48 |       ]
49 |      },
50 |      "execution_count": 7,
51 |      "metadata": {},
52 |      "output_type": "execute_result"
53 |     }
54 |    ],
55 |    "source": [
56 |     "pd.read_hdf('./cache/get_oof_version=v75,4,0=.h5').shape"
57 |    ]
58 |   }
59 |  ],
60 |  "metadata": {
61 |   "kernelspec": {
62 |    "display_name": "Python [default]",
63 |    "language": "python",
64 |    "name": "python3"
65 |   },
66 |   "language_info": {
67 |    "codemirror_mode": {
68 |     "name": "ipython",
69 |     "version": 3
70 |    },
71 |    "file_extension": ".py",
72 |    "mimetype": "text/x-python",
73 |    "name": "python",
74 |    "nbconvert_exporter": "python",
75 |    "pygments_lexer": "ipython3",
76 |    "version": "3.6.7"
77 |   }
78 |  },
79 |  "nbformat": 4,
80 |  "nbformat_minor": 2
81 | }
82 | 


--------------------------------------------------------------------------------
/notebook/train_v2.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "_uuid": "887cc6be0765929d5e382a830efdb902bd9ce99b"
  7 |    },
  8 |    "source": [
  9 |     "# Baidu Emotion.\n",
 10 |     "\n",
 11 |     "### Let's start exploring the dataset"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 1,
 17 |    "metadata": {},
 18 |    "outputs": [
 19 |     {
 20 |      "name": "stderr",
 21 |      "output_type": "stream",
 22 |      "text": [
 23 |       "2019-01-02 20:14:52,472 util_log.py[61] DEBUG Start the program at:LALI2-M-G0MD, 127.0.0.1, with:Load module\n"
 24 |      ]
 25 |     }
 26 |    ],
 27 |    "source": [
 28 |     "#Adjust the working folder\n",
 29 |     "import sys\n",
 30 |     "import os\n",
 31 |     "#print(globals())\n",
 32 |     "file_folder = globals()['_dh'][0]\n",
 33 |     "wk_dir = os.path.dirname(file_folder)\n",
 34 |     "os.chdir(wk_dir)\n",
 35 |     "\n",
 36 |     "import pandas as pd\n",
 37 |     "\n",
 38 |     "from code_felix.core.config import *\n",
 39 |     "from code_felix.core.feature import *\n",
 40 |     "from file_cache.utils.util_log import *\n",
 41 |     "\n",
 42 |     "import matplotlib.pyplot as plt\n",
 43 |     "\n",
 44 |     "\n",
 45 |     "plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签\n",
 46 |     "plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 2,
 52 |    "metadata": {
 53 |     "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
 54 |     "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
 55 |     "collapsed": true
 56 |    },
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "import numpy as np # linear algebra\n",
 60 |     "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
 61 |     "\n",
 62 |     "\n",
 63 |     "#Loading the dataset\n",
 64 |     "dataset = pd.read_csv(train_file, encoding='gb18030', delimiter='\\t', header=None)"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 3,
 70 |    "metadata": {
 71 |     "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
 72 |     "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
 73 |    },
 74 |    "outputs": [
 75 |     {
 76 |      "data": {
 77 |       "text/html": [
 78 |        "<div>\n",
 79 |        "<style scoped>\n",
 80 |        "    .dataframe tbody tr th:only-of-type {\n",
 81 |        "        vertical-align: middle;\n",
 82 |        "    }\n",
 83 |        "\n",
 84 |        "    .dataframe tbody tr th {\n",
 85 |        "        vertical-align: top;\n",
 86 |        "    }\n",
 87 |        "\n",
 88 |        "    .dataframe thead th {\n",
 89 |        "        text-align: right;\n",
 90 |        "    }\n",
 91 |        "</style>\n",
 92 |        "<table border=\"1\" class=\"dataframe\">\n",
 93 |        "  <thead>\n",
 94 |        "    <tr style=\"text-align: right;\">\n",
 95 |        "      <th></th>\n",
 96 |        "      <th>0</th>\n",
 97 |        "      <th>1</th>\n",
 98 |        "      <th>2</th>\n",
 99 |        "      <th>3</th>\n",
100 |        "    </tr>\n",
101 |        "  </thead>\n",
102 |        "  <tbody>\n",
103 |        "    <tr>\n",
104 |        "      <th>0</th>\n",
105 |        "      <td>1</td>\n",
106 |        "      <td>食品餐饮</td>\n",
107 |        "      <td>买这套系统本来是用来做我们公司的公众号第三方平台代运营的，没想到还有app，而且每个都很方便...</td>\n",
108 |        "      <td>2</td>\n",
109 |        "    </tr>\n",
110 |        "    <tr>\n",
111 |        "      <th>1</th>\n",
112 |        "      <td>2</td>\n",
113 |        "      <td>食品餐饮</td>\n",
114 |        "      <td>烤鸭还是不错的，别的菜没什么特殊的</td>\n",
115 |        "      <td>1</td>\n",
116 |        "    </tr>\n",
117 |        "    <tr>\n",
118 |        "      <th>2</th>\n",
119 |        "      <td>3</td>\n",
120 |        "      <td>食品餐饮</td>\n",
121 |        "      <td>使用说明看不懂！不会用，很多操作没详细标明！</td>\n",
122 |        "      <td>0</td>\n",
123 |        "    </tr>\n",
124 |        "  </tbody>\n",
125 |        "</table>\n",
126 |        "</div>"
127 |       ],
128 |       "text/plain": [
129 |        "   0     1                                                  2  3\n",
130 |        "0  1  食品餐饮  买这套系统本来是用来做我们公司的公众号第三方平台代运营的，没想到还有app，而且每个都很方便...  2\n",
131 |        "1  2  食品餐饮                                  烤鸭还是不错的，别的菜没什么特殊的  1\n",
132 |        "2  3  食品餐饮                             使用说明看不懂！不会用，很多操作没详细标明！  0"
133 |       ]
134 |      },
135 |      "execution_count": 3,
136 |      "metadata": {},
137 |      "output_type": "execute_result"
138 |     }
139 |    ],
140 |    "source": [
141 |     "# Prin some samples\n",
142 |     "dataset.head(3)"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "markdown",
147 |    "metadata": {
148 |     "_uuid": "b2e3ea8267e3bac72a30ce7803413c684bc8b9a4"
149 |    },
150 |    "source": [
151 |     "## Preparing data for model training\n",
152 |     "### Tokenization\n",
153 |     "Since the data is already tokenized and lowercased, we just need to split the words\n"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": 4,
159 |    "metadata": {
160 |     "_uuid": "cb739f05cfb4b5d74702cdef1ea5a130c0d90132"
161 |    },
162 |    "outputs": [
163 |     {
164 |      "name": "stderr",
165 |      "output_type": "stream",
166 |      "text": [
167 |       "Building prefix dict from the default dictionary ...\n",
168 |       "2019-01-02 20:14:52,758 __init__.py[111] DEBUG Building prefix dict from the default dictionary ...\n",
169 |       "Loading model from cache /var/folders/d2/vq91lnt11m13m84s18dzdm8r0000gn/T/jieba.cache\n",
170 |       "2019-01-02 20:14:52,766 __init__.py[131] DEBUG Loading model from cache /var/folders/d2/vq91lnt11m13m84s18dzdm8r0000gn/T/jieba.cache\n",
171 |       "Loading model cost 0.715 seconds.\n",
172 |       "2019-01-02 20:14:53,479 __init__.py[163] DEBUG Loading model cost 0.715 seconds.\n",
173 |       "Prefix dict has been built succesfully.\n",
174 |       "2019-01-02 20:14:53,483 __init__.py[164] DEBUG Prefix dict has been built succesfully.\n"
175 |      ]
176 |     }
177 |    ],
178 |    "source": [
179 |     "input_sentences = [list(jieba.cut(str(text), cut_all=False))  for text in dataset.iloc[:, 2].values.tolist()]\n",
180 |     "labels = dataset.iloc[:, 3].values.tolist()"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": 5,
186 |    "metadata": {},
187 |    "outputs": [
188 |     {
189 |      "data": {
190 |       "text/plain": [
191 |        "[2, 1, 0, 0, 1, 2, 2, 2, 2, 2]"
192 |       ]
193 |      },
194 |      "execution_count": 5,
195 |      "metadata": {},
196 |      "output_type": "execute_result"
197 |     }
198 |    ],
199 |    "source": [
200 |     "labels[:10]"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "markdown",
205 |    "metadata": {
206 |     "_uuid": "1a7c2e03d7e839b2872785157153e0bfef82b0bd"
207 |    },
208 |    "source": [
209 |     "### Creating Vocabulary (word index)"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": 14,
215 |    "metadata": {},
216 |    "outputs": [
217 |     {
218 |      "name": "stderr",
219 |      "output_type": "stream",
220 |      "text": [
221 |       "2019-01-02 20:16:43,159 util_log.py[41] INFO get_word_id_vec begin with(1 paras) :['02'], []\n",
222 |       "2019-01-02 20:16:43,175 cache.py[29] DEBUG try to read cache from file:./cache/get_word_id_vec=02=.h5, (h5, key:['/df_0'])\n",
223 |       "2019-01-02 20:16:43,221 util_log.py[49] INFO get_word_id_vec cost    0.06 sec:(1 paras)(['02'], []), return:DataFrame, end \n",
224 |       "2019-01-02 20:16:47,027 <ipython-input-14-102863bb139b>[3] DEBUG Word length:42014\n"
225 |      ]
226 |     }
227 |    ],
228 |    "source": [
229 |     "word_id_vec =  get_word_id_vec('02')\n",
230 |     "word2id = dict( word_id_vec.apply(lambda row: (row['word'], row['id']), axis=1).values )\n",
231 |     "logger.debug(f'Word length:{len(word2id)}')\n",
232 |     "\n",
233 |     "\n",
234 |     "embedding_weights = word_id_vec.iloc[:, -vector_size:].fillna(0).values\n",
235 |     "#embedding_weights[10]"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": 7,
241 |    "metadata": {
242 |     "_uuid": "f60be75ae0d5cbfc36eeba0243407b66741bb42e"
243 |    },
244 |    "outputs": [
245 |     {
246 |      "name": "stderr",
247 |      "output_type": "stream",
248 |      "text": [
249 |       "2019-01-02 20:14:58,137 <ipython-input-7-ae333458653d>[18] DEBUG max_words=38\n",
250 |       "2019-01-02 20:14:58,141 <ipython-input-7-ae333458653d>[18] DEBUG max_words=73\n",
251 |       "2019-01-02 20:14:58,143 <ipython-input-7-ae333458653d>[18] DEBUG max_words=77\n",
252 |       "2019-01-02 20:14:58,146 <ipython-input-7-ae333458653d>[18] DEBUG max_words=88\n",
253 |       "2019-01-02 20:14:58,148 <ipython-input-7-ae333458653d>[18] DEBUG max_words=93\n",
254 |       "2019-01-02 20:14:58,150 <ipython-input-7-ae333458653d>[18] DEBUG max_words=110\n",
255 |       "2019-01-02 20:14:58,152 <ipython-input-7-ae333458653d>[18] DEBUG max_words=151\n",
256 |       "2019-01-02 20:14:58,154 <ipython-input-7-ae333458653d>[18] DEBUG max_words=171\n",
257 |       "2019-01-02 20:14:58,156 <ipython-input-7-ae333458653d>[18] DEBUG max_words=177\n",
258 |       "2019-01-02 20:14:58,158 <ipython-input-7-ae333458653d>[18] DEBUG max_words=196\n"
259 |      ]
260 |     },
261 |     {
262 |      "data": {
263 |       "text/plain": [
264 |        "{0: 0, 1: 1, 2: 2}"
265 |       ]
266 |      },
267 |      "execution_count": 7,
268 |      "metadata": {},
269 |      "output_type": "execute_result"
270 |     }
271 |    ],
272 |    "source": [
273 |     "# Initialize word2id and label2id dictionaries that will be used to encode words and labels\n",
274 |     "\n",
275 |     "label2id = dict()\n",
276 |     "\n",
277 |     "max_words = 0 # maximum number of words in a sentence\n",
278 |     "\n",
279 |     "# Construction of word2id dict\n",
280 |     "for sentence in input_sentences:\n",
281 |     "#     for word in sentence:\n",
282 |     "#         # Add words to word2id dict if not exist\n",
283 |     "#         if word not in word2id:\n",
284 |     "#             word2id[word] = len(word2id)\n",
285 |     "#     # If length of the sentence is greater than max_words, update max_words\n",
286 |     "#     sentence = list(sentence)\n",
287 |     "#     logger.debug(f'{len(sentence)} : {sentence}')\n",
288 |     "    if len(sentence) > max_words:\n",
289 |     "        max_words = len(sentence)\n",
290 |     "        logger.debug(f'max_words={max_words}')\n",
291 |     "    \n",
292 |     "# Construction of label2id and id2label dicts\n",
293 |     "label2id = {l: i for i, l in enumerate(set(labels))}\n",
294 |     "id2label = {v: k for k, v in label2id.items()}\n",
295 |     "id2label"
296 |    ]
297 |   },
298 |   {
299 |    "cell_type": "markdown",
300 |    "metadata": {
301 |     "_uuid": "d984e58ffd25530ac4c05ce623d9237a35cf903d"
302 |    },
303 |    "source": [
304 |     "### Encoding samples with corresponing integer values"
305 |    ]
306 |   },
307 |   {
308 |    "cell_type": "code",
309 |    "execution_count": 8,
310 |    "metadata": {
311 |     "_uuid": "378ef884a6ebb19b02a70082bc6c854c51780af3"
312 |    },
313 |    "outputs": [
314 |     {
315 |      "name": "stderr",
316 |      "output_type": "stream",
317 |      "text": [
318 |       "/Users/lali2/dev/python/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
319 |       "  from ._conv import register_converters as _register_converters\n",
320 |       "Using TensorFlow backend.\n",
321 |       "/Users/lali2/dev/python/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n",
322 |       "  return f(*args, **kwds)\n"
323 |      ]
324 |     },
325 |     {
326 |      "name": "stdout",
327 |      "output_type": "stream",
328 |      "text": [
329 |       "Shape of X: (2000, 196)\n",
330 |       "Shape of Y: (2000, 3)\n"
331 |      ]
332 |     }
333 |    ],
334 |    "source": [
335 |     "import keras\n",
336 |     "\n",
337 |     "# Encode input words and labels\n",
338 |     "X = [[word2id[word] for word in sentence] for sentence in input_sentences]\n",
339 |     "Y = [label2id[label] for label in labels]\n",
340 |     "\n",
341 |     "# Apply Padding to X\n",
342 |     "from keras.preprocessing.sequence import pad_sequences\n",
343 |     "X = pad_sequences(X, max_words)\n",
344 |     "\n",
345 |     "# Convert Y to numpy array\n",
346 |     "Y = keras.utils.to_categorical(Y, num_classes=len(label2id))\n",
347 |     "\n",
348 |     "# Print shapes\n",
349 |     "print(\"Shape of X: {}\".format(X.shape))\n",
350 |     "print(\"Shape of Y: {}\".format(Y.shape))\n"
351 |    ]
352 |   },
353 |   {
354 |    "cell_type": "markdown",
355 |    "metadata": {
356 |     "_uuid": "4bccaa5b813414ad7929522d4d0f74dbb9c4c5af"
357 |    },
358 |    "source": [
359 |     "## Build LSTM model with attention "
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "code",
364 |    "execution_count": 9,
365 |    "metadata": {
366 |     "collapsed": true
367 |    },
368 |    "outputs": [],
369 |    "source": [
370 |     "keras.layers.Embedding?"
371 |    ]
372 |   },
373 |   {
374 |    "cell_type": "code",
375 |    "execution_count": 21,
376 |    "metadata": {
377 |     "_uuid": "4c1b5fc7613a0fe5a8067135e2de07e0765f1b78"
378 |    },
379 |    "outputs": [
380 |     {
381 |      "name": "stdout",
382 |      "output_type": "stream",
383 |      "text": [
384 |       "__________________________________________________________________________________________________\n",
385 |       "Layer (type)                    Output Shape         Param #     Connected to                     \n",
386 |       "==================================================================================================\n",
387 |       "input_7 (InputLayer)            (None, 196)          0                                            \n",
388 |       "__________________________________________________________________________________________________\n",
389 |       "embedding_7 (Embedding)         (None, 196, 200)     8402800     input_7[0][0]                    \n",
390 |       "__________________________________________________________________________________________________\n",
391 |       "dropout_3 (Dropout)             (None, 196, 200)     0           embedding_7[0][0]                \n",
392 |       "__________________________________________________________________________________________________\n",
393 |       "bidirectional_2 (Bidirectional) (None, 196, 200)     240800      dropout_3[0][0]                  \n",
394 |       "__________________________________________________________________________________________________\n",
395 |       "dropout_4 (Dropout)             (None, 196, 200)     0           bidirectional_2[0][0]            \n",
396 |       "__________________________________________________________________________________________________\n",
397 |       "time_distributed_2 (TimeDistrib (None, 196, 1)       201         dropout_4[0][0]                  \n",
398 |       "__________________________________________________________________________________________________\n",
399 |       "reshape_2 (Reshape)             (None, 196)          0           time_distributed_2[0][0]         \n",
400 |       "__________________________________________________________________________________________________\n",
401 |       "attention_vec (Activation)      (None, 196)          0           reshape_2[0][0]                  \n",
402 |       "__________________________________________________________________________________________________\n",
403 |       "dot_2 (Dot)                     (None, 200)          0           dropout_4[0][0]                  \n",
404 |       "                                                                 attention_vec[0][0]              \n",
405 |       "__________________________________________________________________________________________________\n",
406 |       "dense_5 (Dense)                 (None, 100)          20100       dot_2[0][0]                      \n",
407 |       "__________________________________________________________________________________________________\n",
408 |       "dense_6 (Dense)                 (None, 3)            303         dense_5[0][0]                    \n",
409 |       "==================================================================================================\n",
410 |       "Total params: 8,664,204\n",
411 |       "Trainable params: 8,664,204\n",
412 |       "Non-trainable params: 0\n",
413 |       "__________________________________________________________________________________________________\n"
414 |      ]
415 |     }
416 |    ],
417 |    "source": [
418 |     "embedding_dim = 100 # The dimension of word embeddings\n",
419 |     "\n",
420 |     "# Define input tensor\n",
421 |     "sequence_input = keras.Input(shape=(max_words,), dtype='int32')\n",
422 |     "\n",
423 |     "# Word embedding layer\n",
424 |     "embedded_inputs =keras.layers.Embedding(len(word2id) ,\n",
425 |     "                                        vector_size ,\n",
426 |     "                                        input_length=max_words ,\n",
427 |     "                                        weights = [embedding_weights] ,\n",
428 |     "                                       )(sequence_input)\n",
429 |     "\n",
430 |     "# Apply dropout to prevent overfitting\n",
431 |     "embedded_inputs = keras.layers.Dropout(0.2)(embedded_inputs)\n",
432 |     "\n",
433 |     "# Apply Bidirectional LSTM over embedded inputs\n",
434 |     "lstm_outs = keras.layers.wrappers.Bidirectional(\n",
435 |     "    keras.layers.LSTM(embedding_dim, return_sequences=True)\n",
436 |     ")(embedded_inputs)\n",
437 |     "\n",
438 |     "# Apply dropout to LSTM outputs to prevent overfitting\n",
439 |     "lstm_outs = keras.layers.Dropout(0.2)(lstm_outs)\n",
440 |     "\n",
441 |     "# Attention Mechanism - Generate attention vectors\n",
442 |     "input_dim = int(lstm_outs.shape[2])\n",
443 |     "permuted_inputs = keras.layers.Permute((2, 1))(lstm_outs)\n",
444 |     "attention_vector = keras.layers.TimeDistributed(keras.layers.Dense(1))(lstm_outs)\n",
445 |     "attention_vector = keras.layers.Reshape((max_words,))(attention_vector)\n",
446 |     "attention_vector = keras.layers.Activation('softmax', name='attention_vec')(attention_vector)\n",
447 |     "attention_output = keras.layers.Dot(axes=1)([lstm_outs, attention_vector])\n",
448 |     "\n",
449 |     "# Last layer: fully connected with softmax activation\n",
450 |     "fc = keras.layers.Dense(embedding_dim, activation='relu')(attention_output)\n",
451 |     "output = keras.layers.Dense(len(label2id), activation='softmax')(fc)\n",
452 |     "\n",
453 |     "# Finally building model\n",
454 |     "model = keras.Model(inputs=[sequence_input], outputs=output)\n",
455 |     "model.compile(loss=\"categorical_crossentropy\", metrics=[\"accuracy\"], optimizer='adam')\n",
456 |     "\n",
457 |     "# Print model summary\n",
458 |     "model.summary()\n",
459 |     "\n",
460 |     "\n",
461 |     "\n"
462 |    ]
463 |   },
464 |   {
465 |    "cell_type": "markdown",
466 |    "metadata": {
467 |     "_uuid": "ad67135dcd65940d864521309066ff9fb5b7c9a2"
468 |    },
469 |    "source": [
470 |     "## Training the model"
471 |    ]
472 |   },
473 |   {
474 |    "cell_type": "code",
475 |    "execution_count": 22,
476 |    "metadata": {
477 |     "_uuid": "d9441f027a63ad3c8b288c6823e073b142c33b34"
478 |    },
479 |    "outputs": [
480 |     {
481 |      "name": "stdout",
482 |      "output_type": "stream",
483 |      "text": [
484 |       "Train on 1800 samples, validate on 200 samples\n",
485 |       "Epoch 1/2\n",
486 |       "1800/1800 [==============================] - 18s 10ms/step - loss: 0.8834 - acc: 0.6761 - val_loss: 0.8265 - val_acc: 0.6950\n",
487 |       "Epoch 2/2\n",
488 |       "1800/1800 [==============================] - 16s 9ms/step - loss: 0.8213 - acc: 0.6778 - val_loss: 0.6513 - val_acc: 0.7150\n"
489 |      ]
490 |     },
491 |     {
492 |      "data": {
493 |       "text/plain": [
494 |        "<keras.callbacks.History at 0x12b236a58>"
495 |       ]
496 |      },
497 |      "execution_count": 22,
498 |      "metadata": {},
499 |      "output_type": "execute_result"
500 |     }
501 |    ],
502 |    "source": [
503 |     "# Train model 10 iterations\n",
504 |     "model.fit(X, Y, epochs=2, batch_size=64, validation_split=0.1, shuffle=True)"
505 |    ]
506 |   },
507 |   {
508 |    "cell_type": "markdown",
509 |    "metadata": {
510 |     "_uuid": "b37aca89d92439a9777bb7634dcd12aef2162771"
511 |    },
512 |    "source": [
513 |     "The accuracy on validation data about 93%. Very good result for a classification task with six-classes.\n",
514 |     "The performance can be further improved by training the model a few more iteration.\n",
515 |     "\n",
516 |     "**Let's look closer to model predictions and attentions**"
517 |    ]
518 |   },
519 |   {
520 |    "cell_type": "code",
521 |    "execution_count": null,
522 |    "metadata": {
523 |     "_uuid": "6a5e94835a8aa88b8609a95e80add37fc1ffd4d7",
524 |     "collapsed": true
525 |    },
526 |    "outputs": [],
527 |    "source": [
528 |     "# Re-create the model to get attention vectors as well as label prediction\n",
529 |     "model_with_attentions = keras.Model(inputs=model.input,\n",
530 |     "                                    outputs=[model.output, \n",
531 |     "                                             model.get_layer('attention_vec').output])"
532 |    ]
533 |   },
534 |   {
535 |    "cell_type": "code",
536 |    "execution_count": null,
537 |    "metadata": {
538 |     "_uuid": "f7f1a8770b09a221787e38376392ba977172c215",
539 |     "collapsed": true,
540 |     "scrolled": true
541 |    },
542 |    "outputs": [],
543 |    "source": [
544 |     "import random\n",
545 |     "import math\n",
546 |     "\n",
547 |     "# Select random samples to illustrate\n",
548 |     "sample_text = random.choice(dataset[\"text\"].values.tolist())\n",
549 |     "\n",
550 |     "# Encode samples\n",
551 |     "tokenized_sample = sample_text.split(\" \")\n",
552 |     "encoded_samples = [[word2id[word] for word in tokenized_sample]]\n",
553 |     "\n",
554 |     "# Padding\n",
555 |     "encoded_samples = keras.preprocessing.sequence.pad_sequences(encoded_samples, maxlen=max_words)\n",
556 |     "\n",
557 |     "# Make predictions\n",
558 |     "label_probs, attentions = model_with_attentions.predict(encoded_samples)\n",
559 |     "label_probs = {id2label[_id]: prob for (label, _id), prob in zip(label2id.items(),label_probs[0])}\n",
560 |     "\n",
561 |     "# Get word attentions using attenion vector\n",
562 |     "token_attention_dic = {}\n",
563 |     "max_score = 0.0\n",
564 |     "min_score = 0.0\n",
565 |     "for token, attention_score in zip(tokenized_sample, attentions[0][-len(tokenized_sample):]):\n",
566 |     "    token_attention_dic[token] = math.sqrt(attention_score)\n",
567 |     "\n",
568 |     "\n",
569 |     "# VISUALIZATION\n",
570 |     "import matplotlib.pyplot as plt; plt.rcdefaults()\n",
571 |     "import numpy as np\n",
572 |     "import matplotlib.pyplot as plt\n",
573 |     "from IPython.core.display import display, HTML\n",
574 |     "\n",
575 |     "def rgb_to_hex(rgb):\n",
576 |     "    return '#%02x%02x%02x' % rgb\n",
577 |     "    \n",
578 |     "def attention2color(attention_score):\n",
579 |     "    r = 255 - int(attention_score * 255)\n",
580 |     "    color = rgb_to_hex((255, r, r))\n",
581 |     "    return str(color)\n",
582 |     "    \n",
583 |     "# Build HTML String to viualize attentions\n",
584 |     "html_text = \"<hr><p style='font-size: large'><b>Text:  </b>\"\n",
585 |     "for token, attention in token_attention_dic.items():\n",
586 |     "    html_text += \"<span style='background-color:{};'>{} <span> \".format(attention2color(attention),\n",
587 |     "                                                                        token)\n",
588 |     "html_text += \"</p>\"\n",
589 |     "# Display text enriched with attention scores \n",
590 |     "display(HTML(html_text))\n",
591 |     "\n",
592 |     "# PLOT EMOTION SCORES\n",
593 |     "emotions = [label for label, _ in label_probs.items()]\n",
594 |     "scores = [score for _, score in label_probs.items()]\n",
595 |     "plt.figure(figsize=(5,2))\n",
596 |     "plt.bar(np.arange(len(emotions)), scores, align='center', alpha=0.5, color=['black', 'red', 'green', 'blue', 'cyan', \"purple\"])\n",
597 |     "plt.xticks(np.arange(len(emotions)), emotions)\n",
598 |     "plt.ylabel('Scores')\n",
599 |     "plt.show()\n",
600 |     "\n"
601 |    ]
602 |   },
603 |   {
604 |    "cell_type": "markdown",
605 |    "metadata": {
606 |     "_uuid": "dd58f4f2b92b103765af428baee13a53d80eb4e9"
607 |    },
608 |    "source": [
609 |     "**We have used an attention mechanism with an LSTM network to recognize emotions in given text.\n",
610 |     "We show that attention mechanism can be useful for classification tasks as well as sequence labeling tasks.\n",
611 |     "We have illustrated the attentions in order to make model predictions interpretable and look fancy.\n",
612 |     "Enjoy attentions mechanism in different applications...**\n",
613 |     "\n",
614 |     "*All feedbacks are welcome.*\n",
615 |     "\n"
616 |    ]
617 |   }
618 |  ],
619 |  "metadata": {
620 |   "kernelspec": {
621 |    "display_name": "Python 3",
622 |    "language": "python",
623 |    "name": "python3"
624 |   },
625 |   "language_info": {
626 |    "codemirror_mode": {
627 |     "name": "ipython",
628 |     "version": 3
629 |    },
630 |    "file_extension": ".py",
631 |    "mimetype": "text/x-python",
632 |    "name": "python",
633 |    "nbconvert_exporter": "python",
634 |    "pygments_lexer": "ipython3",
635 |    "version": "3.6.3"
636 |   }
637 |  },
638 |  "nbformat": 4,
639 |  "nbformat_minor": 1
640 | }
641 | 


--------------------------------------------------------------------------------
/notebook/word_analysis_local.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stderr",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "2019-07-04 22:42:21,041 util_log.py[128] INFO Start the program at:LALI2-M-G0MD, 127.0.0.1, with:Load module\n",
 13 |       "2019-07-04 22:42:21,045 util_pandas.py[19] WARNING \"No such keys(s): 'display.height'\"\n"
 14 |      ]
 15 |     },
 16 |     {
 17 |      "name": "stdout",
 18 |      "output_type": "stream",
 19 |      "text": [
 20 |       "yes\n",
 21 |       "/Users/lali2/Documents/workspace_py/xf_tag\n"
 22 |      ]
 23 |     }
 24 |    ],
 25 |    "source": [
 26 |     "import sys\n",
 27 |     "import os\n",
 28 |     "os.chdir('../')\n",
 29 |     "\n",
 30 |     "\n",
 31 |     "import pandas as pd\n",
 32 |     "import numpy as np\n",
 33 |     "\n",
 34 |     "from bokeh.palettes import Category10\n",
 35 |     "\n",
 36 |     "\n",
 37 |     "from tqdm import tqdm\n",
 38 |     "\n",
 39 |     "\n",
 40 |     "from file_cache.utils.util_pandas import *\n",
 41 |     "from file_cache.cache import file_cache\n",
 42 |     "from functools import lru_cache\n",
 43 |     "from glob import glob\n",
 44 |     "\n",
 45 |     "%matplotlib inline\n",
 46 |     "from core.conf import *\n",
 47 |     "!pwd"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 62,
 53 |    "metadata": {},
 54 |    "outputs": [
 55 |     {
 56 |      "name": "stdout",
 57 |      "output_type": "stream",
 58 |      "text": [
 59 |       "(94354, 2)\n"
 60 |      ]
 61 |     },
 62 |     {
 63 |      "data": {
 64 |       "text/html": [
 65 |        "<div>\n",
 66 |        "<style scoped>\n",
 67 |        "    .dataframe tbody tr th:only-of-type {\n",
 68 |        "        vertical-align: middle;\n",
 69 |        "    }\n",
 70 |        "\n",
 71 |        "    .dataframe tbody tr th {\n",
 72 |        "        vertical-align: top;\n",
 73 |        "    }\n",
 74 |        "\n",
 75 |        "    .dataframe thead th {\n",
 76 |        "        text-align: right;\n",
 77 |        "    }\n",
 78 |        "</style>\n",
 79 |        "<table border=\"1\" class=\"dataframe\">\n",
 80 |        "  <thead>\n",
 81 |        "    <tr style=\"text-align: right;\">\n",
 82 |        "      <th></th>\n",
 83 |        "      <th>app_id</th>\n",
 84 |        "      <th>app_des</th>\n",
 85 |        "    </tr>\n",
 86 |        "  </thead>\n",
 87 |        "  <tbody>\n",
 88 |        "    <tr>\n",
 89 |        "      <th>0</th>\n",
 90 |        "      <td>BB29DA6F8167CFC99E0853741C4EB17B</td>\n",
 91 |        "      <td>注意]游戏需要在设备上自己的歌曲注意]音乐赛车是一个音乐改编的赛车游戏，你用你自己的音乐比赛...</td>\n",
 92 |        "    </tr>\n",
 93 |        "    <tr>\n",
 94 |        "      <th>1</th>\n",
 95 |        "      <td>BB2A78EA7AD4945EAF6E38997F6139A3</td>\n",
 96 |        "      <td>定位试衣到家是一款基于地理位置，提供试衣到家专属购物体验的互联网平台。购物流程客户在试衣到家...</td>\n",
 97 |        "    </tr>\n",
 98 |        "    <tr>\n",
 99 |        "      <th>2</th>\n",
100 |        "      <td>BB2B1604CFA079C289FECF927DFBCE89</td>\n",
101 |        "      <td>想念一个人,就说出来。记得要下载安卓锁屏才可正常显示锁屏效果哦~更新内容更稳定、更优质，邀您...</td>\n",
102 |        "    </tr>\n",
103 |        "    <tr>\n",
104 |        "      <th>3</th>\n",
105 |        "      <td>BB2C7BD0B0623644183DAD08A89E1D90</td>\n",
106 |        "      <td>闽通宝手机客户端是基于移动互联网的，以公众出行服务为基础，贯彻绿色出行，低碳生活的理念，为出...</td>\n",
107 |        "    </tr>\n",
108 |        "    <tr>\n",
109 |        "      <th>4</th>\n",
110 |        "      <td>BB2E1A8F56158E483D7461E930E6332F</td>\n",
111 |        "      <td>风靡全球的DIY照片桌面，干净、流畅，启动提速100，瞬间提升手机性能；更是一亿用户的共同选...</td>\n",
112 |        "    </tr>\n",
113 |        "  </tbody>\n",
114 |        "</table>\n",
115 |        "</div>"
116 |       ],
117 |       "text/plain": [
118 |        "                             app_id  \\\n",
119 |        "0  BB29DA6F8167CFC99E0853741C4EB17B   \n",
120 |        "1  BB2A78EA7AD4945EAF6E38997F6139A3   \n",
121 |        "2  BB2B1604CFA079C289FECF927DFBCE89   \n",
122 |        "3  BB2C7BD0B0623644183DAD08A89E1D90   \n",
123 |        "4  BB2E1A8F56158E483D7461E930E6332F   \n",
124 |        "\n",
125 |        "                                             app_des  \n",
126 |        "0  注意]游戏需要在设备上自己的歌曲注意]音乐赛车是一个音乐改编的赛车游戏，你用你自己的音乐比赛...  \n",
127 |        "1  定位试衣到家是一款基于地理位置，提供试衣到家专属购物体验的互联网平台。购物流程客户在试衣到家...  \n",
128 |        "2  想念一个人,就说出来。记得要下载安卓锁屏才可正常显示锁屏效果哦~更新内容更稳定、更优质，邀您...  \n",
129 |        "3  闽通宝手机客户端是基于移动互联网的，以公众出行服务为基础，贯彻绿色出行，低碳生活的理念，为出...  \n",
130 |        "4  风靡全球的DIY照片桌面，干净、流畅，启动提速100，瞬间提升手机性能；更是一亿用户的共同选...  "
131 |       ]
132 |      },
133 |      "execution_count": 62,
134 |      "metadata": {},
135 |      "output_type": "execute_result"
136 |     }
137 |    ],
138 |    "source": [
139 |     "app_desc = pd.read_csv(f'{input_dir}/app_desc.dat', delimiter='\\t', header=None,  names =['app_id', 'app_des'])\n",
140 |     "print(app_desc.shape)\n",
141 |     "app_desc.head()"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": 63,
147 |    "metadata": {},
148 |    "outputs": [
149 |     {
150 |      "name": "stdout",
151 |      "output_type": "stream",
152 |      "text": [
153 |       "(152, 2)\n"
154 |      ]
155 |     },
156 |     {
157 |      "data": {
158 |       "text/html": [
159 |        "<div>\n",
160 |        "<style scoped>\n",
161 |        "    .dataframe tbody tr th:only-of-type {\n",
162 |        "        vertical-align: middle;\n",
163 |        "    }\n",
164 |        "\n",
165 |        "    .dataframe tbody tr th {\n",
166 |        "        vertical-align: top;\n",
167 |        "    }\n",
168 |        "\n",
169 |        "    .dataframe thead th {\n",
170 |        "        text-align: right;\n",
171 |        "    }\n",
172 |        "</style>\n",
173 |        "<table border=\"1\" class=\"dataframe\">\n",
174 |        "  <thead>\n",
175 |        "    <tr style=\"text-align: right;\">\n",
176 |        "      <th></th>\n",
177 |        "      <th>type_id</th>\n",
178 |        "      <th>type_name</th>\n",
179 |        "    </tr>\n",
180 |        "  </thead>\n",
181 |        "  <tbody>\n",
182 |        "    <tr>\n",
183 |        "      <th>0</th>\n",
184 |        "      <td>1401</td>\n",
185 |        "      <td>便捷生活</td>\n",
186 |        "    </tr>\n",
187 |        "    <tr>\n",
188 |        "      <th>1</th>\n",
189 |        "      <td>1402</td>\n",
190 |        "      <td>游戏</td>\n",
191 |        "    </tr>\n",
192 |        "    <tr>\n",
193 |        "      <th>2</th>\n",
194 |        "      <td>1403</td>\n",
195 |        "      <td>通讯社交</td>\n",
196 |        "    </tr>\n",
197 |        "    <tr>\n",
198 |        "      <th>3</th>\n",
199 |        "      <td>1404</td>\n",
200 |        "      <td>阅读</td>\n",
201 |        "    </tr>\n",
202 |        "    <tr>\n",
203 |        "      <th>4</th>\n",
204 |        "      <td>1405</td>\n",
205 |        "      <td>工作求职</td>\n",
206 |        "    </tr>\n",
207 |        "  </tbody>\n",
208 |        "</table>\n",
209 |        "</div>"
210 |       ],
211 |       "text/plain": [
212 |        "   type_id type_name\n",
213 |        "0     1401      便捷生活\n",
214 |        "1     1402        游戏\n",
215 |        "2     1403      通讯社交\n",
216 |        "3     1404        阅读\n",
217 |        "4     1405      工作求职"
218 |       ]
219 |      },
220 |      "execution_count": 63,
221 |      "metadata": {},
222 |      "output_type": "execute_result"
223 |     }
224 |    ],
225 |    "source": [
226 |     "app_type = pd.read_csv(f'{input_dir}/apptype_id_name.txt', delimiter='\\t', names =['type_id', 'type_name'] )\n",
227 |     "print(app_type.shape)\n",
228 |     "app_type.head()"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": 65,
234 |    "metadata": {},
235 |    "outputs": [
236 |     {
237 |      "name": "stdout",
238 |      "output_type": "stream",
239 |      "text": [
240 |       "(30000, 3)\n"
241 |      ]
242 |     },
243 |     {
244 |      "data": {
245 |       "text/html": [
246 |        "<div>\n",
247 |        "<style scoped>\n",
248 |        "    .dataframe tbody tr th:only-of-type {\n",
249 |        "        vertical-align: middle;\n",
250 |        "    }\n",
251 |        "\n",
252 |        "    .dataframe tbody tr th {\n",
253 |        "        vertical-align: top;\n",
254 |        "    }\n",
255 |        "\n",
256 |        "    .dataframe thead th {\n",
257 |        "        text-align: right;\n",
258 |        "    }\n",
259 |        "</style>\n",
260 |        "<table border=\"1\" class=\"dataframe\">\n",
261 |        "  <thead>\n",
262 |        "    <tr style=\"text-align: right;\">\n",
263 |        "      <th></th>\n",
264 |        "      <th>app_id</th>\n",
265 |        "      <th>type_id</th>\n",
266 |        "      <th>app_des</th>\n",
267 |        "    </tr>\n",
268 |        "  </thead>\n",
269 |        "  <tbody>\n",
270 |        "    <tr>\n",
271 |        "      <th>0</th>\n",
272 |        "      <td>00000777CE5B5AA5C1AC94DB8EABE0AC</td>\n",
273 |        "      <td>140203</td>\n",
274 |        "      <td>《游戏王座》使用说明书成分由怪兽卡、魔法卡、陷阱卡合计数千张卡牌以及刺激性、耐久性玩法组成。...</td>\n",
275 |        "    </tr>\n",
276 |        "    <tr>\n",
277 |        "      <th>1</th>\n",
278 |        "      <td>0000DEC36E15C27DBFC64AB8208C4B37</td>\n",
279 |        "      <td>140206</td>\n",
280 |        "      <td>更稳定、更优质，邀您一起。</td>\n",
281 |        "    </tr>\n",
282 |        "    <tr>\n",
283 |        "      <th>2</th>\n",
284 |        "      <td>0001791406307B1D1CE2BC64A830B7C7</td>\n",
285 |        "      <td>142106</td>\n",
286 |        "      <td>《小钱袋》是一款免费网络版记帐软件，适用于个人记帐、家庭记帐、团队记帐，全程帮您安全记录您财...</td>\n",
287 |        "    </tr>\n",
288 |        "    <tr>\n",
289 |        "      <th>3</th>\n",
290 |        "      <td>0002F14825B9CA01653325EEFD69D790</td>\n",
291 |        "      <td>142701</td>\n",
292 |        "      <td>领先的周易服务平台高人汇，汇聚算命大师、风水大师、占卜大师、手相大师、起名大师、算命先生、面...</td>\n",
293 |        "    </tr>\n",
294 |        "    <tr>\n",
295 |        "      <th>4</th>\n",
296 |        "      <td>000419D79365331F89399E5F38A91B05</td>\n",
297 |        "      <td>140901</td>\n",
298 |        "      <td>平行空间是一款极简、免费的黑科技双开助手；您可以在平行空间双开微信微博、陌陌、映客、yy等应...</td>\n",
299 |        "    </tr>\n",
300 |        "  </tbody>\n",
301 |        "</table>\n",
302 |        "</div>"
303 |       ],
304 |       "text/plain": [
305 |        "                             app_id type_id  \\\n",
306 |        "0  00000777CE5B5AA5C1AC94DB8EABE0AC  140203   \n",
307 |        "1  0000DEC36E15C27DBFC64AB8208C4B37  140206   \n",
308 |        "2  0001791406307B1D1CE2BC64A830B7C7  142106   \n",
309 |        "3  0002F14825B9CA01653325EEFD69D790  142701   \n",
310 |        "4  000419D79365331F89399E5F38A91B05  140901   \n",
311 |        "\n",
312 |        "                                             app_des  \n",
313 |        "0  《游戏王座》使用说明书成分由怪兽卡、魔法卡、陷阱卡合计数千张卡牌以及刺激性、耐久性玩法组成。...  \n",
314 |        "1                                      更稳定、更优质，邀您一起。  \n",
315 |        "2  《小钱袋》是一款免费网络版记帐软件，适用于个人记帐、家庭记帐、团队记帐，全程帮您安全记录您财...  \n",
316 |        "3  领先的周易服务平台高人汇，汇聚算命大师、风水大师、占卜大师、手相大师、起名大师、算命先生、面...  \n",
317 |        "4  平行空间是一款极简、免费的黑科技双开助手；您可以在平行空间双开微信微博、陌陌、映客、yy等应...  "
318 |       ]
319 |      },
320 |      "execution_count": 65,
321 |      "metadata": {},
322 |      "output_type": "execute_result"
323 |     }
324 |    ],
325 |    "source": [
326 |     "import csv\n",
327 |     "apptype_train  = pd.read_csv(f'{input_dir}/apptype_train.dat', sep='\\t', \n",
328 |     "                             names =['app_id', 'type_id', 'app_des'] , \n",
329 |     "                             quoting=3\n",
330 |     "                            )\n",
331 |     "print(apptype_train.shape)\n",
332 |     "apptype_train.head()\n",
333 |     "#apptype_train.iloc[2,2]"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": 66,
339 |    "metadata": {},
340 |    "outputs": [
341 |     {
342 |      "data": {
343 |       "text/html": [
344 |        "<div>\n",
345 |        "<style scoped>\n",
346 |        "    .dataframe tbody tr th:only-of-type {\n",
347 |        "        vertical-align: middle;\n",
348 |        "    }\n",
349 |        "\n",
350 |        "    .dataframe tbody tr th {\n",
351 |        "        vertical-align: top;\n",
352 |        "    }\n",
353 |        "\n",
354 |        "    .dataframe thead th {\n",
355 |        "        text-align: right;\n",
356 |        "    }\n",
357 |        "</style>\n",
358 |        "<table border=\"1\" class=\"dataframe\">\n",
359 |        "  <thead>\n",
360 |        "    <tr style=\"text-align: right;\">\n",
361 |        "      <th></th>\n",
362 |        "      <th>app_id</th>\n",
363 |        "      <th>type_id</th>\n",
364 |        "      <th>app_des</th>\n",
365 |        "    </tr>\n",
366 |        "  </thead>\n",
367 |        "  <tbody>\n",
368 |        "    <tr>\n",
369 |        "      <th>15945</th>\n",
370 |        "      <td>63959834D8FB9D68C03A75C9BB0906EA</td>\n",
371 |        "      <td>140206</td>\n",
372 |        "      <td>在全新的地图中，你将与戴夫一起面对驾驶飞行器呼啸而来的僵尸军团，肩负起守卫天空之城的重任，同...</td>\n",
373 |        "    </tr>\n",
374 |        "    <tr>\n",
375 |        "      <th>15946</th>\n",
376 |        "      <td>63961F67B88D3D7D877101F80A53E5CD</td>\n",
377 |        "      <td>140901</td>\n",
378 |        "      <td>部分小错误，整体。</td>\n",
379 |        "    </tr>\n",
380 |        "    <tr>\n",
381 |        "      <th>15947</th>\n",
382 |        "      <td>6396C70B6383F0BF243EF69927ACF35F</td>\n",
383 |        "      <td>140901</td>\n",
384 |        "      <td>以太大陆EthMin 以太大陆是一个数字生态世界，帮助个体管理在现实世界中所付出的努力与贡献...</td>\n",
385 |        "    </tr>\n",
386 |        "    <tr>\n",
387 |        "      <th>15948</th>\n",
388 |        "      <td>6396F4C27E1F1D86762B9283D701DB78</td>\n",
389 |        "      <td>142501</td>\n",
390 |        "      <td>帮助准妈妈在分娩前记录宫缩频率和时长，以判断是否达到就医标准。遇到问题，可1对1在线咨询产科...</td>\n",
391 |        "    </tr>\n",
392 |        "    <tr>\n",
393 |        "      <th>15949</th>\n",
394 |        "      <td>63997AB7F3E277BC0CB1D42C3D8360F4</td>\n",
395 |        "      <td>142103</td>\n",
396 |        "      <td>线上线下优势资源整合  不必四处奔波，专属咨询顾问为您服务。  安心快速无抵押  去繁求简，...</td>\n",
397 |        "    </tr>\n",
398 |        "    <tr>\n",
399 |        "      <th>15950</th>\n",
400 |        "      <td>639B889103E0AFD7D23E8C593DB6A6D1</td>\n",
401 |        "      <td>140211</td>\n",
402 |        "      <td>更稳定、更优质，邀您一起。</td>\n",
403 |        "    </tr>\n",
404 |        "    <tr>\n",
405 |        "      <th>15951</th>\n",
406 |        "      <td>639BC48DB51B5806B726B392224F0CA8</td>\n",
407 |        "      <td>142102</td>\n",
408 |        "      <td>金钱永不眠一个股票账户，一笔钱投资美股/港股/英股/A股；全球资产配置的一站式股票平台享受一...</td>\n",
409 |        "    </tr>\n",
410 |        "    <tr>\n",
411 |        "      <th>15952</th>\n",
412 |        "      <td>639C08D6CA2142E0CFD60E64DFB7C326</td>\n",
413 |        "      <td>140901</td>\n",
414 |        "      <td>文字转语音合成免费语音翻译、文本朗读、红包口令、普通话吆喝广告音频合成，一款专业进行文字转语...</td>\n",
415 |        "    </tr>\n",
416 |        "    <tr>\n",
417 |        "      <th>15953</th>\n",
418 |        "      <td>639C9663BB3CABFA048B3A54ED9B8CC9</td>\n",
419 |        "      <td>140401</td>\n",
420 |        "      <td>在微博，官方发布新闻，草根爆料八卦；在微博，大V明星发布动态，粉丝狗仔爆料内幕；在微博，海量...</td>\n",
421 |        "    </tr>\n",
422 |        "    <tr>\n",
423 |        "      <th>15954</th>\n",
424 |        "      <td>639DBC25084151D681F73C1A331B6CBA</td>\n",
425 |        "      <td>140210</td>\n",
426 |        "      <td>比斗地主麻将更简单、比炸金花牛牛更刺激，全球百姓共同推荐锻炼情商智商，大奖话费信手拈来。客官...</td>\n",
427 |        "    </tr>\n",
428 |        "    <tr>\n",
429 |        "      <th>15955</th>\n",
430 |        "      <td>63A0262B4C6D416DC2816B15E716C31D</td>\n",
431 |        "      <td>142103</td>\n",
432 |        "      <td>\"贷款App3分钟下款20000元 通过率高达96.7 贷款借钱急用钱藕丁钱包，帮您闪电周转...</td>\n",
433 |        "    </tr>\n",
434 |        "    <tr>\n",
435 |        "      <th>15956</th>\n",
436 |        "      <td>63A09CBD3873CE47A42BC285705B8431</td>\n",
437 |        "      <td>140901</td>\n",
438 |        "      <td>这是一款优客工场官方发布的移动客户端，涵盖了优客工场支持的所有辅助功能，旨在为你和你的团队提...</td>\n",
439 |        "    </tr>\n",
440 |        "    <tr>\n",
441 |        "      <th>15957</th>\n",
442 |        "      <td>63A0C80FD1F955F8C53AFE69291EC652</td>\n",
443 |        "      <td>140107</td>\n",
444 |        "      <td>天天快递是一款兼有邮递功能的门对门物流活动的手机客户端。不仅具有下单、查询订单、搜索商品、晒...</td>\n",
445 |        "    </tr>\n",
446 |        "    <tr>\n",
447 |        "      <th>15958</th>\n",
448 |        "      <td>63A2BCCA93BB9ED948A2892CFEF4CFCE</td>\n",
449 |        "      <td>140207</td>\n",
450 |        "      <td>喜羊羊快跑是一款跑酷类的游戏。喜羊羊快跑该游戏让你分分钟忆起我们一起看过的喜羊羊以喜羊羊为主...</td>\n",
451 |        "    </tr>\n",
452 |        "    <tr>\n",
453 |        "      <th>15959</th>\n",
454 |        "      <td>63A9AFD1952D1EF58A3743CA5BD76602</td>\n",
455 |        "      <td>140206</td>\n",
456 |        "      <td>住在城市下面鳄鱼Swmpy希望过上人类一样的生活。他非常喜欢干净。可鳄鱼Crnky不满Swm...</td>\n",
457 |        "    </tr>\n",
458 |        "    <tr>\n",
459 |        "      <th>15960</th>\n",
460 |        "      <td>63AA2D5AFFD768100625F947BA030B48</td>\n",
461 |        "      <td>142105</td>\n",
462 |        "      <td>一款信息查询辅助应用软件，服务广大福彩彩民，为福彩中心的各种促销活动提供平台支持。主要功能纸...</td>\n",
463 |        "    </tr>\n",
464 |        "    <tr>\n",
465 |        "      <th>15961</th>\n",
466 |        "      <td>63AB66CD8D27C6B269F4960FB530AA76</td>\n",
467 |        "      <td>142104</td>\n",
468 |        "      <td>国美金融App简介国美金融App是综合性金融服务平台，致力为个人和企业提供定制化财富管理服务...</td>\n",
469 |        "    </tr>\n",
470 |        "    <tr>\n",
471 |        "      <th>15962</th>\n",
472 |        "      <td>63AD9FA5338921C66943390ADA5DCF23</td>\n",
473 |        "      <td>142102</td>\n",
474 |        "      <td>华福证券网上开户2.0系统，采用人脸单向识别技术，优化开户排队机制，可绑定老账户，为您节省宝...</td>\n",
475 |        "    </tr>\n",
476 |        "    <tr>\n",
477 |        "      <th>15963</th>\n",
478 |        "      <td>63AF8C9C9E16F935F8F424533D24FD40</td>\n",
479 |        "      <td>140701</td>\n",
480 |        "      <td>软件简介学霸君1对1学生端是学霸君旗下一款中小学在线1对1辅导应用软件。基于学霸君超过900...</td>\n",
481 |        "    </tr>\n",
482 |        "    <tr>\n",
483 |        "      <th>15964</th>\n",
484 |        "      <td>63B0E4D5A2319B7684D8959D3703B7C4</td>\n",
485 |        "      <td>140404</td>\n",
486 |        "      <td>曹雪芹著的经典文学名著《红楼梦》，特殊的幼儿早教方式，让您的孩子赢在起跑线上，大人和儿童皆宜...</td>\n",
487 |        "    </tr>\n",
488 |        "    <tr>\n",
489 |        "      <th>15965</th>\n",
490 |        "      <td>63B1568A4BA00BB36247F3FE7E63D046</td>\n",
491 |        "      <td>140210</td>\n",
492 |        "      <td>大家都想让自己成为一名优秀的象棋手吧那就赶快行动起来，锻炼自己，让自己成为万人瞩目的象棋大师...</td>\n",
493 |        "    </tr>\n",
494 |        "    <tr>\n",
495 |        "      <th>15966</th>\n",
496 |        "      <td>63B5F7FD3037C633611E405BF76357A6</td>\n",
497 |        "      <td>140901</td>\n",
498 |        "      <td>流量监控软件是一款功能强大的android流量管理程序。它可以根据不同的android系统版...</td>\n",
499 |        "    </tr>\n",
500 |        "    <tr>\n",
501 |        "      <th>15967</th>\n",
502 |        "      <td>63B6A2A65E22AB3BEF8E6E3627058005</td>\n",
503 |        "      <td>140901</td>\n",
504 |        "      <td>三星应用商店GlxyApps是三星官方开发和运营的应用下载平台，拥有数十万款应用、游戏和主题...</td>\n",
505 |        "    </tr>\n",
506 |        "    <tr>\n",
507 |        "      <th>15968</th>\n",
508 |        "      <td>63B8474AE7D557EB69107C1C8D67293B</td>\n",
509 |        "      <td>140212</td>\n",
510 |        "      <td>海陆空立体战争手游《抢滩登陆3D》是由美国DIGITALFUSION公司正版授权，对比经典抢...</td>\n",
511 |        "    </tr>\n",
512 |        "    <tr>\n",
513 |        "      <th>15969</th>\n",
514 |        "      <td>63B89ACCF7E7BB4E8048A4430A61198E</td>\n",
515 |        "      <td>140603</td>\n",
516 |        "      <td>《音乐达人可可摇滚明星MusicIdolCocoRockStr》是一款休闲娱乐游戏。请想象一...</td>\n",
517 |        "    </tr>\n",
518 |        "    <tr>\n",
519 |        "      <th>15970</th>\n",
520 |        "      <td>63BA67638DB01DFD3BE2B89A6DA9C632</td>\n",
521 |        "      <td>140802</td>\n",
522 |        "      <td>乌鲁木齐地铁官方APP，为您提供全新出行方式。地铁购票&amp;mdash;&amp;mdash;乘客可通过...</td>\n",
523 |        "    </tr>\n",
524 |        "    <tr>\n",
525 |        "      <th>15971</th>\n",
526 |        "      <td>63BA6BE8E50C34BC6C08F39487BF3063</td>\n",
527 |        "      <td>140404</td>\n",
528 |        "      <td>产品简介免费追书吧，一款专业免费的电子书阅读软件，爱阅读的小伙伴千万不要错过。全本小说免费阅...</td>\n",
529 |        "    </tr>\n",
530 |        "    <tr>\n",
531 |        "      <th>15972</th>\n",
532 |        "      <td>63BBCDE7DE3AE668D03FAB004E986F4F</td>\n",
533 |        "      <td>140301|140604</td>\n",
534 |        "      <td>人人是一个火爆校园的高颜值网红美女视频交友直播平台，同城交友约会聊天，明星艺人在线直播，72...</td>\n",
535 |        "    </tr>\n",
536 |        "    <tr>\n",
537 |        "      <th>15973</th>\n",
538 |        "      <td>63BD79538A92F05644BE6AD23D87B545</td>\n",
539 |        "      <td>140603</td>\n",
540 |        "      <td>铃声多多手机铃声大全中国移动、中国电信、中国联通3大运营商音乐基地战略合作产品百万铃声，轻松...</td>\n",
541 |        "    </tr>\n",
542 |        "    <tr>\n",
543 |        "      <th>15974</th>\n",
544 |        "      <td>63BF35D999C3B21BB0E783CD56FD60D0</td>\n",
545 |        "      <td>140207</td>\n",
546 |        "      <td>《侍灵》是一款日式暗黑系的横版格斗QTE手游，在动作游戏领域中做出了大胆的创新，通过策略搭配...</td>\n",
547 |        "    </tr>\n",
548 |        "    <tr>\n",
549 |        "      <th>15975</th>\n",
550 |        "      <td>63BFFE1204509BBA9BD9E0E406FB2A38</td>\n",
551 |        "      <td>142103</td>\n",
552 |        "      <td>风云管家，信用卡管家智能还款神器信用卡高端玩家的选择账单全额自动还款2000万实体商户,餐饮...</td>\n",
553 |        "    </tr>\n",
554 |        "    <tr>\n",
555 |        "      <th>15976</th>\n",
556 |        "      <td>63C0F5069E829510104C56911CF571D1</td>\n",
557 |        "      <td>140207</td>\n",
558 |        "      <td>两种汽车的反应机是很有趣的游戏。你的任务是点击需要的图片点击得比对象快。这款游戏很好的训练注...</td>\n",
559 |        "    </tr>\n",
560 |        "    <tr>\n",
561 |        "      <th>15977</th>\n",
562 |        "      <td>63C5FA30A92F3B99258FA6085EE90D91</td>\n",
563 |        "      <td>141201</td>\n",
564 |        "      <td>通运先培后付学员端通运先培后付App学员端是针对学车人士开发的一款功能性APP。通运学员端引...</td>\n",
565 |        "    </tr>\n",
566 |        "    <tr>\n",
567 |        "      <th>15978</th>\n",
568 |        "      <td>63CA760775B2CD3D62995F657568CC8E</td>\n",
569 |        "      <td>141001</td>\n",
570 |        "      <td>宝宝学加减法，是宝宝巴士专为5+宝贝设计的数学软件，让孩子轻松学习加减法，赢在起跑线~难度贴...</td>\n",
571 |        "    </tr>\n",
572 |        "    <tr>\n",
573 |        "      <th>15979</th>\n",
574 |        "      <td>63CB103A546C380870C8A3FA53A14208</td>\n",
575 |        "      <td>140113</td>\n",
576 |        "      <td>长安通APP是一款便民生活服务软件。用户使用长安通APP，可以实现长安通卡NFC充值、查询余...</td>\n",
577 |        "    </tr>\n",
578 |        "  </tbody>\n",
579 |        "</table>\n",
580 |        "</div>"
581 |       ],
582 |       "text/plain": [
583 |        "                                 app_id        type_id  \\\n",
584 |        "15945  63959834D8FB9D68C03A75C9BB0906EA         140206   \n",
585 |        "15946  63961F67B88D3D7D877101F80A53E5CD         140901   \n",
586 |        "15947  6396C70B6383F0BF243EF69927ACF35F         140901   \n",
587 |        "15948  6396F4C27E1F1D86762B9283D701DB78         142501   \n",
588 |        "15949  63997AB7F3E277BC0CB1D42C3D8360F4         142103   \n",
589 |        "15950  639B889103E0AFD7D23E8C593DB6A6D1         140211   \n",
590 |        "15951  639BC48DB51B5806B726B392224F0CA8         142102   \n",
591 |        "15952  639C08D6CA2142E0CFD60E64DFB7C326         140901   \n",
592 |        "15953  639C9663BB3CABFA048B3A54ED9B8CC9         140401   \n",
593 |        "15954  639DBC25084151D681F73C1A331B6CBA         140210   \n",
594 |        "15955  63A0262B4C6D416DC2816B15E716C31D         142103   \n",
595 |        "15956  63A09CBD3873CE47A42BC285705B8431         140901   \n",
596 |        "15957  63A0C80FD1F955F8C53AFE69291EC652         140107   \n",
597 |        "15958  63A2BCCA93BB9ED948A2892CFEF4CFCE         140207   \n",
598 |        "15959  63A9AFD1952D1EF58A3743CA5BD76602         140206   \n",
599 |        "15960  63AA2D5AFFD768100625F947BA030B48         142105   \n",
600 |        "15961  63AB66CD8D27C6B269F4960FB530AA76         142104   \n",
601 |        "15962  63AD9FA5338921C66943390ADA5DCF23         142102   \n",
602 |        "15963  63AF8C9C9E16F935F8F424533D24FD40         140701   \n",
603 |        "15964  63B0E4D5A2319B7684D8959D3703B7C4         140404   \n",
604 |        "15965  63B1568A4BA00BB36247F3FE7E63D046         140210   \n",
605 |        "15966  63B5F7FD3037C633611E405BF76357A6         140901   \n",
606 |        "15967  63B6A2A65E22AB3BEF8E6E3627058005         140901   \n",
607 |        "15968  63B8474AE7D557EB69107C1C8D67293B         140212   \n",
608 |        "15969  63B89ACCF7E7BB4E8048A4430A61198E         140603   \n",
609 |        "15970  63BA67638DB01DFD3BE2B89A6DA9C632         140802   \n",
610 |        "15971  63BA6BE8E50C34BC6C08F39487BF3063         140404   \n",
611 |        "15972  63BBCDE7DE3AE668D03FAB004E986F4F  140301|140604   \n",
612 |        "15973  63BD79538A92F05644BE6AD23D87B545         140603   \n",
613 |        "15974  63BF35D999C3B21BB0E783CD56FD60D0         140207   \n",
614 |        "15975  63BFFE1204509BBA9BD9E0E406FB2A38         142103   \n",
615 |        "15976  63C0F5069E829510104C56911CF571D1         140207   \n",
616 |        "15977  63C5FA30A92F3B99258FA6085EE90D91         141201   \n",
617 |        "15978  63CA760775B2CD3D62995F657568CC8E         141001   \n",
618 |        "15979  63CB103A546C380870C8A3FA53A14208         140113   \n",
619 |        "\n",
620 |        "                                                 app_des  \n",
621 |        "15945  在全新的地图中，你将与戴夫一起面对驾驶飞行器呼啸而来的僵尸军团，肩负起守卫天空之城的重任，同...  \n",
622 |        "15946                                          部分小错误，整体。  \n",
623 |        "15947  以太大陆EthMin 以太大陆是一个数字生态世界，帮助个体管理在现实世界中所付出的努力与贡献...  \n",
624 |        "15948  帮助准妈妈在分娩前记录宫缩频率和时长，以判断是否达到就医标准。遇到问题，可1对1在线咨询产科...  \n",
625 |        "15949  线上线下优势资源整合  不必四处奔波，专属咨询顾问为您服务。  安心快速无抵押  去繁求简，...  \n",
626 |        "15950                                      更稳定、更优质，邀您一起。  \n",
627 |        "15951  金钱永不眠一个股票账户，一笔钱投资美股/港股/英股/A股；全球资产配置的一站式股票平台享受一...  \n",
628 |        "15952  文字转语音合成免费语音翻译、文本朗读、红包口令、普通话吆喝广告音频合成，一款专业进行文字转语...  \n",
629 |        "15953  在微博，官方发布新闻，草根爆料八卦；在微博，大V明星发布动态，粉丝狗仔爆料内幕；在微博，海量...  \n",
630 |        "15954  比斗地主麻将更简单、比炸金花牛牛更刺激，全球百姓共同推荐锻炼情商智商，大奖话费信手拈来。客官...  \n",
631 |        "15955  \"贷款App3分钟下款20000元 通过率高达96.7 贷款借钱急用钱藕丁钱包，帮您闪电周转...  \n",
632 |        "15956  这是一款优客工场官方发布的移动客户端，涵盖了优客工场支持的所有辅助功能，旨在为你和你的团队提...  \n",
633 |        "15957  天天快递是一款兼有邮递功能的门对门物流活动的手机客户端。不仅具有下单、查询订单、搜索商品、晒...  \n",
634 |        "15958  喜羊羊快跑是一款跑酷类的游戏。喜羊羊快跑该游戏让你分分钟忆起我们一起看过的喜羊羊以喜羊羊为主...  \n",
635 |        "15959  住在城市下面鳄鱼Swmpy希望过上人类一样的生活。他非常喜欢干净。可鳄鱼Crnky不满Swm...  \n",
636 |        "15960  一款信息查询辅助应用软件，服务广大福彩彩民，为福彩中心的各种促销活动提供平台支持。主要功能纸...  \n",
637 |        "15961  国美金融App简介国美金融App是综合性金融服务平台，致力为个人和企业提供定制化财富管理服务...  \n",
638 |        "15962  华福证券网上开户2.0系统，采用人脸单向识别技术，优化开户排队机制，可绑定老账户，为您节省宝...  \n",
639 |        "15963  软件简介学霸君1对1学生端是学霸君旗下一款中小学在线1对1辅导应用软件。基于学霸君超过900...  \n",
640 |        "15964  曹雪芹著的经典文学名著《红楼梦》，特殊的幼儿早教方式，让您的孩子赢在起跑线上，大人和儿童皆宜...  \n",
641 |        "15965  大家都想让自己成为一名优秀的象棋手吧那就赶快行动起来，锻炼自己，让自己成为万人瞩目的象棋大师...  \n",
642 |        "15966  流量监控软件是一款功能强大的android流量管理程序。它可以根据不同的android系统版...  \n",
643 |        "15967  三星应用商店GlxyApps是三星官方开发和运营的应用下载平台，拥有数十万款应用、游戏和主题...  \n",
644 |        "15968  海陆空立体战争手游《抢滩登陆3D》是由美国DIGITALFUSION公司正版授权，对比经典抢...  \n",
645 |        "15969  《音乐达人可可摇滚明星MusicIdolCocoRockStr》是一款休闲娱乐游戏。请想象一...  \n",
646 |        "15970  乌鲁木齐地铁官方APP，为您提供全新出行方式。地铁购票&mdash;&mdash;乘客可通过...  \n",
647 |        "15971  产品简介免费追书吧，一款专业免费的电子书阅读软件，爱阅读的小伙伴千万不要错过。全本小说免费阅...  \n",
648 |        "15972  人人是一个火爆校园的高颜值网红美女视频交友直播平台，同城交友约会聊天，明星艺人在线直播，72...  \n",
649 |        "15973  铃声多多手机铃声大全中国移动、中国电信、中国联通3大运营商音乐基地战略合作产品百万铃声，轻松...  \n",
650 |        "15974  《侍灵》是一款日式暗黑系的横版格斗QTE手游，在动作游戏领域中做出了大胆的创新，通过策略搭配...  \n",
651 |        "15975  风云管家，信用卡管家智能还款神器信用卡高端玩家的选择账单全额自动还款2000万实体商户,餐饮...  \n",
652 |        "15976  两种汽车的反应机是很有趣的游戏。你的任务是点击需要的图片点击得比对象快。这款游戏很好的训练注...  \n",
653 |        "15977  通运先培后付学员端通运先培后付App学员端是针对学车人士开发的一款功能性APP。通运学员端引...  \n",
654 |        "15978  宝宝学加减法，是宝宝巴士专为5+宝贝设计的数学软件，让孩子轻松学习加减法，赢在起跑线~难度贴...  \n",
655 |        "15979  长安通APP是一款便民生活服务软件。用户使用长安通APP，可以实现长安通卡NFC充值、查询余...  "
656 |       ]
657 |      },
658 |      "execution_count": 66,
659 |      "metadata": {},
660 |      "output_type": "execute_result"
661 |     }
662 |    ],
663 |    "source": [
664 |     "apptype_train.iloc[15945:15980]"
665 |    ]
666 |   }
667 |  ],
668 |  "metadata": {
669 |   "kernelspec": {
670 |    "display_name": "Python 3",
671 |    "language": "python",
672 |    "name": "python3"
673 |   },
674 |   "language_info": {
675 |    "codemirror_mode": {
676 |     "name": "ipython",
677 |     "version": 3
678 |    },
679 |    "file_extension": ".py",
680 |    "mimetype": "text/x-python",
681 |    "name": "python",
682 |    "nbconvert_exporter": "python",
683 |    "pygments_lexer": "ipython3",
684 |    "version": "3.6.8"
685 |   }
686 |  },
687 |  "nbformat": 4,
688 |  "nbformat_minor": 2
689 | }
690 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # 数据的准备
  3 | 需要下载bert的预训练模型,然后在conf中修改对应的变量pretrained_path
  4 | 
  5 | 下载地址: https://docs.google.com/uc?export=download&id=1W3WgPJWGVKlU9wpUYsdZuurAIFKvrl_Y
  6 | 
  7 | 
  8 | # 运行方式
  9 | ## 环境准备
 10 | *  依赖包的安装
 11 |     
 12 |     pip install -r requirements.txt
 13 |    
 14 | *   GPU的指定
 15 | 
 16 |     修改bert.py中的如下代码
 17 | 
 18 |     os.environ["CUDA_VISIBLE_DEVICES"] = "2"
 19 | 
 20 | *  OOF的生成: BERT
 21 |     
 22 |     以5折的方式运行程序,并且每一折都运行3次
 23 |     
 24 |     nohup ./bin/main.sh 3 &
 25 |     
 26 | 
 27 | *  融合OOF: 对生成的OOF合并生成提交文件:
 28 |     
 29 |     nohup python -u ./core/ensemble_new.py  main  >> ensemble_final.log 2>&1 &
 30 |     
 31 | *  生成的提交文件可见于 ./output/sub/
 32 |     
 33 | 
 34 | # 数据的爬取
 35 | * 百度
 36 | * 应用宝
 37 | * 豌豆荚
 38 | * 小米应用商店
 39 | 
 40 | # 数据清洗
 41 | - 异常数据的发现
 42 | 
 43 |     -  appname 为 #ffababab
 44 |         ffababab	    com.hisense.uifac
 45 |         FACTORY MENU	com.hisense.uifac
 46 |         工厂菜单	        com.hisense.uifac
 47 |     
 48 |     - appname 为 WXEntryActivity (138个)
 49 |         WXEntryActivity	    com.tencent.ft
 50 |         妖精的尾巴：魔导少年	com.tencent.ft
 51 |         
 52 |     - 百度手机助手(*)  
 53 |         百度手机助手(米聊)     
 54 |         百度手机助手(平安金管家)   
 55 |         百度手机助手(掌上电力)  
 56 | 
 57 | 
 58 | 
 59 | - 繁体字
 60 |     所有的繁体字转简体字
 61 | 
 62 | 
 63 | # 数据
 64 | - 数据增强(切割)
 65 | 
 66 |     由于bert等模型,都对输入的seq_len有字数限制. 余下的数据只能浪费.数据增强的一点是对浪费的数据进行利用.
 67 |     
 68 |     数据seq_len不是针对字符串的长度,而是bert对输入转ID之后的长度,否则对英文的影响是巨大的. 
 69 |     
 70 |     具体源码可以参考:get_feature_bert(seq_len)
 71 | 
 72 | - 数据增强(不同源)
 73 | 
 74 |     由于第二期,是自行爬取数据,往往对于一条数据可以从多个源爬取数据.如果只选择一个源,则会对数据进行另外的一种浪费
 75 |     
 76 |     对数据进行选择或者融合,需要对同一条数据查询到的多个结果根据下列因素,赋予不同权重
 77 |     1) input appname 是否等于 output appname
 78 |     2) input appnane 是否包含于 output appname
 79 |     3) input pkg 是否等于 output pkg
 80 |     4) 返回结果的长度
 81 |     5) Appname 是否出现在返回结果中
 82 |     6) 数据源的不同, 百度, 豌豆荚, 小米应用商店, 应用宝
 83 |     
 84 |     
 85 | 
 86 |     
 87 | - 过拟合的避免
 88 | 
 89 |     数据增强后, 如果同一源头的数据如果切分到了训练集和验证集,会造成本地分数虚高.所以需要保证同源数据分到同一fold
 90 | 
 91 | 
 92 | # 算法
 93 | - 不同seq_len
 94 | 
 95 |     不同的seq_len 会有不同的表现,可以择优保留,然后融合
 96 | 
 97 | - 不同模型
 98 | 
 99 |     bert: 由于Bert有字数限制,最长512,并且太长效果并不是特别好.只能对局部信息进行利用
100 |     lstm: LSTM对比Bert可以设置较大的训练窗口,利用大部分数据.和bert进行融合,是一个极大的补充.
101 | 
102 | - 克服抖动
103 | 
104 |     由于利用GPU进行训练,结果往往会有一定的抖动,可以对同一模型训练多轮,择优保留
105 | 
106 | 
107 | # 后处理/融合
108 | 
109 | - 不同数据源
110 | 
111 |     数据增强,不仅仅针对训练集, 对测试集也做了对应的增强. 这样,同一条测试集,会有多个预测结果.这样对这些结果进行加权融合会有比较好的结果.
112 |     
113 | 
114 | - 不同模型
115 | 
116 |     同一条测试集,会有多个模型进行预测,然后进行加权融合.
117 | 
118 | - 不同的切割序列
119 | 
120 |     不同模型,选择不同的seq_len,得到的不同结果,然后融合.
121 | 
122 | # 团队成员
123 |     牛张明
124 |     攻城狮
125 |     周青松  
126 |     林智敏
127 |     罗宾理
128 | 
129 | 
130 |     
131 | 


--------------------------------------------------------------------------------
/readme2.md:
--------------------------------------------------------------------------------
 1 | # Initial
 2 | 
 3 | # 腾讯词向量下载地址
 4 | 
 5 | - https://ai.tencent.com/ailab/nlp/data/Tencent_AILab_ChineseEmbedding.tar.gz
 6 | ln -s ../../word_vec/Tencent_AILab_ChineseEmbedding.txt  Tencent_AILab_ChineseEmbedding.txt
 7 | - https://github.com/Embedding/Chinese-Word-Vectors
 8 |     Zhihu_QA 知乎问答
 9 | 
10 | # 去除引号
11 | sed -i 's/"//g' *.*
12 | 
13 | 
14 | # 停用词
15 | https://github.com/goto456/stopwords
16 | 
17 | 
18 | # Type_id Cnt: 
19 | 152
20 | 
21 | # Bert
22 | 
23 |     wget -q https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
24 |     wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
25 |     the download model will save to folder: input/model/chinese_L-12_H-768_A-12
26 | 
27 | 
28 | # 输入目录
29 | 
30 |     input/
31 |     ├── model
32 |     │   ├── chinese_L-12_H-768_A-12
33 |     │   │   ├── bert_config.json
34 |     │   │   ├── bert_model.ckpt.data-00000-of-00001
35 |     │   │   ├── bert_model.ckpt.index
36 |     │   │   ├── bert_model.ckpt.meta
37 |     │   │   └── vocab.txt
38 |     │   └── uncased_L-12_H-768_A-12.zip
39 |     ├── Tencent_AILab_ChineseEmbedding.txt
40 |     └── zip
41 |         ├── app_desc.dat
42 |         ├── apptype_id_name.txt
43 |         ├── apptype_train.dat
44 |         └── mini.dat
45 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | keras-bert==0.57.1
 2 | jieba
 3 | deprecated
 4 | urllib3
 5 | file_cache==0.1.36
 6 | keras==2.2.4
 7 | tables
 8 | 
 9 | #tensorflow==1.11.0
10 | #tensorflow-gpu==1.11.0
11 | 
12 | tensorflow==1.13.1
13 | tensorflow-gpu==1.13.1
14 | 
15 | 


--------------------------------------------------------------------------------
/spider/gen_file.py:
--------------------------------------------------------------------------------
 1 | from spider.mi import *
 2 | import re
 3 | from core.conf import check_type_list
 4 | 
 5 | if __name__ == '__main__':
 6 |     import sys
 7 | 
 8 |     print(sys.argv)
 9 | 
10 |     #max_len = sys.argv[1]
11 | 
12 |     sys.argv=['a','b']
13 | 
14 |     final = get_final_feature()
15 |     final.desc_name = final.desc_name.str[:4000]
16 | 
17 |     file = './input/zip/apptype_train.dat_p2'
18 | 
19 |     train = final.loc[final.type_id.str.len() > 0]
20 |     train['id'] = 'valxxx' + train['id'].str[6:]
21 |     train.loc[:, ['id', 'type_id', 'desc_name']].to_csv(file, sep='\t', header=None, index=None)
22 |     print(f'save {len(train)} rows to {file} ')
23 | 
24 |     file='./input/zip/app_desc.dat'
25 |     test = final.loc[final.type_id.str.len()==0]#.loc[final.type_id.str.len()==0]
26 |     test = pd.concat([train, test])
27 |     test.loc[:,['id','desc_name']].to_csv(file, sep='\t',header=None, index=None)
28 |     print(f'save {len(test)} rows to {file} ')
29 | 
30 | 
31 | 
32 | 
33 | 
34 | #
35 | # #######################
36 | #     train_list = []
37 | #     for item in check_type_list:
38 | #         if item =='stb':
39 | #             continue
40 | #         final = get_final_feature(item)
41 | #         train = final.loc[final.type_id.str.len() > 0]
42 | #         train['id'] = train['id'].apply(lambda val: item + 'x'*(6-len(item)) + val[6:])
43 | #         train_list.append(train)
44 | #
45 | #     #Stb part
46 | #     stb = pd.read_csv('./input/zip/78_app_desc.dat', sep='\t', header=None)
47 | #     stb.columns = ['id', 'desc_name']
48 | #     tmp = get_train_ph2_index()
49 | #     stb = stb.loc[stb['id'].isin(tmp['id'])]  # .shape
50 | #     stb.head()
51 | #     stb.id = 'stbxxx' + stb.id.str[6:]
52 | #     train_list.append(stb)
53 | #     #Stb_end
54 | #
55 | #     train = pd.concat(train_list)
56 | #     file = f'./input/zip/app_desc.dat'
57 | #
58 | #     train.loc[:, ['id', 'desc_name']].to_csv(file, sep='\t', header=None, index=None)
59 | #     print(f'save to {file} ')
60 | 
61 | """
62 | python spider/gen_file.py
63 | """
64 | 


--------------------------------------------------------------------------------
/tip.md:
--------------------------------------------------------------------------------
 1 | 1)LDA
 2 | 2)分解 tag, find in desc
 3 | 3)辅助特征, 当前相似的词在文中出现的次数
 4 | 4)tf-idf (基于type_name)
 5 | 5)include(total, partial)
 6 | 6)app type_name,分大组
 7 | 7)从desc 过滤相似词
 8 | 8)n-gram 去除?
 9 | 9)手工tocken
10 | 10)解释性
11 | 11)倒序挑选字符串
12 | 
13 | 99998 app_desc.dat
14 | 29999 apptype_train.dat
15 | 
16 | 1134  records have multiply type_id
17 | 
18 | 130097 total
19 | 
20 | 
21 | 异常app:
22 | 1A23C73F4F3E892E2A9DF3C338B80313   -102
23 | D675211C835694A9F096B3AD3C8A9F79   -102
24 | 59913551A1752422F3B191E0C353309D   -102
25 | 6FFCF1564CFA7547DEEEB5DDCC83A24B   -102
26 | E2CC4670C695BFD41FC4ABFDE95C7B36   -102
27 | 0C8C840A534F32D8A608F50D67663E83   -102
28 | D396674F43367C4FDEF82CDA78756D4F   -102
29 | 1F0AF6FA7424660692173FF4134903CB   -102
30 | 
31 | #Manual handle
32 | egrep dkplugin  input/0823/*
33 | 
34 | 
35 | no case for 140208
36 | 
37 | grep 'com.android.iconnect' ./input/0823/*
38 | 
39 | 唐小僧
40 | 
41 | WXEntryActivity
42 | 
43 | 辽宁和教育
44 | 
45 | leak between test and train
46 | 
47 | https://android.myapp.com/myapp/detail.htm?apkName=com.kiees.android
48 | vs
49 | https://android.myapp.com/myapp/detail.htm?apkName=com.kiess
50 | 
51 | 
52 | grep 百度手机助手 ./input/0823/*
53 | 
54 | name first place
55 | 
56 | name partially match
57 | 
58 | 融合,交叉数据源, 去除空格,大小写 一半a, 一半b
59 | 
60 | 
61 | 
62 |  #ffababab       com.hisense.uifac
63 |  FACTORY MENU    com.hisense.uifac
64 |  工厂菜单        com.hisense.uifac
65 | 
66 | 
67 | aiqianjin.jiea
68 | 
69 | WXEntryActivity
70 | 
71 | 开心躲猫猫
72 | 当妈模拟器
73 | 


--------------------------------------------------------------------------------
/zhtools/langconv.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | from copy import deepcopy
  5 | import re
  6 | 
  7 | try:
  8 |     import psyco
  9 |     psyco.full()
 10 | except:
 11 |     pass
 12 | 
 13 | try:
 14 |     from zh_wiki import zh2Hant, zh2Hans
 15 | except ImportError:
 16 |     from zhtools.zh_wiki import zh2Hant, zh2Hans
 17 | 
 18 | import sys
 19 | py3k = sys.version_info >= (3, 0, 0)
 20 | 
 21 | if py3k:
 22 |     UEMPTY = ''
 23 | else:
 24 |     _zh2Hant, _zh2Hans = {}, {}
 25 |     for old, new in ((zh2Hant, _zh2Hant), (zh2Hans, _zh2Hans)):
 26 |         for k, v in old.items():
 27 |             new[k.decode('utf8')] = v.decode('utf8')
 28 |     zh2Hant = _zh2Hant
 29 |     zh2Hans = _zh2Hans
 30 |     UEMPTY = ''.decode('utf8')
 31 | 
 32 | # states
 33 | (START, END, FAIL, WAIT_TAIL) = list(range(4))
 34 | # conditions
 35 | (TAIL, ERROR, MATCHED_SWITCH, UNMATCHED_SWITCH, CONNECTOR) = list(range(5))
 36 | 
 37 | MAPS = {}
 38 | 
 39 | class Node(object):
 40 |     def __init__(self, from_word, to_word=None, is_tail=True,
 41 |             have_child=False):
 42 |         self.from_word = from_word
 43 |         if to_word is None:
 44 |             self.to_word = from_word
 45 |             self.data = (is_tail, have_child, from_word)
 46 |             self.is_original = True
 47 |         else:
 48 |             self.to_word = to_word or from_word
 49 |             self.data = (is_tail, have_child, to_word)
 50 |             self.is_original = False
 51 |         self.is_tail = is_tail
 52 |         self.have_child = have_child
 53 | 
 54 |     def is_original_long_word(self):
 55 |         return self.is_original and len(self.from_word)>1
 56 | 
 57 |     def is_follow(self, chars):
 58 |         return chars != self.from_word[:-1]
 59 | 
 60 |     def __str__(self):
 61 |         return '<Node, %s, %s, %s, %s>' % (repr(self.from_word),
 62 |                 repr(self.to_word), self.is_tail, self.have_child)
 63 | 
 64 |     __repr__ = __str__
 65 | 
 66 | class ConvertMap(object):
 67 |     def __init__(self, name, mapping=None):
 68 |         self.name = name
 69 |         self._map = {}
 70 |         if mapping:
 71 |             self.set_convert_map(mapping)
 72 | 
 73 |     def set_convert_map(self, mapping):
 74 |         convert_map = {}
 75 |         have_child = {}
 76 |         max_key_length = 0
 77 |         for key in sorted(mapping.keys()):
 78 |             if len(key)>1:
 79 |                 for i in range(1, len(key)):
 80 |                     parent_key = key[:i]
 81 |                     have_child[parent_key] = True
 82 |             have_child[key] = False
 83 |             max_key_length = max(max_key_length, len(key))
 84 |         for key in sorted(have_child.keys()):
 85 |             convert_map[key] = (key in mapping, have_child[key],
 86 |                     mapping.get(key, UEMPTY))
 87 |         self._map = convert_map
 88 |         self.max_key_length = max_key_length
 89 | 
 90 |     def __getitem__(self, k):
 91 |         try:
 92 |             is_tail, have_child, to_word  = self._map[k]
 93 |             return Node(k, to_word, is_tail, have_child)
 94 |         except:
 95 |             return Node(k)
 96 | 
 97 |     def __contains__(self, k):
 98 |         return k in self._map
 99 | 
100 |     def __len__(self):
101 |         return len(self._map)
102 | 
103 | class StatesMachineException(Exception): pass
104 | 
105 | class StatesMachine(object):
106 |     def __init__(self):
107 |         self.state = START
108 |         self.final = UEMPTY
109 |         self.len = 0
110 |         self.pool = UEMPTY
111 | 
112 |     def clone(self, pool):
113 |         new = deepcopy(self)
114 |         new.state = WAIT_TAIL
115 |         new.pool = pool
116 |         return new
117 | 
118 |     def feed(self, char, map):
119 |         node = map[self.pool+char]
120 | 
121 |         if node.have_child:
122 |             if node.is_tail:
123 |                 if node.is_original:
124 |                     cond = UNMATCHED_SWITCH
125 |                 else:
126 |                     cond = MATCHED_SWITCH
127 |             else:
128 |                 cond = CONNECTOR
129 |         else:
130 |             if node.is_tail:
131 |                 cond = TAIL
132 |             else:
133 |                 cond = ERROR
134 | 
135 |         new = None
136 |         if cond == ERROR:
137 |             self.state = FAIL
138 |         elif cond == TAIL:
139 |             if self.state == WAIT_TAIL and node.is_original_long_word():
140 |                 self.state = FAIL
141 |             else:
142 |                 self.final += node.to_word
143 |                 self.len += 1
144 |                 self.pool = UEMPTY
145 |                 self.state = END
146 |         elif self.state == START or self.state == WAIT_TAIL:
147 |             if cond == MATCHED_SWITCH:
148 |                 new = self.clone(node.from_word)
149 |                 self.final += node.to_word
150 |                 self.len += 1
151 |                 self.state = END
152 |                 self.pool = UEMPTY
153 |             elif cond == UNMATCHED_SWITCH or cond == CONNECTOR:
154 |                 if self.state == START:
155 |                     new = self.clone(node.from_word)
156 |                     self.final += node.to_word
157 |                     self.len += 1
158 |                     self.state = END
159 |                 else:
160 |                     if node.is_follow(self.pool):
161 |                         self.state = FAIL
162 |                     else:
163 |                         self.pool = node.from_word
164 |         elif self.state == END:
165 |             # END is a new START
166 |             self.state = START
167 |             new = self.feed(char, map)
168 |         elif self.state == FAIL:
169 |             raise StatesMachineException('Translate States Machine '
170 |                     'have error with input data %s' % node)
171 |         return new
172 | 
173 |     def __len__(self):
174 |         return self.len + 1
175 | 
176 |     def __str__(self):
177 |         return '<StatesMachine %s, pool: "%s", state: %s, final: %s>' % (
178 |                 id(self), self.pool, self.state, self.final)
179 |     __repr__ = __str__
180 | 
181 | class Converter(object):
182 |     def __init__(self, to_encoding):
183 |         self.to_encoding = to_encoding
184 |         self.map = MAPS[to_encoding]
185 |         self.start()
186 | 
187 |     def feed(self, char):
188 |         branches = []
189 |         for fsm in self.machines:
190 |             new = fsm.feed(char, self.map)
191 |             if new:
192 |                 branches.append(new)
193 |         if branches:
194 |             self.machines.extend(branches)
195 |         self.machines = [fsm for fsm in self.machines if fsm.state != FAIL]
196 |         all_ok = True
197 |         for fsm in self.machines:
198 |             if fsm.state != END:
199 |                 all_ok = False
200 |         if all_ok:
201 |             self._clean()
202 |         return self.get_result()
203 | 
204 |     def _clean(self):
205 |         if len(self.machines):
206 |             self.machines.sort(key=lambda x: len(x))
207 |             # self.machines.sort(cmp=lambda x,y: cmp(len(x), len(y)))
208 |             self.final += self.machines[0].final
209 |         self.machines = [StatesMachine()]
210 | 
211 |     def start(self):
212 |         self.machines = [StatesMachine()]
213 |         self.final = UEMPTY
214 | 
215 |     def end(self):
216 |         self.machines = [fsm for fsm in self.machines
217 |                 if fsm.state == FAIL or fsm.state == END]
218 |         self._clean()
219 | 
220 |     def convert(self, string):
221 |         self.start()
222 |         for char in string:
223 |             self.feed(char)
224 |         self.end()
225 |         return self.get_result()
226 | 
227 |     def get_result(self):
228 |         return self.final
229 | 
230 | 
231 | def registery(name, mapping):
232 |     global MAPS
233 |     MAPS[name] = ConvertMap(name, mapping)
234 | 
235 | registery('zh-hant', zh2Hant)
236 | registery('zh-hans', zh2Hans)
237 | del zh2Hant, zh2Hans
238 | 
239 | 
240 | def run():
241 |     import sys
242 |     from optparse import OptionParser
243 |     parser = OptionParser()
244 |     parser.add_option('-e', type='string', dest='encoding',
245 |             help='encoding')
246 |     parser.add_option('-f', type='string', dest='file_in',
247 |             help='input file (- for stdin)')
248 |     parser.add_option('-t', type='string', dest='file_out',
249 |             help='output file')
250 |     (options, args) = parser.parse_args()
251 |     if not options.encoding:
252 |         parser.error('encoding must be set')
253 |     if options.file_in:
254 |         if options.file_in == '-':
255 |             file_in = sys.stdin
256 |         else:
257 |             file_in = open(options.file_in)
258 |     else:
259 |         file_in = sys.stdin
260 |     if options.file_out:
261 |         if options.file_out == '-':
262 |             file_out = sys.stdout
263 |         else:
264 |             file_out = open(options.file_out, 'wb')
265 |     else:
266 |         file_out = sys.stdout
267 | 
268 |     c = Converter(options.encoding)
269 |     for line in file_in:
270 |         # print >> file_out, c.convert(line.rstrip('\n').decode(
271 |         file_out.write(c.convert(line.rstrip('\n').decode(
272 |             'utf8')).encode('utf8'))
273 | 
274 | 
275 | if __name__ == '__main__':
276 |     run()
277 | 


--------------------------------------------------------------------------------
/zhtools/test.py:
--------------------------------------------------------------------------------
 1 | 
 2 | def Traditional2Simplified(sentence):
 3 |     '''
 4 |     将sentence中的繁体字转为简体字
 5 |     :param sentence: 待转换的句子
 6 |     :return: 将句子中繁体字转换为简体字之后的句子
 7 |     '''
 8 |     from zhtools.langconv import Converter
 9 |     sentence = Converter('zh-hans').convert(sentence)
10 |     return sentence
11 | 
12 | if __name__=="__main__":
13 |     traditional_sentence = '憂郁的臺灣烏龜,百度地圖'
14 |     simplified_sentence = Traditional2Simplified(traditional_sentence)
15 |     print(simplified_sentence)
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/大数据标签-讯飞.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flyfoxs/xf_tag/ee3123f10ff884e46084c5c336b4fa792ad741c1/大数据标签-讯飞.pptx


--------------------------------------------------------------------------------