├── presentation.pptx
├── content_risk
│   ├── 网页风险-PAI截图-1.png
│   ├── 网页风险-PAI截图-2.png
│   ├── 网页风险-PAI截图-3.png
│   ├── UDF
│   │   ├── readme.txt
│   │   ├── Title.java
│   │   ├── TitleRaw.java
│   │   ├── LinksRaw.java
│   │   ├── LinkShort.java
│   │   ├── Links.java
│   │   ├── Body.java
│   │   └── Helpers.java
│   └── readme.md
└── README.md
/presentation.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frank6696/tianchi-aliyun-security-competition/HEAD/presentation.pptx
--------------------------------------------------------------------------------
/content_risk/网页风险-PAI截图-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frank6696/tianchi-aliyun-security-competition/HEAD/content_risk/网页风险-PAI截图-1.png
--------------------------------------------------------------------------------
/content_risk/网页风险-PAI截图-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frank6696/tianchi-aliyun-security-competition/HEAD/content_risk/网页风险-PAI截图-2.png
--------------------------------------------------------------------------------
/content_risk/网页风险-PAI截图-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frank6696/tianchi-aliyun-security-competition/HEAD/content_risk/网页风险-PAI截图-3.png
--------------------------------------------------------------------------------
/content_risk/UDF/readme.txt:
--------------------------------------------------------------------------------
Mapping between the UDF function names and the Java class names here:

get_title_raw   ~ TitleRaw
get_title       ~ Title
get_body        ~ Body
get_links_raw   ~ LinksRaw
get_links       ~ Links
get_links_short ~ LinkShort
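
Before the SQL in content_risk/readme.md can call these functions, the compiled classes
need to be packaged into a jar and registered in MaxCompute. A rough sketch of the
registration (the resource name content_risk_udf.jar is only a placeholder, not the
actual name used in the project):

    add jar content_risk_udf.jar;
    create function get_title_raw   as 'TitleRaw'  using 'content_risk_udf.jar';
    create function get_title       as 'Title'     using 'content_risk_udf.jar';
    create function get_body        as 'Body'      using 'content_risk_udf.jar';
    create function get_links_raw   as 'LinksRaw'  using 'content_risk_udf.jar';
    create function get_links       as 'Links'     using 'content_risk_udf.jar';
    create function get_links_short as 'LinkShort' using 'content_risk_udf.jar';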
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# tianchi-aliyun-security-competition

The 2nd Alibaba Cloud Security Algorithm Challenge.

The competition description and data are at [第二届阿里云安全算法挑战赛 | 赛题与数据](https://tianchi.aliyun.com/competition/information.htm?raceId=231612).

There are two problems: (1) scan/brute-force interception ("扫描爆破拦截") and (2) risky web page identification ("网页风险识别").
* Problem 1: the approach is described in the defense slides; the code is not included here.
* Problem 2: the approach write-up, PAI model, and UDF code are in the [content_risk](content_risk/) subfolder.
* The defense slides are in [presentation.pptx](presentation.pptx).
--------------------------------------------------------------------------------
/content_risk/UDF/Title.java:
--------------------------------------------------------------------------------
import com.aliyun.odps.udf.UDF;

public final class Title extends UDF {

    // title_raw -> title: apply the shared text normalization.
    public static String get(String raw) {
        return Helpers.process(raw);
    }

    public String evaluate(String s) {
        if (s == null) { return null; }
        String result = "none";
        try {
            result = get(s);
        } catch (Exception e) { }
        return result;
    }

    public static void main(String[] args) { }
}
--------------------------------------------------------------------------------
/content_risk/UDF/TitleRaw.java:
--------------------------------------------------------------------------------
import com.aliyun.odps.udf.UDF;
import org.jsoup.Jsoup;
import org.jsoup.nodes.*;

public final class TitleRaw extends UDF {

    // html -> raw title text.
    public static String get(String html) {
        String result = "none";
        Document doc = Jsoup.parse(html);
        if (!doc.title().isEmpty()) result = doc.title();
        return result;
    }

    public String evaluate(String s) {
        if (s == null) { return null; }
        String result = "none";
        try {
            result = get(s);
        } catch (Exception e) { }
        return result;
    }

    public static void main(String[] args) { }
}
--------------------------------------------------------------------------------
/content_risk/UDF/LinksRaw.java:
--------------------------------------------------------------------------------
import com.aliyun.odps.udf.UDF;
import org.jsoup.Jsoup;
import org.jsoup.nodes.*;
import org.jsoup.select.Elements;

public final class LinksRaw extends UDF {

    // html -> all <a> link texts, joined with '^' separators.
    public static String get(String html) {
        Document doc = Jsoup.parse(html);
        Elements links = doc.body().getElementsByTag("a");
        StringBuilder linkTexts = new StringBuilder();

        for (Element link : links) {
            if (link.text() != null && !link.text().equals("")) {
                linkTexts.append("^").append(link.text());
            }
        }

        String result;
        if (linkTexts.length() > 0) {
            result = linkTexts.toString();
        } else {
            result = "none";
        }
        return result;
    }

    public String evaluate(String s) {
        if (s == null) { return null; }
        String result = "none";
        try {
            result = get(s);
        } catch (Exception e) { }
        return result;
    }
}
--------------------------------------------------------------------------------
/content_risk/UDF/LinkShort.java:
--------------------------------------------------------------------------------
import com.aliyun.odps.udf.UDF;
import java.io.FileReader;
import java.util.Scanner;

public final class LinkShort extends UDF {

    // links_raw -> concatenation of short link texts (<= 5 characters), at most 40 of them.
    public static String get(String linksRaw) {
        String[] links = linksRaw.split("\\^");
        StringBuilder linkTexts = new StringBuilder();

        int count = 0;
        for (String link : links) {
            if (link != null && link.length() <= 5) {
                linkTexts.append(",").append(link);
                count += 1;
            }
            if (count >= 40) break;
        }

        String result;
        if (linkTexts.length() > 0) {
            result = Helpers.process(linkTexts.toString());
        } else {
            result = "none";
        }
        return result;
    }

    public String evaluate(String s) {
        if (s == null) { return null; }
        String result = "none";
        try {
            result = get(s);
        } catch (Exception e) { }
        return result;
    }

    // Local test: read *.html files in the working directory and print their short link texts.
    public static void main(String[] args) {
        String[] filenames = Helpers.getFilenames(100);

        try {
            for (String name : filenames) {
                if (name == null) break;
                System.out.println("\n" + name);
                Scanner in = new Scanner(new FileReader(name));
                StringBuilder sb = new StringBuilder();
                while (in.hasNextLine()) { sb.append(in.nextLine()); }
                in.close();

                String html = sb.toString();
                String linksRaw = LinksRaw.get(html);
                System.out.println("** linkshort ");
                System.out.println(LinkShort.get(linksRaw));
            }
        } catch (Exception e) { System.out.println(e.getMessage()); }
    }

}
--------------------------------------------------------------------------------
/content_risk/UDF/Links.java:
--------------------------------------------------------------------------------
import com.aliyun.odps.udf.UDF;
import java.io.FileReader;
import java.util.Scanner;

public final class Links extends UDF {

    // links_raw -> concatenation of long link texts (> 5 characters), at most 80 of them.
    public static String get(String linksRaw) {
        String[] links = linksRaw.split("\\^");
        StringBuilder linkTexts = new StringBuilder();

        int count = 0;
        for (String link : links) {
            if (link != null && link.length() > 5) {
                linkTexts.append(",").append(link); // comma separator: spaces would be stripped by Helpers.process
                count += 1;
            }
            if (count >= 80) break;
        }

        String result;
        if (linkTexts.length() > 0) {
            result = Helpers.process(linkTexts.toString());
        } else {
            result = "none";
        }
        return result;
    }

    public String evaluate(String s) {
        if (s == null) { return null; }
        String result = "none";
        try {
            result = get(s);
        } catch (Exception e) { }
        return result;
    }

    // Local test: read *.html files in the working directory and print their long link texts.
    public static void main(String[] args) {
        String[] filenames = Helpers.getFilenames(100);

        try {
            for (String name : filenames) {
                if (name == null) break;
                System.out.println("\n" + name);
                Scanner in = new Scanner(new FileReader(name));
                StringBuilder sb = new StringBuilder();
                while (in.hasNextLine()) { sb.append(in.nextLine()); }
                in.close();

                String html = sb.toString();
                String linksRaw = LinksRaw.get(html);
                System.out.println("** link ");
                System.out.println(Links.get(linksRaw));
            }
        } catch (Exception e) { System.out.println(e.getMessage()); }
    }

}
--------------------------------------------------------------------------------
/content_risk/UDF/Body.java:
--------------------------------------------------------------------------------
import com.aliyun.odps.udf.UDF;
import org.jsoup.Jsoup;
import org.jsoup.nodes.*;
import org.jsoup.select.Elements;

import java.io.FileReader;
import java.util.Scanner;

public final class Body extends UDF {

    // html -> body text: the first 2000 characters, excluding link text and
    // the text of elements with no more than 6 characters.
    public static String get(String html) {
        String result = "none";
        Document doc = Jsoup.parse(html);
        // Drop the contents of elements hidden via display:none.
        doc.getElementsByAttributeValueMatching("style", ".*display:none.*").empty();
        Elements es = doc.body().getAllElements();
        StringBuilder sb = new StringBuilder();

        for (Element e : es) {
            String text = e.ownText();
            if (text != null && text.length() > 6 && !e.tagName().equals("a")) {
                sb.append(text);
            }
            if (sb.length() > 2000) {
                sb.setLength(2000);
                break;
            }
        }
        result = sb.toString();
        result = Helpers.process(result);
        return result;
    }

    public String evaluate(String s) {
        if (s == null) { return null; }
        String result = "none";
        try {
            result = get(s);
        } catch (Exception e) { }
        return result;
    }

    // Local test: read *.html files in the working directory and print their body text.
    public static void main(String[] args) {
        String[] filenames = Helpers.getFilenames(100);

        try {
            for (String name : filenames) {
                if (name == null) break;
                System.out.println("\n" + name);
                Scanner in = new Scanner(new FileReader(name));
                StringBuilder sb = new StringBuilder();
                while (in.hasNextLine()) { sb.append(in.nextLine()); }
                in.close();

                String html = sb.toString();
                System.out.println("** body ");
                System.out.println(Body.get(html));
            }
        } catch (Exception e) { System.out.println(e.getMessage()); }
    }
}
--------------------------------------------------------------------------------
/content_risk/UDF/Helpers.java:
--------------------------------------------------------------------------------
import java.io.File;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public final class Helpers {

    // Replace every match of reg in text with subst, case-insensitively.
    public static String replace(String text, String reg, String subst) {
        Pattern p = Pattern.compile(reg, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(text);
        return m.replaceAll(subst);
    }

    // Normalize text: replace URLs, phone numbers, and other digit runs with tokens.
    public static String process(String text) {
        if (text == null || text.equals("")) return "";
        String result = text;

        // URLs and e-mail addresses.
        String reg_url = "(www|http)[^\\s\\u4e00-\\u9fa5]*";
        result = replace(result, reg_url, " aurl ");
        String reg_mail = "\\b[\\w.%-]+@[-.\\w]+\\.[A-Za-z]{2,4}\\b";
        result = replace(result, reg_mail, " amail ");

        // Periods.
        result = result.replaceAll("\\.", "");

        // Whitespace (including full-width spaces); this runs before the digit
        // replacements, so the spaces around the tokens inserted below survive.
        result = result.replaceAll("[\\s\\u3000]+", "");

        // 11- to 7-digit numbers.
        result = replace(result, "1\\d{10}", " aphone ");
        result = replace(result, "\\d{10}", " dten ");
        result = replace(result, "\\d{9}", " dnine ");
        result = replace(result, "\\d{8}", " deight ");
        result = replace(result, "\\d{7}", " dseven ");

        // Mobile numbers written with separators.
        result = replace(result, "1\\d{2}.\\d{4}.\\d{4}", " aphone ");

        // 6- to 1-digit numbers.
        result = replace(result, "\\d{6}", " dsix ");
        result = replace(result, "\\d{5}", " dfive ");
        result = replace(result, "20[01]\\d{1}", " ayear ");
        result = replace(result, "\\d{4}", " dfour ");
        result = replace(result, "\\d{3}", " dthree ");
        result = replace(result, "\\d{2}", " dtwo ");
        result = replace(result, "\\d{1}", " donee ");

        // Times and dates.
        result = result.replaceAll(" ayear [-/] (dtwo|donee) [-/] (dfour|dthree|dtwo) : (dtwo|donee) : (dtwo|donee) ", " atime ");
        result = result.replaceAll(" ayear [-/] (dtwo|donee) [-/] (dfour|dthree|dtwo) : (dtwo|donee) ", " atime ");
        result = result.replaceAll(" ayear [-/] (dtwo|donee) [-/] (dtwo|donee) ", " adate ");

        return result;
    }

    // List up to `limit` *.html filenames in the working directory (used by the local tests).
    public static String[] getFilenames(int limit) {
        File file = new File("./");
        File[] files = file.listFiles();
        String[] filenames = new String[limit];
        if (files == null) return filenames;
        int i = 0;
        for (File f : files) {
            if (f.isFile() && f.getName().endsWith(".html")) {
                filenames[i] = f.getName();
                i += 1;
                if (i >= limit) break;
            }
        }
        return filenames;
    }

    public static void main(String[] args) {
        String text = "TEL: 134a9909a3345, 2011, 2032";
        System.out.println(process(text));
    }
}
--------------------------------------------------------------------------------
/content_risk/readme.md:
--------------------------------------------------------------------------------
### Risky web page identification: approach and PAI model details

team: MJ_3DSUN

#### Overall approach

* Use the season-1 test set (which comes with answers) as the training data.
* Preprocess the HTML, replacing digits, URLs, etc., and extract four text blocks (see the UDFs for details):
  * title
  * body: the first 2000 characters of the page text, excluding link text and the text of HTML elements with no more than 6 characters.
  * link: the concatenation of long link texts (more than 5 characters), at most 80 of them.
  * link_short: the concatenation of short link texts (no more than 5 characters), at most 40 of them.
* For each of the four blocks, take the top-N words by document frequency in the black (risky) samples as features; a feature is 1 if the sample contains the word and 0 otherwise. The vocabulary sizes are 3500, 15000, 15000 and 4000, respectively.
* For title, body and link, additionally take the top-N 2-grams by document frequency in the black samples as 0/1 features. The 2-gram vocabulary sizes are 3500, 15000 and 15000, respectively.
  * PAI does not seem to offer per-sample n-gram frequency counting, so a workaround is used that does not capture 2-grams completely (details below).
* This yields 71000-dimensional sparse features in total, which are fed into a multiclass logistic regression model.
* The white (normal) samples are split into three parts, and the three resulting models vote on the final prediction.


#### Data preprocessing

```sql
-- Pull the season-1 test set with answers (used as training data) and convert risk to a numeric label.
drop table if exists cr_ts1;
create table cr_ts1 as
select id, html,
    (case when t2.risk='normal' then 0
          when t2.risk='fake_card' then 1
          when t2.risk='gambling' then 2
          else 3 end) as label
from odps_tc_257100_f673506e024.adl_tianchi_content_risk_testing_phase1_with_answer t2;

-- Deduplicate the training data.
drop table if exists cr_ts1_norep;
create table cr_ts1_norep as
select min(id) as id, html, min(label) as label
from cr_ts1
group by html;

-- Extract the required text from the HTML.
drop table if exists cr_ts1_raw_features;
create table cr_ts1_raw_features as
select id, label,
    get_title_raw(html) as title_raw,
    get_links_raw(html) as link_raw,
    get_body(html) as body
from cr_ts1_norep;

drop table if exists cr_ts1_text_features;
create table cr_ts1_text_features as
select id, label, body,
    get_title(title_raw) as title,
    get_links(link_raw) as link,
    get_links_short(link_raw) as link_short
from cr_ts1_raw_features;

-- Split white (normal) and black (risky) samples.
drop table if exists cr_ts1_text_features_c0;    -- white samples
create table cr_ts1_text_features_c0 as
select * from cr_ts1_text_features
where label = 0;
drop table if exists cr_ts1_text_features_c123;  -- black samples
create table cr_ts1_text_features_c123 as
select * from cr_ts1_text_features
where label <> 0;

-- Process the season-2 test set in the same way.
drop table if exists cr_test_raw_features;
create table cr_test_raw_features as
select id,
    get_title_raw(html) as title_raw,
    get_links_raw(html) as link_raw,
    get_body(html) as body
from odps_tc_257100_f673506e024.adl_tianchi_content_risk_testing_phase2;

drop table if exists cr_test_text_features;
create table cr_test_text_features as
select id, body,
    get_title(title_raw) as title,
    get_links(link_raw) as link,
    get_links_short(link_raw) as link_short
from cr_test_raw_features;
```
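
To make the feature construction described above concrete, the sketch below expresses the top-N document-frequency vocabulary and the 0/1 membership features in plain SQL for the title field. It is only an illustration of the idea: `cr_ts1_title_words(id, label, word)` and `title_vocab` are hypothetical tables with one row per (document, word) pair, whereas the actual pipeline computes this with the PAI word segmentation, word-frequency, and triple-to-kv nodes described in the next sections.

```sql
-- Illustration only; cr_ts1_title_words and title_vocab are assumed, not part of the pipeline.
-- 1) Vocabulary: the 3500 title words with the highest document frequency among black samples.
drop table if exists title_vocab;
create table title_vocab as
select word, count(distinct id) as doc_freq
from cr_ts1_title_words
where label <> 0
group by word
order by doc_freq desc
limit 3500;

-- 2) 0/1 features: one (id, word, 1) triple per vocabulary word that occurs in a sample;
--    triples of this shape are what later get packed into sparse kv columns.
select distinct a.id, a.word, 1 as count
from cr_ts1_title_words a
join title_vocab b on a.word = b.word;
```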
#### Text feature extraction: PAI screenshots

The PAI workflow for text feature extraction is shown in the screenshots below.

- **The left / middle / right parts of the diagram handle the training white samples, the training black samples, and the test samples, respectively.**
- The source input tables are `cr_ts1_text_features_c0`, `cr_ts1_text_features_c123` and `cr_test_text_features`.
- **Within each part there are four branches, one for each of the title, link, link_short and body text blocks.**

![pai1](网页风险-PAI截图-1.png)


![pai2](网页风险-PAI截图-2.png)

The steps below follow the node names and order in the diagrams and give the necessary SQL code.

#### Text feature extraction: 1-gram part

##### Word segmentation

Node names: Split Word-1,2,3
* Settings: check all recognition options, uncheck all merge options, choose the TAOBAO_CHN tokenizer, and leave everything else unchecked.

##### Word frequency counting

Run word frequency counting separately on the title, link, link_short and body fields.

##### Top-N words by document frequency

Node names: SQL脚本-1,2,3,4

```sql
-- The input has one row per (document, word), so count(count) is the number of
-- documents containing the word, i.e. its document frequency.
select word, count(count) as count_sum from ${t1}
group by word order by count_sum desc limit 3500;
-- take 3500, 15000, 4000, 15000 for title, link, link_short, body respectively
```

The output of this script feeds into an "增加序号列" (append ID column) node.

##### Offsetting the vocabulary IDs per field

Node names: SQL脚本-5,6,7

The goal is to avoid ID collisions when the kv features are merged at the end.

```sql
select append_id + 10000 as append_id, word from ${t1};
-- add 10000, 40000, 50000 for link, link_short, body respectively
```

##### Processing the word-frequency results

Node names: SQL脚本-8,9,11,10 -24,25,26,27 -29,30,31,32

```sql
-- Input 1: the word-frequency output
-- Input 2: the top-N words (output of SQL脚本-1,2,3,4)
select a.id, a.word, 1 as count  -- document-frequency features, so the count is fixed at 1
from ${t1} a
join ${t2} b on a.word = b.word;
```

##### Triple-to-kv conversion

Node names: 三元组转kv-1,2,3,4 -8,9,10,11 -12,13,14,15
* Input 1: the SQL script output from the previous step
* Input 2: the ID-offset vocabularies (i.e. 增加序号列-1 and SQL脚本-5,6,7)

##### Merging the kv features

A SQL script then merges the four kv feature columns. For the training black/white samples:

```sql
-- node name in the diagram: 合并4
select a.id,
    a.key_value as kvtitle,
    b.key_value as kvlink,
    c.key_value as kvlink_short,
    d.key_value as kvbod
from ${t1} a
full outer join ${t2} b on a.id = b.id
full outer join ${t3} c on a.id = c.id
full outer join ${t4} d on a.id = d.id;
```

For the test samples:
```sql
-- SQL脚本-57
select a.id,
    b.key_value as kvtitle,
    c.key_value as kvlink,
    d.key_value as kvlink_short
from ${t1} a
full outer join ${t2} b on a.id = b.id
full outer join ${t3} c on a.id = c.id
full outer join ${t4} d on a.id = d.id;

-- SQL脚本-58
select a.*,
    b.key_value as kvbod
from ${t1} a
full outer join ${t2} b on a.id = b.id;
```

#### Text feature extraction: 2-gram part

##### 2gram-count

2-gram counting is only done for the title, link and body fields.

First use an ngram-count node with maximum n-gram length 2.
* Input 1: the segmentation results (Split Word-1,2,3)
* Input 2: the top-N words (SQL脚本-1,2,3,4)

Then use SQL脚本-13,14,15 to take the most frequent top-N 2-grams for each field:

```sql
select regexp_replace(words, ' ', '') as word from (
    select * from ${t1}
    where ngram = 2 and regexp_count(words, '.*[a-zA-Z,。[:punct:]]+.*') = 0
    order by count desc limit 3500) t;  -- take 3500, 15000, 15000 for title, link, body respectively
```

##### Appending ID columns to the 2-gram vocabularies and offsetting the IDs

Append-ID nodes: 增加序号列-5,6,7

ID offsetting: SQL脚本-16,17,18

```sql
select append_id + 5000 as append_id, word  -- add 5000, 25000, 65000 for title, link, body respectively
from ${t1};
```

##### Word segmentation with the 2-gram vocabulary as a custom dictionary

This is done in three steps, one field at a time, each time using that field's top-N 2-gram vocabulary as the custom dictionary.
* Settings: check all recognition options, uncheck all merge options, choose the TAOBAO_CHN tokenizer, and check "filter pure-English tokens" and "filter punctuation".
* This does not capture all 2-grams; it is only a workaround (see the sketch after this list for what a direct per-document 2-gram count might look like).
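
For reference, if a per-document 2-gram count were available, the same document-frequency features could be produced directly from a positional token table instead of the custom-dictionary workaround. The sketch below is purely hypothetical: `title_tokens(id, pos, word)` and `title_2gram_vocab(word)` are illustrative names, not tables produced by this pipeline.

```sql
-- Hypothetical direct 2-gram features; title_tokens and title_2gram_vocab are assumed tables.
-- title_tokens(id, pos, word): one row per title token together with its position in the document.
-- title_2gram_vocab(word): the top-N 2-gram vocabulary (adjacent words concatenated without a space).
select t.id, t.word, 1 as count
from (
    select a.id, concat(a.word, b.word) as word
    from title_tokens a
    join title_tokens b
        on a.id = b.id and b.pos = a.pos + 1
) t
join title_2gram_vocab v on t.word = v.word
group by t.id, t.word;
```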

##### 2-gram counting

Node names: 词频统计-14,15,13

##### Processing the 2-gram count results

Node names: SQL脚本-19,20,21. 34,35,36. 39,40,41.

```sql
-- Input 1: the 2-gram word-frequency results
-- Input 2: the 2-gram vocabularies (output of SQL脚本-13,14,15)
select a.id, a.word, 1 as count
from ${t1} a
join ${t2} b on a.word = b.word;
```

##### Triple-to-kv conversion

Node names: 三元组转kv-5,6,7. 16,17,18. 19,20,21.
* Input 1: the output of the previous step
* Input 2: the ID-offset 2-gram vocabularies (output of SQL脚本-16,17,18)

##### Merging the 2-gram kv features with the earlier features

Node name: 合并2gram

```sql
-- Input 1: the merged 1-gram columns (output of node 合并4 or SQL脚本-58)
-- Inputs 2,3,4: the previous step's outputs for the three fields
select a.*,
    b.key_value as kvtitle2,
    c.key_value as kvlink2,
    d.key_value as kvbod2
from ${t1} a
left outer join ${t2} b on a.id = b.id
left outer join ${t3} c on a.id = c.id
left outer join ${t4} d on a.id = d.id;
```

For the training data, additionally join in the label column:
```sql
-- Input 1: the output of 合并2gram
-- Input 2: table cr_ts1_text_features
select a.*, b.label
from ${t1} a
join ${t2} b on a.id = b.id;
```

#### Dataset construction, classifier training and prediction

![pai3](网页风险-PAI截图-3.png)

##### Missing value imputation

* Fields to fill: kvlink, kvbod, kvtitle2, kvlink2, kvbod2
* configs: `kvtitle,null,0:0;kvtitle2,null,5000:0;kvlink,null,10000:0;kvlink2,null,25000:0;kvlink_short,null,40000:0;kvbod,null,50000:0;kvbod2,null,65000:0`

##### Dataset construction

* Split the training white samples into three equal parts.
* Nodes in the diagram: 拆分-1 with ratio 0.3333, then 拆分-2 with ratio 0.5 on the remainder.
* Merge each third of the white samples with all of the black samples, giving three training datasets. Node names: data1, data2, data3.

##### Multiclass logistic regression and prediction

Sparse input. L1 regularization coefficient = 3.

The predictions of the three models are combined by voting:
```sql
-- Each model's vote (0/1/2/3) is encoded as 1/10/100/1000 and the codes are summed, so every
-- digit of the sum counts the votes for one class. The outer decode maps each vote combination
-- to its majority risk class (with explicit tie-breaking); anything not listed falls back to 0.
select a.id,
    decode(
        decode(a.prediction_result, 0, 1, 1, 10, 2, 100, 3, 1000)
        + decode(b.prediction_result, 0, 1, 1, 10, 2, 100, 3, 1000)
        + decode(c.prediction_result, 0, 1, 1, 10, 2, 100, 3, 1000),
        21, 1,
        30, 1,
        120, 1,
        201, 2,
        210, 2,
        300, 2,
        1110, 2,
        1020, 1,
        1200, 2,
        2001, 3,
        2010, 3,
        2100, 3,
        3000, 3,
        0
    )
    as prediction_result
from ${t1} a
join ${t2} b on a.id = b.id
join ${t3} c on a.id = c.id;
```

Finally, keep classes 1/2/3, convert the label back to a risk string, and write the result into the answer table:

```sql
select id,
    (case when prediction_result=0 then 'normal'
          when prediction_result=1 then 'fake_card'
          when prediction_result=2 then 'gambling'
          when prediction_result=3 then 'sexy' end) as risk
from ${t1}
where prediction_result <> 0;
```

--------------------------------------------------------------------------------