├── presentation.pptx
├── content_risk
│   ├── 网页风险-PAI截图-1.png
│   ├── 网页风险-PAI截图-2.png
│   ├── 网页风险-PAI截图-3.png
│   ├── UDF
│   │   ├── readme.txt
│   │   ├── Title.java
│   │   ├── TitleRaw.java
│   │   ├── LinksRaw.java
│   │   ├── LinkShort.java
│   │   ├── Links.java
│   │   ├── Body.java
│   │   └── Helpers.java
│   └── readme.md
└── README.md
/presentation.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frank6696/tianchi-aliyun-security-competition/HEAD/presentation.pptx
--------------------------------------------------------------------------------
/content_risk/网页风险-PAI截图-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frank6696/tianchi-aliyun-security-competition/HEAD/content_risk/网页风险-PAI截图-1.png
--------------------------------------------------------------------------------
/content_risk/网页风险-PAI截图-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frank6696/tianchi-aliyun-security-competition/HEAD/content_risk/网页风险-PAI截图-2.png
--------------------------------------------------------------------------------
/content_risk/网页风险-PAI截图-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frank6696/tianchi-aliyun-security-competition/HEAD/content_risk/网页风险-PAI截图-3.png
--------------------------------------------------------------------------------
/content_risk/UDF/readme.txt:
--------------------------------------------------------------------------------
Mapping between the UDF function names and the Java class names here:

get_title_raw   ~ TitleRaw
get_title       ~ Title
get_body        ~ Body
get_links_raw   ~ LinksRaw
get_links       ~ Links
get_links_short ~ LinkShort
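
Before the SQL in content_risk/readme.md can call these functions, the compiled classes
need to be packaged into a jar and registered in MaxCompute. A rough sketch of the
registration (the resource name content_risk_udf.jar is only a placeholder, not the
actual name used in the project):

    add jar content_risk_udf.jar;
    create function get_title_raw   as 'TitleRaw'  using 'content_risk_udf.jar';
    create function get_title       as 'Title'     using 'content_risk_udf.jar';
    create function get_body        as 'Body'      using 'content_risk_udf.jar';
    create function get_links_raw   as 'LinksRaw'  using 'content_risk_udf.jar';
    create function get_links       as 'Links'     using 'content_risk_udf.jar';
    create function get_links_short as 'LinkShort' using 'content_risk_udf.jar';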
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# tianchi-aliyun-security-competition

The 2nd Alibaba Cloud Security Algorithm Challenge.

The competition description and data are at [第二届阿里云安全算法挑战赛 | 赛题与数据](https://tianchi.aliyun.com/competition/information.htm?raceId=231612).

There are two problems: (1) scan/brute-force interception ("扫描爆破拦截") and (2) risky web page identification ("网页风险识别").
* Problem 1: the approach is described in the defense slides; the code is not included here.
* Problem 2: the approach write-up, PAI model, and UDF code are in the [content_risk](content_risk/) subfolder.
* The defense slides are in [presentation.pptx](presentation.pptx).
--------------------------------------------------------------------------------
/content_risk/UDF/Title.java:
--------------------------------------------------------------------------------
import com.aliyun.odps.udf.UDF;

public final class Title extends UDF {

    // title_raw -> title: apply the shared text normalization.
    public static String get(String raw) {
        return Helpers.process(raw);
    }

    public String evaluate(String s) {
        if (s == null) { return null; }
        String result = "none";
        try {
            result = get(s);
        } catch (Exception e) { }
        return result;
    }

    public static void main(String[] args) { }
}
--------------------------------------------------------------------------------
/content_risk/UDF/TitleRaw.java:
--------------------------------------------------------------------------------
import com.aliyun.odps.udf.UDF;
import org.jsoup.Jsoup;
import org.jsoup.nodes.*;

public final class TitleRaw extends UDF {

    // html -> raw title text.
    public static String get(String html) {
        String result = "none";
        Document doc = Jsoup.parse(html);
        if (!doc.title().isEmpty()) result = doc.title();
        return result;
    }

    public String evaluate(String s) {
        if (s == null) { return null; }
        String result = "none";
        try {
            result = get(s);
        } catch (Exception e) { }
        return result;
    }

    public static void main(String[] args) { }
}
--------------------------------------------------------------------------------
/content_risk/UDF/LinksRaw.java:
--------------------------------------------------------------------------------
import com.aliyun.odps.udf.UDF;
import org.jsoup.Jsoup;
import org.jsoup.nodes.*;
import org.jsoup.select.Elements;

public final class LinksRaw extends UDF {

    // html -> all <a> link texts, joined with '^' separators.
    public static String get(String html) {
        Document doc = Jsoup.parse(html);
        Elements links = doc.body().getElementsByTag("a");
        StringBuilder linkTexts = new StringBuilder();

        for (Element link : links) {
            if (link.text() != null && !link.text().equals("")) {
                linkTexts.append("^").append(link.text());
            }
        }

        String result;
        if (linkTexts.length() > 0) {
            result = linkTexts.toString();
        } else {
            result = "none";
        }
        return result;
    }

    public String evaluate(String s) {
        if (s == null) { return null; }
        String result = "none";
        try {
            result = get(s);
        } catch (Exception e) { }
        return result;
    }
}
--------------------------------------------------------------------------------
/content_risk/UDF/LinkShort.java:
--------------------------------------------------------------------------------
import com.aliyun.odps.udf.UDF;
import java.io.FileReader;
import java.util.Scanner;

public final class LinkShort extends UDF {

    // links_raw -> concatenation of short link texts (<= 5 characters), at most 40 of them.
    public static String get(String linksRaw) {
        String[] links = linksRaw.split("\\^");
        StringBuilder linkTexts = new StringBuilder();

        int count = 0;
        for (String link : links) {
            if (link != null && link.length() <= 5) {
                linkTexts.append(",").append(link);
                count += 1;
            }
            if (count >= 40) break;
        }

        String result;
        if (linkTexts.length() > 0) {
            result = Helpers.process(linkTexts.toString());
        } else {
            result = "none";
        }
        return result;
    }

    public String evaluate(String s) {
        if (s == null) { return null; }
        String result = "none";
        try {
            result = get(s);
        } catch (Exception e) { }
        return result;
    }

    // Local test: read *.html files in the working directory and print their short link texts.
    public static void main(String[] args) {
        String[] filenames = Helpers.getFilenames(100);

        try {
            for (String name : filenames) {
                if (name == null) break;
                System.out.println("\n" + name);
                Scanner in = new Scanner(new FileReader(name));
                StringBuilder sb = new StringBuilder();
                while (in.hasNextLine()) { sb.append(in.nextLine()); }
                in.close();

                String html = sb.toString();
                String linksRaw = LinksRaw.get(html);
                System.out.println("** linkshort ");
                System.out.println(LinkShort.get(linksRaw));
            }
        } catch (Exception e) { System.out.println(e.getMessage()); }
    }

}
--------------------------------------------------------------------------------
/content_risk/UDF/Links.java:
--------------------------------------------------------------------------------
import com.aliyun.odps.udf.UDF;
import java.io.FileReader;
import java.util.Scanner;

public final class Links extends UDF {

    // links_raw -> concatenation of long link texts (> 5 characters), at most 80 of them.
    public static String get(String linksRaw) {
        String[] links = linksRaw.split("\\^");
        StringBuilder linkTexts = new StringBuilder();

        int count = 0;
        for (String link : links) {
            if (link != null && link.length() > 5) {
                linkTexts.append(",").append(link); // comma separator: spaces would be stripped by Helpers.process
                count += 1;
            }
            if (count >= 80) break;
        }

        String result;
        if (linkTexts.length() > 0) {
            result = Helpers.process(linkTexts.toString());
        } else {
            result = "none";
        }
        return result;
    }

    public String evaluate(String s) {
        if (s == null) { return null; }
        String result = "none";
        try {
            result = get(s);
        } catch (Exception e) { }
        return result;
    }

    // Local test: read *.html files in the working directory and print their long link texts.
    public static void main(String[] args) {
        String[] filenames = Helpers.getFilenames(100);

        try {
            for (String name : filenames) {
                if (name == null) break;
                System.out.println("\n" + name);
                Scanner in = new Scanner(new FileReader(name));
                StringBuilder sb = new StringBuilder();
                while (in.hasNextLine()) { sb.append(in.nextLine()); }
                in.close();

                String html = sb.toString();
                String linksRaw = LinksRaw.get(html);
                System.out.println("** link ");
                System.out.println(Links.get(linksRaw));
            }
        } catch (Exception e) { System.out.println(e.getMessage()); }
    }

}
--------------------------------------------------------------------------------
/content_risk/UDF/Body.java:
--------------------------------------------------------------------------------
import com.aliyun.odps.udf.UDF;
import org.jsoup.Jsoup;
import org.jsoup.nodes.*;
import org.jsoup.select.Elements;

import java.io.FileReader;
import java.util.Scanner;

public final class Body extends UDF {

    // html -> body text: the first 2000 characters, excluding link text and
    // the text of elements with no more than 6 characters.
    public static String get(String html) {
        String result = "none";
        Document doc = Jsoup.parse(html);
        // Drop the contents of elements hidden via display:none.
        doc.getElementsByAttributeValueMatching("style", ".*display:none.*").empty();
        Elements es = doc.body().getAllElements();
        StringBuilder sb = new StringBuilder();

        for (Element e : es) {
            String text = e.ownText();
            if (text != null && text.length() > 6 && !e.tagName().equals("a")) {
                sb.append(text);
            }
            if (sb.length() > 2000) {
                sb.setLength(2000);
                break;
            }
        }
        result = sb.toString();
        result = Helpers.process(result);
        return result;
    }

    public String evaluate(String s) {
        if (s == null) { return null; }
        String result = "none";
        try {
            result = get(s);
        } catch (Exception e) { }
        return result;
    }

    // Local test: read *.html files in the working directory and print their body text.
    public static void main(String[] args) {
        String[] filenames = Helpers.getFilenames(100);

        try {
            for (String name : filenames) {
                if (name == null) break;
                System.out.println("\n" + name);
                Scanner in = new Scanner(new FileReader(name));
                StringBuilder sb = new StringBuilder();
                while (in.hasNextLine()) { sb.append(in.nextLine()); }
                in.close();

                String html = sb.toString();
                System.out.println("** body ");
                System.out.println(Body.get(html));
            }
        } catch (Exception e) { System.out.println(e.getMessage()); }
    }
}
--------------------------------------------------------------------------------
/content_risk/UDF/Helpers.java:
--------------------------------------------------------------------------------
import java.io.File;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public final class Helpers {

    // Replace every match of reg in text with subst, case-insensitively.
    public static String replace(String text, String reg, String subst) {
        Pattern p = Pattern.compile(reg, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(text);
        return m.replaceAll(subst);
    }

    // Normalize text: replace URLs, phone numbers, and other digit runs with tokens.
    public static String process(String text) {
        if (text == null || text.equals("")) return "";
        String result = text;

        // URLs and e-mail addresses.
        String reg_url = "(www|http)[^\\s\\u4e00-\\u9fa5]*";
        result = replace(result, reg_url, " aurl ");
        String reg_mail = "\\b[\\w.%-]+@[-.\\w]+\\.[A-Za-z]{2,4}\\b";
        result = replace(result, reg_mail, " amail ");

        // Periods.
        result = result.replaceAll("\\.", "");

        // Whitespace (including full-width spaces); this runs before the digit
        // replacements, so the spaces around the tokens inserted below survive.
        result = result.replaceAll("[\\s\\u3000]+", "");

        // 11- to 7-digit numbers.
        result = replace(result, "1\\d{10}", " aphone ");
        result = replace(result, "\\d{10}", " dten ");
        result = replace(result, "\\d{9}", " dnine ");
        result = replace(result, "\\d{8}", " deight ");
        result = replace(result, "\\d{7}", " dseven ");

        // Mobile numbers written with separators.
        result = replace(result, "1\\d{2}.\\d{4}.\\d{4}", " aphone ");

        // 6- to 1-digit numbers.
        result = replace(result, "\\d{6}", " dsix ");
        result = replace(result, "\\d{5}", " dfive ");
        result = replace(result, "20[01]\\d{1}", " ayear ");
        result = replace(result, "\\d{4}", " dfour ");
        result = replace(result, "\\d{3}", " dthree ");
        result = replace(result, "\\d{2}", " dtwo ");
        result = replace(result, "\\d{1}", " donee ");

        // Times and dates.
        result = result.replaceAll(" ayear [-/] (dtwo|donee) [-/] (dfour|dthree|dtwo) : (dtwo|donee) : (dtwo|donee) ", " atime ");
        result = result.replaceAll(" ayear [-/] (dtwo|donee) [-/] (dfour|dthree|dtwo) : (dtwo|donee) ", " atime ");
        result = result.replaceAll(" ayear [-/] (dtwo|donee) [-/] (dtwo|donee) ", " adate ");

        return result;
    }

    // List up to `limit` *.html filenames in the working directory (used by the local tests).
    public static String[] getFilenames(int limit) {
        File file = new File("./");
        File[] files = file.listFiles();
        String[] filenames = new String[limit];
        if (files == null) return filenames;
        int i = 0;
        for (File f : files) {
            if (f.isFile() && f.getName().endsWith(".html")) {
                filenames[i] = f.getName();
                i += 1;
                if (i >= limit) break;
            }
        }
        return filenames;
    }

    public static void main(String[] args) {
        String text = "TEL: 134a9909a3345, 2011, 2032";
        System.out.println(process(text));
    }
}
--------------------------------------------------------------------------------
/content_risk/readme.md:
--------------------------------------------------------------------------------
### Risky web page identification: approach and PAI model details

team: MJ_3DSUN

#### Overall approach

* Use the season-1 test set (which comes with answers) as the training data.
* Preprocess the HTML, replacing digits, URLs, etc., and extract four text blocks (see the UDFs for details):
  * title
  * body: the first 2000 characters of the page text, excluding link text and the text of HTML elements with no more than 6 characters.
  * link: the concatenation of long link texts (more than 5 characters), at most 80 of them.
  * link_short: the concatenation of short link texts (no more than 5 characters), at most 40 of them.
* For each of the four blocks, take the top-N words by document frequency in the black (risky) samples as features; a feature is 1 if the sample contains the word and 0 otherwise. The vocabulary sizes are 3500, 15000, 15000 and 4000, respectively.
* For title, body and link, additionally take the top-N 2-grams by document frequency in the black samples as 0/1 features. The 2-gram vocabulary sizes are 3500, 15000 and 15000, respectively.
  * PAI does not seem to offer per-sample n-gram frequency counting, so a workaround is used that does not capture 2-grams completely (details below).
* This yields 71000-dimensional sparse features in total, which are fed into a multiclass logistic regression model.
* The white (normal) samples are split into three parts, and the three resulting models vote on the final prediction.


#### Data preprocessing

```sql
-- Pull the season-1 test set with answers (used as training data) and convert risk to a numeric label.
drop table if exists cr_ts1;
create table cr_ts1 as
select id, html,
    (case when t2.risk='normal' then 0
          when t2.risk='fake_card' then 1
          when t2.risk='gambling' then 2
          else 3 end) as label
from odps_tc_257100_f673506e024.adl_tianchi_content_risk_testing_phase1_with_answer t2;

-- Deduplicate the training data.
drop table if exists cr_ts1_norep;
create table cr_ts1_norep as
select min(id) as id, html, min(label) as label
from cr_ts1
group by html;

-- Extract the required text from the HTML.
drop table if exists cr_ts1_raw_features;
create table cr_ts1_raw_features as
select id, label,
    get_title_raw(html) as title_raw,
    get_links_raw(html) as link_raw,
    get_body(html) as body
from cr_ts1_norep;

drop table if exists cr_ts1_text_features;
create table cr_ts1_text_features as
select id, label, body,
    get_title(title_raw) as title,
    get_links(link_raw) as link,
    get_links_short(link_raw) as link_short
from cr_ts1_raw_features;

-- Split white (normal) and black (risky) samples.
drop table if exists cr_ts1_text_features_c0;    -- white samples
create table cr_ts1_text_features_c0 as
select * from cr_ts1_text_features
where label = 0;
drop table if exists cr_ts1_text_features_c123;  -- black samples
create table cr_ts1_text_features_c123 as
select * from cr_ts1_text_features
where label <> 0;

-- Process the season-2 test set in the same way.
drop table if exists cr_test_raw_features;
create table cr_test_raw_features as
select id,
    get_title_raw(html) as title_raw,
    get_links_raw(html) as link_raw,
    get_body(html) as body
from odps_tc_257100_f673506e024.adl_tianchi_content_risk_testing_phase2;

drop table if exists cr_test_text_features;
create table cr_test_text_features as
select id, body,
    get_title(title_raw) as title,
    get_links(link_raw) as link,
    get_links_short(link_raw) as link_short
from cr_test_raw_features;
```
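
To make the feature construction described above concrete, the sketch below expresses the top-N document-frequency vocabulary and the 0/1 membership features in plain SQL for the title field. It is only an illustration of the idea: `cr_ts1_title_words(id, label, word)` and `title_vocab` are hypothetical tables with one row per (document, word) pair, whereas the actual pipeline computes this with the PAI word segmentation, word-frequency, and triple-to-kv nodes described in the next sections.

```sql
-- Illustration only; cr_ts1_title_words and title_vocab are assumed, not part of the pipeline.
-- 1) Vocabulary: the 3500 title words with the highest document frequency among black samples.
drop table if exists title_vocab;
create table title_vocab as
select word, count(distinct id) as doc_freq
from cr_ts1_title_words
where label <> 0
group by word
order by doc_freq desc
limit 3500;

-- 2) 0/1 features: one (id, word, 1) triple per vocabulary word that occurs in a sample;
--    triples of this shape are what later get packed into sparse kv columns.
select distinct a.id, a.word, 1 as count
from cr_ts1_title_words a
join title_vocab b on a.word = b.word;
```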
#### Text feature extraction: PAI screenshots

The PAI workflow for text feature extraction is shown in the screenshots below.

- **The left / middle / right parts of the diagram handle the training white samples, the training black samples, and the test samples, respectively.**
- The source input tables are `cr_ts1_text_features_c0`, `cr_ts1_text_features_c123` and `cr_test_text_features`.
- **Within each part there are four branches, one for each of the title, link, link_short and body text blocks.**

![pai1](网页风险-PAI截图-1.png)


![pai2](网页风险-PAI截图-2.png)

The steps below follow the node names and order in the diagrams and give the necessary SQL code.

#### Text feature extraction: 1-gram part

##### Word segmentation

Node names: Split Word-1,2,3
* Settings: check all recognition options, uncheck all merge options, choose the TAOBAO_CHN tokenizer, and leave everything else unchecked.

##### Word frequency counting

Run word frequency counting separately on the title, link, link_short and body fields.

##### Top-N words by document frequency

Node names: SQL脚本-1,2,3,4

```sql
-- The input has one row per (document, word), so count(count) is the number of
-- documents containing the word, i.e. its document frequency.
select word, count(count) as count_sum from ${t1}
group by word order by count_sum desc limit 3500;
-- take 3500, 15000, 4000, 15000 for title, link, link_short, body respectively
```

The output of this script feeds into an "增加序号列" (append ID column) node.

##### Offsetting the vocabulary IDs per field

Node names: SQL脚本-5,6,7

The goal is to avoid ID collisions when the kv features are merged at the end.

```sql
select append_id + 10000 as append_id, word from ${t1};
-- add 10000, 40000, 50000 for link, link_short, body respectively
```

##### Processing the word-frequency results

Node names: SQL脚本-8,9,11,10 -24,25,26,27 -29,30,31,32

```sql
-- Input 1: the word-frequency output
-- Input 2: the top-N words (output of SQL脚本-1,2,3,4)
select a.id, a.word, 1 as count  -- document-frequency features, so the count is fixed at 1
from ${t1} a
join ${t2} b on a.word = b.word;
```

##### Triple-to-kv conversion

Node names: 三元组转kv-1,2,3,4 -8,9,10,11 -12,13,14,15
* Input 1: the SQL script output from the previous step
* Input 2: the ID-offset vocabularies (i.e. 增加序号列-1 and SQL脚本-5,6,7)

##### Merging the kv features

A SQL script then merges the four kv feature columns. For the training black/white samples:

```sql
-- node name in the diagram: 合并4
select a.id,
    a.key_value as kvtitle,
    b.key_value as kvlink,
    c.key_value as kvlink_short,
    d.key_value as kvbod
from ${t1} a
full outer join ${t2} b on a.id = b.id
full outer join ${t3} c on a.id = c.id
full outer join ${t4} d on a.id = d.id;
```

For the test samples:
```sql
-- SQL脚本-57
select a.id,
    b.key_value as kvtitle,
    c.key_value as kvlink,
    d.key_value as kvlink_short
from ${t1} a
full outer join ${t2} b on a.id = b.id
full outer join ${t3} c on a.id = c.id
full outer join ${t4} d on a.id = d.id;

-- SQL脚本-58
select a.*,
    b.key_value as kvbod
from ${t1} a
full outer join ${t2} b on a.id = b.id;
```

#### Text feature extraction: 2-gram part

##### 2gram-count

2-gram counting is only done for the title, link and body fields.

First use an ngram-count node with maximum n-gram length 2.
* Input 1: the segmentation results (Split Word-1,2,3)
* Input 2: the top-N words (SQL脚本-1,2,3,4)

Then use SQL脚本-13,14,15 to take the most frequent top-N 2-grams for each field:

```sql
select regexp_replace(words, ' ', '') as word from (
    select * from ${t1}
    where ngram = 2 and regexp_count(words, '.*[a-zA-Z,。[:punct:]]+.*') = 0
    order by count desc limit 3500) t;  -- take 3500, 15000, 15000 for title, link, body respectively
```

##### Appending ID columns to the 2-gram vocabularies and offsetting the IDs

Append-ID nodes: 增加序号列-5,6,7

ID offsetting: SQL脚本-16,17,18

```sql
select append_id + 5000 as append_id, word  -- add 5000, 25000, 65000 for title, link, body respectively
from ${t1};
```

##### Word segmentation with the 2-gram vocabulary as a custom dictionary

This is done in three steps, one field at a time, each time using that field's top-N 2-gram vocabulary as the custom dictionary.
* Settings: check all recognition options, uncheck all merge options, choose the TAOBAO_CHN tokenizer, and check "filter pure-English tokens" and "filter punctuation".
* This does not capture all 2-grams; it is only a workaround (see the sketch after this list for what a direct per-document 2-gram count might look like).
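
For reference, if a per-document 2-gram count were available, the same document-frequency features could be produced directly from a positional token table instead of the custom-dictionary workaround. The sketch below is purely hypothetical: `title_tokens(id, pos, word)` and `title_2gram_vocab(word)` are illustrative names, not tables produced by this pipeline.

```sql
-- Hypothetical direct 2-gram features; title_tokens and title_2gram_vocab are assumed tables.
-- title_tokens(id, pos, word): one row per title token together with its position in the document.
-- title_2gram_vocab(word): the top-N 2-gram vocabulary (adjacent words concatenated without a space).
select t.id, t.word, 1 as count
from (
    select a.id, concat(a.word, b.word) as word
    from title_tokens a
    join title_tokens b
        on a.id = b.id and b.pos = a.pos + 1
) t
join title_2gram_vocab v on t.word = v.word
group by t.id, t.word;
```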

##### 2-gram counting

Node names: 词频统计-14,15,13

##### Processing the 2-gram count results

Node names: SQL脚本-19,20,21. 34,35,36. 39,40,41.

```sql
-- Input 1: the 2-gram word-frequency results
-- Input 2: the 2-gram vocabularies (output of SQL脚本-13,14,15)
select a.id, a.word, 1 as count
from ${t1} a
join ${t2} b on a.word = b.word;
```

##### Triple-to-kv conversion

Node names: 三元组转kv-5,6,7. 16,17,18. 19,20,21.
* Input 1: the output of the previous step
* Input 2: the ID-offset 2-gram vocabularies (output of SQL脚本-16,17,18)

##### Merging the 2-gram kv features with the earlier features

Node name: 合并2gram

```sql
-- Input 1: the merged 1-gram columns (output of node 合并4 or SQL脚本-58)
-- Inputs 2,3,4: the previous step's outputs for the three fields
select a.*,
    b.key_value as kvtitle2,
    c.key_value as kvlink2,
    d.key_value as kvbod2
from ${t1} a
left outer join ${t2} b on a.id = b.id
left outer join ${t3} c on a.id = c.id
left outer join ${t4} d on a.id = d.id;
```

For the training data, additionally join in the label column:
```sql
-- Input 1: the output of 合并2gram
-- Input 2: table cr_ts1_text_features
select a.*, b.label
from ${t1} a
join ${t2} b on a.id = b.id;
```

#### Dataset construction, classifier training and prediction

![pai3](网页风险-PAI截图-3.png)

##### Missing value imputation

* Fields to fill: kvlink, kvbod, kvtitle2, kvlink2, kvbod2
* configs: `kvtitle,null,0:0;kvtitle2,null,5000:0;kvlink,null,10000:0;kvlink2,null,25000:0;kvlink_short,null,40000:0;kvbod,null,50000:0;kvbod2,null,65000:0`

##### Dataset construction

* Split the training white samples into three equal parts.
* Nodes in the diagram: 拆分-1 with ratio 0.3333, then 拆分-2 with ratio 0.5 on the remainder.
* Merge each third of the white samples with all of the black samples, giving three training datasets. Node names: data1, data2, data3.

##### Multiclass logistic regression and prediction

Sparse input. L1 regularization coefficient = 3.

The predictions of the three models are combined by voting:
```sql
-- Each model's vote (0/1/2/3) is encoded as 1/10/100/1000 and the codes are summed, so every
-- digit of the sum counts the votes for one class. The outer decode maps each vote combination
-- to its majority risk class (with explicit tie-breaking); anything not listed falls back to 0.
select a.id,
    decode(
        decode(a.prediction_result, 0, 1, 1, 10, 2, 100, 3, 1000)
        + decode(b.prediction_result, 0, 1, 1, 10, 2, 100, 3, 1000)
        + decode(c.prediction_result, 0, 1, 1, 10, 2, 100, 3, 1000),
        21, 1,
        30, 1,
        120, 1,
        201, 2,
        210, 2,
        300, 2,
        1110, 2,
        1020, 1,
        1200, 2,
        2001, 3,
        2010, 3,
        2100, 3,
        3000, 3,
        0
    )
    as prediction_result
from ${t1} a
join ${t2} b on a.id = b.id
join ${t3} c on a.id = c.id;
```

Finally, keep classes 1/2/3, convert the label back to a risk string, and write the result into the answer table:

```sql
select id,
    (case when prediction_result=0 then 'normal'
          when prediction_result=1 then 'fake_card'
          when prediction_result=2 then 'gambling'
          when prediction_result=3 then 'sexy' end) as risk
from ${t1}
where prediction_result <> 0;
```

--------------------------------------------------------------------------------