├── README.md
├── data
    └── hanzi.txt
├── logs
    ├── 20200606215836.log
    ├── 20200606215839.log
    ├── 20200606215842.log
    ├── 20200606215845.log
    └── 20200606215846.log
├── pom.xml
├── src
    └── main
    │   ├── java
    │       └── LogGenerator.java
    │   ├── resources
    │       └── log4j.properties
    │   └── scala
    │       ├── project1_logAnalysis
    │           ├── LogAnalysis.scala
    │           └── db.sql
    │       └── project2_wordcounts
    │           ├── TestMain.scala
    │           ├── db.sql
    │           ├── jiebaService
    │               ├── __pycache__
    │               │   └── jieba.cpython-36.pyc
    │               └── jieba.py
    │           ├── streamingWC
    │               ├── dao
    │               │   └── MysqlPool.scala
    │               ├── service
    │               │   ├── MysqlService.scala
    │               │   └── SegmentService.scala
    │               └── utils
    │               │   ├── BroadcastWrapper.scala
    │               │   ├── Conf.scala
    │               │   └── TimeParse.scala
    │           └── wordSeqGenerator
    │               ├── ConsumerTest.scala
    │               └── WordSeqProducer.scala
├── streaming74projects.iml
└── target
    └── classes
        ├── LogGenerator.class
        ├── project1_logAnalysis
            ├── LogAnalysis$$anonfun$main$1$$typecreator3$1.class
            ├── LogAnalysis$$anonfun$main$1.class
            ├── LogAnalysis$.class
            ├── LogAnalysis.class
            ├── Record$.class
            ├── Record.class
            └── db.sql
        └── project2_wordcounts
            ├── jiebaService
                └── jieba.py
            ├── streamingWC
                ├── dao
                │   ├── MysqlManager$.class
                │   ├── MysqlManager.class
                │   ├── MysqlPool$$anonfun$1.class
                │   ├── MysqlPool$$anonfun$2.class
                │   ├── MysqlPool$$anonfun$3.class
                │   └── MysqlPool.class
                └── utils
                │   ├── BroadcastWrapper$.class
                │   ├── BroadcastWrapper.class
                │   ├── Conf$.class
                │   ├── Conf.class
                │   ├── TimeParse$.class
                │   └── TimeParse.class
            └── wordSeqGenerator
                ├── ConsumerTest$$anon$1$$anonfun$run$1.class
                ├── ConsumerTest$$anon$1.class
                ├── ConsumerTest$.class
                ├── ConsumerTest$delayedInit$body.class
                ├── ConsumerTest.class
                ├── WordSeqProducer$$anonfun$1$$anonfun$apply$1.class
                ├── WordSeqProducer$$anonfun$1.class
                ├── WordSeqProducer$.class
                ├── WordSeqProducer$delayedInit$body.class
                └── WordSeqProducer.class


/README.md:
--------------------------------------------------------------------------------
1 | # spark streaming实时流运算
2 | sparkStreaming项目，1.日志分析系统 2. 舆情管控系统之实时词频统计处理子系统（包括中文分词服务器）3. 网站用户行为统计系统（ 只统计用户行为，建模预测后期实现） 4. 网站安全实时监控报警系统
3 | 


--------------------------------------------------------------------------------
/data/hanzi.txt:
--------------------------------------------------------------------------------
1 | 啊阿埃挨哎唉哀皑癌蔼矮艾碍爱隘鞍氨安俺按暗岸胺案肮昂盎凹敖熬翱袄傲奥懊澳芭捌扒叭吧笆八疤巴拔跋靶把耙坝霸罢爸白柏百摆佰败拜稗斑班搬扳般颁板版扮拌伴瓣半办绊邦帮梆榜膀绑棒磅蚌镑傍谤苞胞包褒剥薄雹保堡饱宝抱报暴豹鲍爆杯碑悲卑北辈背贝钡倍狈备惫焙被奔苯本笨崩绷甭泵蹦迸逼鼻比鄙笔彼碧蓖蔽毕毙毖币庇痹闭敝弊必辟壁臂避陛鞭边编贬扁便变卞辨辩辫遍标彪膘表鳖憋别瘪彬斌濒滨宾摈兵冰柄丙秉饼炳病并玻菠播拨钵波博勃搏铂箔伯帛舶脖膊渤泊驳捕卜哺补埠不布步簿部怖擦猜裁材才财睬踩采彩菜蔡餐参蚕残惭惨灿苍舱仓沧藏操糙槽曹草厕策侧册测层蹭插叉茬茶查碴搽察岔差诧拆柴豺搀掺蝉馋谗缠铲产阐颤昌猖场尝常长偿肠厂敞畅唱倡超抄钞朝嘲潮巢吵炒车扯撤掣彻澈郴臣辰尘晨忱沉陈趁衬撑称城橙成呈乘程惩澄诚承逞骋秤吃痴持匙池迟弛驰耻齿侈尺赤翅斥炽充冲虫崇宠抽酬畴踌稠愁筹仇绸瞅丑臭初出橱厨躇锄雏滁除楚础储矗搐触处揣川穿椽传船喘串疮窗幢床闯创吹炊捶锤垂春椿醇唇淳纯蠢戳绰疵茨磁雌辞慈瓷词此刺赐次聪葱囱匆从丛凑粗醋簇促蹿篡窜摧崔催脆瘁粹淬翠村存寸磋撮搓措挫错搭达答瘩打大呆歹傣戴带殆代贷袋待逮怠耽担丹单郸掸胆旦氮但惮淡诞弹蛋当挡党荡档刀捣蹈倒岛祷导到稻悼道盗德得的蹬灯登等瞪凳邓堤低滴迪敌笛狄涤翟嫡抵底地蒂第帝弟递缔颠掂滇碘点典靛垫电佃甸店惦奠淀殿碉叼雕凋刁掉吊钓调跌爹碟蝶迭谍叠丁盯叮钉顶鼎锭定订丢东冬董懂动栋侗恫冻洞兜抖斗陡豆逗痘都督毒犊独读堵睹赌杜镀肚度渡妒端短锻段断缎堆兑队对墩吨蹲敦顿囤钝盾遁掇哆多夺垛躲朵跺舵剁惰堕蛾峨鹅俄额讹娥恶厄扼遏鄂饿恩而儿耳尔饵洱二贰发罚筏伐乏阀法珐藩帆番翻樊矾钒繁凡烦反返范贩犯饭泛坊芳方肪房防妨仿访纺放菲非啡飞肥匪诽吠肺废沸费芬酚吩氛分纷坟焚汾粉奋份忿愤粪丰封枫蜂峰锋风疯烽逢冯缝讽奉凤佛否夫敷肤孵扶拂辐幅氟符伏俘服浮涪福袱弗甫抚辅俯釜斧脯腑府腐赴副覆赋复傅付阜父腹负富讣附妇缚咐噶嘎该改概钙盖溉干甘杆柑竿肝赶感秆敢赣冈刚钢缸肛纲岗港杠篙皋高膏羔糕搞镐稿告哥歌搁戈鸽胳疙割革葛格蛤阁隔铬个各给根跟耕更庚羹埂耿梗工攻功恭龚供躬公宫弓巩汞拱贡共钩勾沟苟狗垢构购够辜菇咕箍估沽孤姑鼓古蛊骨谷股故顾固雇刮瓜剐寡挂褂乖拐怪棺关官冠观管馆罐惯灌贯光广逛瑰规圭硅归龟闺轨鬼诡癸桂柜跪贵刽辊滚棍锅郭国果裹过哈骸孩海氦亥害骇酣憨邯韩含涵寒函喊罕翰撼捍旱憾悍焊汗汉夯杭航壕嚎豪毫郝好耗号浩呵喝荷菏核禾和何合盒貉阂河涸赫褐鹤贺嘿黑痕很狠恨哼亨横衡恒轰哄烘虹鸿洪宏弘红喉侯猴吼厚候后呼乎忽瑚壶葫胡蝴狐糊湖弧虎唬护互沪户花哗华猾滑画划化话槐徊怀淮坏欢环桓还缓换患唤痪豢焕涣宦幻荒慌黄磺蝗簧皇凰惶煌晃幌恍谎灰挥辉徽恢蛔回毁悔慧卉惠晦贿秽会烩汇讳诲绘荤昏婚魂浑混豁活伙火获或惑霍货祸击圾基机畸稽积箕肌饥迹激讥鸡姬绩缉吉极棘辑籍集及急疾汲即嫉级挤几脊己蓟技冀季伎祭剂悸济寄寂计记既忌际妓继纪嘉枷夹佳家加荚颊贾甲钾假稼价架驾嫁歼监坚尖笺间煎兼肩艰奸缄茧检柬碱碱拣捡简俭剪减荐槛鉴践贱见键箭件健舰剑饯渐溅涧建僵姜将浆江疆蒋桨奖讲匠酱降蕉椒礁焦胶交郊浇骄娇嚼搅铰矫侥脚狡角饺缴绞剿教酵轿较叫窖揭接皆秸街阶截劫节桔杰捷睫竭洁结解姐戒藉芥界借介疥诫届巾筋斤金今津襟紧锦仅谨进靳晋禁近烬浸尽劲荆兢茎睛晶鲸京惊精粳经井警景颈静境敬镜径痉靖竟竞净炯窘揪究纠玖韭久灸九酒厩救旧臼舅咎就疚鞠拘狙疽居驹菊局咀矩举沮聚拒据巨具距踞锯俱句惧炬剧捐鹃娟倦眷卷绢撅攫抉掘倔爵觉决诀绝均菌钧军君峻俊竣浚郡骏喀咖卡咯开揩楷凯慨刊堪勘坎砍看康慷糠扛抗亢炕考拷烤靠坷苛柯棵磕颗科壳咳可渴克刻客课肯啃垦恳坑吭空恐孔控抠口扣寇枯哭窟苦酷库裤夸垮挎跨胯块筷侩快宽款匡筐狂框矿眶旷况亏盔岿窥葵奎魁傀馈愧溃坤昆捆困括扩廓阔垃拉喇蜡腊辣啦莱来赖蓝婪栏拦篮阑兰澜谰揽览懒缆烂滥琅榔狼廊郎朗浪捞劳牢老佬姥酪烙涝勒乐雷镭蕾磊累儡垒擂肋类泪棱楞冷厘梨犁黎篱狸离漓理李里鲤礼莉荔吏栗丽厉励砾历利僳例俐痢立粒沥隶力璃哩俩联莲连镰廉怜涟帘敛脸链恋炼练粮凉梁粱良两辆量晾亮谅撩聊僚疗燎寥辽潦了撂镣廖料列裂烈劣猎琳林磷霖临邻鳞淋凛赁吝拎玲菱零龄铃伶羚凌灵陵岭领另令溜琉榴硫馏留刘瘤流柳六龙聋咙笼窿隆垄拢陇楼娄搂篓漏陋芦卢颅庐炉掳卤虏鲁麓碌露路赂鹿潞禄录陆戮驴吕铝侣旅履屡缕虑氯律率滤绿峦挛孪滦卵乱掠略抡轮伦仑沦纶论萝螺罗逻锣箩骡裸落洛骆络妈麻玛码蚂马骂嘛吗埋买麦卖迈脉瞒馒蛮满蔓曼慢漫谩芒茫盲氓忙莽猫茅锚毛矛铆卯茂冒帽貌贸么玫枚梅酶霉煤没眉媒镁每美昧寐妹媚门闷们萌蒙檬盟锰猛梦孟眯醚靡糜迷谜弥米秘觅泌蜜密幂棉眠绵冕免勉娩缅面苗描瞄藐秒渺庙妙蔑灭民抿皿敏悯闽明螟鸣铭名命谬摸摹蘑模膜磨摩魔抹末莫墨默沫漠寞陌谋牟某拇牡亩姆母墓暮幕募慕木目睦牧穆拿哪呐钠那娜纳氖乃奶耐奈南男难囊挠脑恼闹淖呢馁内嫩能妮霓倪泥尼拟你匿腻逆溺蔫拈年碾撵捻念娘酿鸟尿捏聂孽啮镊镍涅您柠狞凝宁拧泞牛扭钮纽脓浓农弄奴努怒女暖虐疟挪懦糯诺哦欧鸥殴藕呕偶沤啪趴爬帕怕琶拍排牌徘湃派攀潘盘磐盼畔判叛乓庞旁耪胖抛咆刨炮袍跑泡呸胚培裴赔陪配佩沛喷盆砰抨烹澎彭蓬棚硼篷膨朋鹏捧碰坯砒霹批披劈琵毗啤脾疲皮匹痞僻屁譬篇偏片骗飘漂瓢票撇瞥拼频贫品聘乒坪苹萍平凭瓶评屏坡泼颇婆破魄迫粕剖扑铺仆莆葡菩蒲埔朴圃普浦谱曝瀑期欺栖戚妻七凄漆柒沏其棋奇歧畦崎脐齐旗祈祁骑起岂乞企启契砌器气迄弃汽泣讫掐洽牵扦钎铅千迁签仟谦乾黔钱钳前潜遣浅谴堑嵌欠歉枪呛腔羌墙蔷强抢橇锹敲悄桥瞧乔侨巧鞘撬翘峭俏窍切茄且怯窃钦侵亲秦琴勤芹擒禽寝沁青轻氢倾卿清擎晴氰情顷请庆琼穷秋丘邱球求囚酋泅趋区蛆曲躯屈驱渠取娶龋趣去圈颧权醛泉全痊拳犬券劝缺炔瘸却鹊榷确雀裙群然燃冉染瓤壤攘嚷让饶扰绕惹热壬仁人忍韧任认刃妊纫扔仍日戎茸蓉荣融熔溶容绒冗揉柔肉茹蠕儒孺如辱乳汝入褥软阮蕊瑞锐闰润若弱撒洒萨腮鳃塞赛三叁伞散桑嗓丧搔骚扫嫂瑟色涩森僧莎砂杀刹沙纱傻啥煞筛晒珊苫杉山删煽衫闪陕擅赡膳善汕扇缮墒伤商赏晌上尚裳梢捎稍烧芍勺韶少哨邵绍奢赊蛇舌舍赦摄射慑涉社设砷申呻伸身深娠绅神沈审婶甚肾慎渗声生甥牲升绳省盛剩胜圣师失狮施湿诗尸虱十石拾时什食蚀实识史矢使屎驶始式示士世柿事拭誓逝势是嗜噬适仕侍释饰氏市恃室视试收手首守寿授售受瘦兽蔬枢梳殊抒输叔舒淑疏书赎孰熟薯暑曙署蜀黍鼠属术述树束戍竖墅庶数漱恕刷耍摔衰甩帅栓拴霜双爽谁水睡税吮瞬顺舜说硕朔烁斯撕嘶思私司丝死肆寺嗣四伺似饲巳松耸怂颂送宋讼诵搜艘擞嗽苏酥俗素速粟僳塑溯宿诉肃酸蒜算虽隋随绥髓碎岁穗遂隧祟孙损笋蓑梭唆缩琐索锁所塌他它她塔獭挞蹋踏胎苔抬台泰酞太态汰坍摊贪瘫滩坛檀痰潭谭谈坦毯袒碳探叹炭汤塘搪堂棠膛唐糖倘躺淌趟烫掏涛滔绦萄桃逃淘陶讨套特藤腾疼誊梯剔踢锑提题蹄啼体替嚏惕涕剃屉天添填田甜恬舔腆挑条迢眺跳贴铁帖厅听烃汀廷停亭庭艇通桐酮瞳同铜彤童桶捅筒统痛偷投头透凸秃突图徒途涂屠土吐兔湍团推颓腿蜕褪退吞屯臀拖托脱鸵陀驮驼椭妥拓唾挖哇蛙洼娃瓦袜歪外豌弯湾玩顽丸烷完碗挽晚皖惋宛婉万腕汪王亡枉网往旺望忘妄威巍微危韦违桅围唯惟为潍维苇萎委伟伪尾纬未蔚味畏胃喂魏位渭谓尉慰卫瘟温蚊文闻纹吻稳紊问嗡翁瓮挝蜗涡窝我斡卧握沃巫呜钨乌污诬屋无芜梧吾吴毋武五捂午舞伍侮坞戊雾晤物勿务悟误昔熙析西硒矽晰嘻吸锡牺稀息希悉膝夕惜熄烯溪汐犀檄袭席习媳喜铣洗系隙戏细瞎虾匣霞辖暇峡侠狭下厦夏吓掀锨先仙鲜纤咸贤衔舷闲涎弦嫌显险现献县腺馅羡宪陷限线相厢镶香箱襄湘乡翔祥详想响享项巷橡像向象萧硝霄削哮嚣销消宵淆晓小孝校肖啸笑效楔些歇蝎鞋协挟携邪斜胁谐写械卸蟹懈泄泻谢屑薪芯锌欣辛新忻心信衅星腥猩惺兴刑型形邢行醒幸杏性姓兄凶胸匈汹雄熊休修羞朽嗅锈秀袖绣墟戌需虚嘘须徐许蓄酗叙旭序畜恤絮婿绪续轩喧宣悬旋玄选癣眩绚靴薛学穴雪血勋熏循旬询寻驯巡殉汛训讯逊迅压押鸦鸭呀丫芽牙蚜崖衙涯雅哑亚讶焉咽阉烟淹盐严研蜒岩延言颜阎炎沿奄掩眼衍演艳堰燕厌砚雁唁彦焰宴谚验殃央鸯秧杨扬佯疡羊洋阳氧仰痒养样漾邀腰妖瑶摇尧遥窑谣姚咬舀药要耀椰噎耶爷野冶也页掖业叶曳腋夜液一壹医揖铱依伊衣颐夷遗移仪胰疑沂宜姨彝椅蚁倚已乙矣以艺抑易邑屹亿役臆逸肄疫亦裔意毅忆义益溢诣议谊译异翼翌绎茵荫因殷音阴姻吟银淫寅饮尹引隐印英樱婴鹰应缨莹萤营荧蝇迎赢盈影颖硬映哟拥佣臃痈庸雍踊蛹咏泳涌永恿勇用幽优悠忧尤由邮铀犹油游酉有友右佑釉诱又幼迂淤于盂榆虞愚舆余俞逾鱼愉渝渔隅予娱雨与屿禹宇语羽玉域芋郁吁遇喻峪御愈欲狱育誉浴寓裕预豫驭鸳渊冤元垣袁原援辕园员圆猿源缘远苑愿怨院曰约越跃钥岳粤月悦阅耘云郧匀陨允运蕴酝晕韵孕匝砸杂栽哉灾宰载再在咱攒暂赞赃脏葬遭糟凿藻枣早澡蚤躁噪造皂灶燥责择则泽贼怎增憎曾赠扎喳渣札轧铡闸眨栅榨咋乍炸诈摘斋宅窄债寨瞻毡詹粘沾盏斩辗崭展蘸栈占战站湛绽樟章彰漳张掌涨杖丈帐账仗胀瘴障招昭找沼赵照罩兆肇召遮折哲蛰辙者锗蔗这浙珍斟真甄砧臻贞针侦枕疹诊震振镇阵蒸挣睁征狰争怔整拯正政帧症郑证芝枝支吱蜘知肢脂汁之织职直植殖执值侄址指止趾只旨纸志挚掷至致置帜峙制智秩稚质炙痔滞治窒中盅忠钟衷终种肿重仲众舟周州洲诌粥轴肘帚咒皱宙昼骤珠株蛛朱猪诸诛逐竹烛煮拄瞩嘱主著柱助蛀贮铸筑住注祝驻抓爪拽专砖转撰赚篆桩庄装妆撞壮状椎锥追赘坠缀谆准捉拙卓桌琢茁酌啄着灼浊兹咨资姿滋淄孜紫仔籽滓子自渍字鬃棕踪宗综总纵邹走奏揍租足卒族祖诅阻组钻纂嘴醉最罪尊遵昨左佐柞做作坐座


--------------------------------------------------------------------------------
/logs/20200606215836.log:
--------------------------------------------------------------------------------
 1 | [info]	main	Date:20200606215836
 2 | [debug]	main	Date:20200606215836
 3 | [error]	calculate	Date:20200606215836
 4 | [info]	saveFile	Date:20200606215836
 5 | [warn]	main	Date:20200606215836
 6 | [info]	main	Date:20200606215836
 7 | [error]	calculate	Date:20200606215836
 8 | [debug]	calculate	Date:20200606215836
 9 | [error]	main	Date:20200606215836
10 | [error]	openFile	Date:20200606215836
11 | 


--------------------------------------------------------------------------------
/logs/20200606215839.log:
--------------------------------------------------------------------------------
 1 | [info]	openFile	Date:20200606215839
 2 | [error]	calculate	Date:20200606215839
 3 | [info]	saveFile	Date:20200606215839
 4 | [info]	saveFile	Date:20200606215839
 5 | [debug]	main	Date:20200606215839
 6 | [warn]	main	Date:20200606215839
 7 | [warn]	openFile	Date:20200606215839
 8 | [warn]	saveFile	Date:20200606215839
 9 | [info]	openFile	Date:20200606215839
10 | [warn]	saveFile	Date:20200606215839
11 | 


--------------------------------------------------------------------------------
/logs/20200606215842.log:
--------------------------------------------------------------------------------
 1 | [debug]	main	Date:20200606215842
 2 | [error]	openFile	Date:20200606215842
 3 | [info]	saveFile	Date:20200606215842
 4 | [warn]	saveFile	Date:20200606215842
 5 | [info]	calculate	Date:20200606215842
 6 | [error]	saveFile	Date:20200606215842
 7 | [error]	openFile	Date:20200606215842
 8 | [error]	main	Date:20200606215842
 9 | [error]	saveFile	Date:20200606215842
10 | [warn]	saveFile	Date:20200606215842
11 | 


--------------------------------------------------------------------------------
/logs/20200606215845.log:
--------------------------------------------------------------------------------
 1 | [error]	main	Date:20200606215845
 2 | [error]	openFile	Date:20200606215845
 3 | [error]	main	Date:20200606215845
 4 | [warn]	calculate	Date:20200606215845
 5 | [warn]	saveFile	Date:20200606215845
 6 | [warn]	calculate	Date:20200606215845
 7 | [debug]	saveFile	Date:20200606215845
 8 | [error]	main	Date:20200606215845
 9 | [info]	openFile	Date:20200606215845
10 | [error]	main	Date:20200606215845
11 | 


--------------------------------------------------------------------------------
/logs/20200606215846.log:
--------------------------------------------------------------------------------
 1 | [debug]	saveFile	Date:20200606215846
 2 | [warn]	main	Date:20200606215846
 3 | [warn]	calculate	Date:20200606215846
 4 | [warn]	calculate	Date:20200606215846
 5 | [info]	openFile	Date:20200606215846
 6 | [debug]	openFile	Date:20200606215846
 7 | [warn]	openFile	Date:20200606215846
 8 | [error]	saveFile	Date:20200606215846
 9 | [warn]	openFile	Date:20200606215846
10 | [debug]	saveFile	Date:20200606215846
11 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
  3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 |     <modelVersion>4.0.0</modelVersion>
  6 | 
  7 |     <groupId>com.yc.spark74</groupId>
  8 |     <artifactId>streaming74projects</artifactId>
  9 |     <version>1.0-SNAPSHOT</version>
 10 |     <properties>
 11 |         <maven.compiler.source>1.8</maven.compiler.source>
 12 |         <maven.compiler.target>1.8</maven.compiler.target>
 13 |         <scala.version>2.11.8</scala.version>
 14 |         <spark.version>2.2.0</spark.version>
 15 |         <hadoop.version>2.6.5</hadoop.version>
 16 | 
 17 | 
 18 |         <encoding>UTF-8</encoding>
 19 |     </properties>
 20 | 
 21 |     <dependencies>
 22 | 
 23 |         <dependency>
 24 |             <groupId>org.apache.spark</groupId>
 25 |             <artifactId>spark-sql_2.11</artifactId>
 26 |             <version>${spark.version}</version>
 27 | 
 28 | 
 29 |         </dependency>
 30 | 
 31 |         <dependency> <!-- Spark Streaming -->
 32 |             <groupId>org.apache.spark</groupId>
 33 |             <artifactId>spark-streaming_2.11</artifactId>
 34 |             <version>${spark.version}</version>
 35 |         </dependency>
 36 | 
 37 |         <dependency>
 38 |             <groupId>org.apache.spark</groupId>
 39 |             <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
 40 |             <version>${spark.version}</version>
 41 |         </dependency>
 42 | 
 43 | 
 44 |         <dependency>
 45 |             <groupId>org.apache.kafka</groupId>
 46 |             <artifactId>kafka_2.11</artifactId>
 47 |             <!--<version>0.10.1.0</version> -->
 48 |             <version>2.2.0</version>
 49 |             <!--
 50 |             <exclusions>
 51 |                 <exclusion>
 52 |                     <artifactId>jmxri</artifactId>
 53 |                     <groupId>com.sun.jmx</groupId>
 54 |                 </exclusion>
 55 |                 <exclusion>
 56 |                     <artifactId>jmxtools</artifactId>
 57 |                     <groupId>com.sun.jdmk</groupId>
 58 |                 </exclusion>
 59 |                 <exclusion>
 60 |                     <artifactId>jms</artifactId>
 61 |                     <groupId>javax.jms</groupId>
 62 |                 </exclusion>
 63 |                 <exclusion>
 64 |                     <artifactId>junit</artifactId>
 65 |                     <groupId>junit</groupId>
 66 |                 </exclusion>
 67 |             </exclusions>
 68 |             -->
 69 |         </dependency>
 70 | 
 71 | 
 72 |         <dependency>
 73 |             <groupId>mysql</groupId>
 74 |             <artifactId>mysql-connector-java</artifactId>
 75 |             <version>8.0.20</version>
 76 |         </dependency>
 77 | 
 78 |         <!-- 导入scala的依赖 -->
 79 |         <dependency>
 80 |             <groupId>org.scala-lang</groupId>
 81 |             <artifactId>scala-library</artifactId>
 82 |             <version>${scala.version}</version>
 83 |         </dependency>
 84 | 
 85 |         <!-- 导入spark的依赖 -->
 86 |         <dependency>
 87 |             <groupId>org.apache.spark</groupId>
 88 |             <artifactId>spark-core_2.11</artifactId>
 89 |             <version>${spark.version}</version>
 90 |         </dependency>
 91 | 
 92 |         <!-- 指定hadoop-client API的版本 -->
 93 |         <dependency>
 94 |             <groupId>org.apache.hadoop</groupId>
 95 |             <artifactId>hadoop-client</artifactId>
 96 |             <version>${hadoop.version}</version>
 97 |         </dependency>
 98 | 
 99 | 
100 |         <!--数据库联接池 -->
101 |         <dependency>
102 |             <groupId>c3p0</groupId>
103 |             <artifactId>c3p0</artifactId>
104 |             <version>0.9.1.2</version>
105 |         </dependency>
106 |         <dependency>
107 |             <groupId>com.alibaba</groupId>
108 |             <artifactId>druid</artifactId>
109 |             <version>1.0.18</version>
110 |         </dependency>
111 | 
112 |         <!-- 分词服务是以微服务提供的，所以引入JSON和http -->
113 |         <dependency>
114 |             <groupId>io.spray</groupId>
115 |             <artifactId>spray-json_2.10</artifactId>
116 |             <version>1.3.2</version>
117 |         </dependency>
118 |         <dependency><!-- Http -->
119 |             <groupId>org.scalaj</groupId>
120 |             <artifactId>scalaj-http_2.10</artifactId>
121 |             <version>2.3.0</version>
122 |         </dependency>
123 | 
124 |         <dependency><!-- Time Parse 时间解析 -->
125 |             <groupId>joda-time</groupId>
126 |             <artifactId>joda-time</artifactId>
127 |             <version>2.9.4</version>
128 |         </dependency>
129 | 
130 | 
131 |     </dependencies>
132 | 
133 |     <build>
134 |         <pluginManagement>
135 |             <plugins>
136 |                 <!-- 编译scala的插件 -->
137 |                 <plugin>
138 |                     <groupId>net.alchim31.maven</groupId>
139 |                     <artifactId>scala-maven-plugin</artifactId>
140 |                     <version>3.2.2</version>
141 |                 </plugin>
142 |                 <!-- 编译java的插件 -->
143 |                 <plugin>
144 |                     <groupId>org.apache.maven.plugins</groupId>
145 |                     <artifactId>maven-compiler-plugin</artifactId>
146 |                     <version>3.5.1</version>
147 |                 </plugin>
148 |             </plugins>
149 |         </pluginManagement>
150 |         <plugins>
151 |             <plugin>
152 |                 <groupId>net.alchim31.maven</groupId>
153 |                 <artifactId>scala-maven-plugin</artifactId>
154 |                 <executions>
155 |                     <execution>
156 |                         <id>scala-compile-first</id>
157 |                         <phase>process-resources</phase>
158 |                         <goals>
159 |                             <goal>add-source</goal>
160 |                             <goal>compile</goal>
161 |                         </goals>
162 |                     </execution>
163 |                     <execution>
164 |                         <id>scala-test-compile</id>
165 |                         <phase>process-test-resources</phase>
166 |                         <goals>
167 |                             <goal>testCompile</goal>
168 |                         </goals>
169 |                     </execution>
170 |                 </executions>
171 |             </plugin>
172 | 
173 |             <plugin>
174 |                 <groupId>org.apache.maven.plugins</groupId>
175 |                 <artifactId>maven-compiler-plugin</artifactId>
176 |                 <executions>
177 |                     <execution>
178 |                         <phase>compile</phase>
179 |                         <goals>
180 |                             <goal>compile</goal>
181 |                         </goals>
182 |                     </execution>
183 |                 </executions>
184 |             </plugin>
185 | 
186 | 
187 |             <!-- 打jar插件 -->
188 |             <plugin>
189 |                 <groupId>org.apache.maven.plugins</groupId>
190 |                 <artifactId>maven-shade-plugin</artifactId>
191 |                 <version>2.4.3</version>
192 |                 <executions>
193 |                     <execution>
194 |                         <phase>package</phase>
195 |                         <goals>
196 |                             <goal>shade</goal>
197 |                         </goals>
198 |                         <configuration>
199 |                             <filters>
200 |                                 <filter>
201 |                                     <artifact>*:*</artifact>
202 |                                     <excludes>
203 |                                         <exclude>META-INF/*.SF</exclude>
204 |                                         <exclude>META-INF/*.DSA</exclude>
205 |                                         <exclude>META-INF/*.RSA</exclude>
206 |                                     </excludes>
207 |                                 </filter>
208 |                             </filters>
209 |                         </configuration>
210 |                     </execution>
211 |                 </executions>
212 |             </plugin>
213 |         </plugins>
214 |     </build>
215 | 
216 | </project>


--------------------------------------------------------------------------------
/src/main/java/LogGenerator.java:
--------------------------------------------------------------------------------
 1 | import java.io.File;
 2 | import java.io.FileOutputStream;
 3 | import java.io.IOException;
 4 | import java.text.DateFormat;
 5 | import java.text.SimpleDateFormat;
 6 | import java.util.Date;
 7 | import java.util.Random;
 8 | 
 9 | public class LogGenerator {
10 | 
11 |     public static final String FILE_PATH = "./logs/";//文件指定存放的路径
12 | 
13 |     public static void main(String[] args) throws IOException {
14 |         FileOutputStream outFile = null;
15 |         DateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");
16 |         Random r = new Random();
17 |         String grades[] = {"[info]", "[warn]", "[error]", "[debug]"};
18 |         String position[] = {"main", "calculate", "saveFile", "openFile"};
19 |         for (int i = 0; i < 5; i++) {
20 |             System.out.println(i);
21 |             String filename = df.format(new Date()) + ".log";
22 |             File file = creatFile(FILE_PATH, filename);
23 |             try {
24 |                 outFile = new FileOutputStream(file);
25 |                 for (int j = 0; j < 10; j++) {
26 |                     // 日志格式:    [级别]\t位置\tDate:时间\n
27 |                     String log = grades[r.nextInt(grades.length)] + "\t" + position[r.nextInt(position.length)] + "\tDate:" + df.format(new Date()) + "\n";
28 |                     outFile.write(log.getBytes());
29 |                 }
30 |                 outFile.flush();
31 |                 Thread.sleep(r.nextInt(2000)+1000);     //   注意生成的时间  ，上面的文件名是到秒  ，下面的  r.nextInt( xx )有可能是豪秒。
32 |             } catch (Exception e) {
33 |                 e.printStackTrace();
34 |             } finally {
35 |                 if (outFile != null) {
36 |                     outFile.close();
37 |                 }
38 |             }
39 |         }
40 |     }
41 | 
42 |     public static File creatFile(String filePath, String fileName) {
43 |         File folder = new File(filePath);
44 |         //文件夹路径不存在
45 |         if (!folder.exists() && !folder.isDirectory()) {
46 |             System.out.println("文件夹路径不存在，创建路径:" + filePath);
47 |             folder.mkdirs();
48 |         }
49 |         // 如果文件不存在就创建
50 |         File file = new File(filePath + fileName);
51 |         if (!file.exists()) {
52 |             System.out.println("文件不存在，创建文件:" + filePath + fileName);
53 |             try {
54 |                 file.createNewFile();
55 |             } catch (IOException e) {
56 |                 e.printStackTrace();
57 |             }
58 |         }
59 |         return file;
60 |     }
61 | }
62 | 


--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
 1 | ### 配置根 ###
 2 | log4j.rootLogger = info,console
 3 | 
 4 | 
 5 | 
 6 | ### 设置输出sql的级别，其中logger后面的内容全部为jar包中所包含的包名 ###
 7 | log4j.logger.org.apache=error
 8 | log4j.logger.java.sql.Connection=error
 9 | log4j.logger.java.sql.Statement=info
10 | log4j.logger.java.sql.PreparedStatement=info
11 | log4j.logger.java.sql.ResultSet=error
12 | 
13 | ### 配置输出到控制台 ###
14 | log4j.appender.console = org.apache.log4j.ConsoleAppender
15 | log4j.appender.console.Target = System.out
16 | log4j.appender.console.layout = org.apache.log4j.PatternLayout
17 | log4j.appender.console.layout.ConversionPattern =  %d{ABSOLUTE} %5p %c:%L - %m%n
18 | 
19 | 


--------------------------------------------------------------------------------
/src/main/scala/project1_logAnalysis/LogAnalysis.scala:
--------------------------------------------------------------------------------
  1 | package project1_logAnalysis
  2 | 
  3 | import java.util.Properties
  4 | 
  5 | import org.apache.log4j.{Level, Logger}
  6 | import org.apache.spark.{HashPartitioner, SparkConf}
  7 | import org.apache.spark.rdd.RDD
  8 | import org.apache.spark.sql.SaveMode
  9 | import org.apache.spark.sql.SparkSession
 10 | import org.apache.spark.sql.types.StringType
 11 | import org.apache.spark.sql.types.StructField
 12 | import org.apache.spark.sql.types.StructType
 13 | import org.apache.spark.streaming.Seconds
 14 | import org.apache.spark.streaming.StreamingContext
 15 | import org.apache.spark.streaming.dstream.DStream
 16 | 
 17 | 
 18 | case class Record(log_level: String, method: String, content: String)
 19 | 
 20 | object LogAnalysis {
 21 |   def main(args: Array[String]): Unit = {
 22 |     Logger.getLogger("org").setLevel(Level.ERROR) //配置日志
 23 |     val sparkConf = new SparkConf()
 24 |       .setAppName("LogAnalysis")
 25 |       .setMaster("local[*]")
 26 |     //因为要用到spark sql , 创建 SparkSession
 27 |     val spark = SparkSession.builder()
 28 |       .appName("LogAnalysis")
 29 |       .config(sparkConf)
 30 |       .getOrCreate()
 31 |     //利用sparkSession创建上下文
 32 |     val sc = spark.sparkContext
 33 |     //建立流式处理上下文  spark Streaming
 34 |     val ssc = new StreamingContext(sc, Seconds(2))
 35 | 
 36 |     ssc.checkpoint("./chpoint")
 37 | 
 38 |     //以上都表明在一个程序中，只能创建一个与spark的联接
 39 | 
 40 |     // Mysql配置
 41 |     val properties = new Properties()
 42 |     properties.setProperty("user", "root")
 43 |     properties.setProperty("password", "a")
 44 | 
 45 |     // 读入日志文件目录下的日志信息流
 46 |     val logStream = ssc.textFileStream("./logs/")
 47 | 
 48 |     // 从DStream中取出每个RDD, 将日志RDD信息流转换为dataframe
 49 |     logStream.foreachRDD((rdd: RDD[String]) => {
 50 |       import spark.implicits._
 51 |       val data = rdd.map(w => {
 52 |         val tokens = w.split("\t")
 53 |         Record(tokens(0), tokens(1), tokens(2))
 54 |       }).toDF()
 55 |       //println(  "本次RDD取到数据条数:"+data.count() )
 56 |       //data.show()
 57 |       //创建视图
 58 |       data.createOrReplaceTempView("alldata")
 59 | 
 60 |       // 条件筛选:只查看  error和 warn信息
 61 |       val logImp = spark.sql("select * from alldata where log_level='[error]'")
 62 |       logImp.show()
 63 |       // 输出到外部Mysql中
 64 |       //      //利用 spark sql将结果保存到外部mysql中       DataFrame.read        DataFrame.write
 65 |       logImp.write.mode(SaveMode.Append)
 66 |         .jdbc("jdbc:mysql://localhost:3306/log_analysis", "important_logs", properties)
 67 |     })
 68 |     //  [级别]\t位置\t信息                                    ( 级别,1)
 69 |     val cached=logStream.map(_.split("\t")).map(   arr=>(arr(0),1)).cache()
 70 |     //有状态的操作
 71 |     val result=cached.updateStateByKey( updateFunc, new HashPartitioner(  ssc.sparkContext.defaultMinPartitions  ),  true )
 72 |     result.print()
 73 | 
 74 |     val r2=cached.reduceByKeyAndWindow( (a:Int,b:Int)=>a+b, Seconds(4)  , Seconds( 2) )
 75 |     //r2.print()
 76 |     printValues(  r2 )
 77 | 
 78 | 
 79 |     //r2.print()
 80 |     ssc.start()
 81 |     ssc.awaitTermination()
 82 |   }
 83 | 
 84 |   val updateFunc= (   iter:Iterator[ (String,Seq[Int] ,  Option[Int] ) ] ) =>{
 85 |     //方案一:当成一个三元组运算
 86 |     // iter.map(    t=> (  t._1,   t._2.sum+t._3.getOrElse(0)  )    )   //  ->  { word：总次数}
 87 |     //方案二: 模式匹配来实现
 88 |     iter.map{  case(x,y,z)=>(  x,  y.sum+z.getOrElse(0)   ) }
 89 |   }
 90 | 
 91 | 
 92 |   //定义一个打印函数，打印RDD中所有的元素
 93 |   def printValues(stream: DStream[(String, Int)]) {     //    DStream -> n个RDD组成  -> 一个RDD由n 条记录组成  -》一条记录由  (String, Int) 组成
 94 |     stream.foreachRDD(foreachFunc) //   不要用foreach()  ->  foreachRDD
 95 |     def foreachFunc = (rdd: RDD[(String, Int)]) => {
 96 |       val array = rdd.collect()   //采集 worker端的结果传到driver端.
 97 |       println("===============window窗口===============")
 98 |       for (res <- array) {
 99 |         println(res)
100 |       }
101 |       println("===============window窗口===============")
102 | 
103 |     }
104 |   }
105 | 
106 | 
107 |   /*
108 |  结果分析:
109 |      批处理时间间隔: 2秒
110 | 窗口长度: 4秒
111 | 滑动时间间隔: 2秒
112 | 
113 | 
114 | 
115 | 
116 | -> 6个Error
117 | state: 6个error
118 | window: 6个error
119 | 
120 | 
121 | ->
122 | state: 6个error
123 | window: 6个error
124 | 
125 | -> 增加3个error
126 | state: 6+3 个error
127 | window:  3个error
128 | 
129 |    */
130 | }
131 | 


--------------------------------------------------------------------------------
/src/main/scala/project1_logAnalysis/db.sql:
--------------------------------------------------------------------------------
 1 | create database log_analysis;
 2 | 
 3 | --需求1: 筛选出所有的 error,利用spark sql 转存数据库 
 4 | --   id       level       method      content
 5 | use log_analysis;
 6 | 
 7 | CREATE TABLE `important_logs` (
 8 |   `id` int(11) NOT NULL AUTO_INCREMENT,
 9 |   `log_level` varchar(255) NOT NULL,
10 |   `method` varchar(255) NOT NULL,
11 |   `content` varchar(500) NOT NULL,
12 |   PRIMARY KEY (`id`)
13 | ) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=utf8;
14 | 
15 | 
16 | --需求二: 累计这个级别 日志出现的总次数,不需要表，只直接输出
17 | 
18 | --需求三: 在以 1秒为批处理时间间隔, 这个级别在过去的3个时间窗口内，每两个 slide intervals的次数，输出.
19 | 


--------------------------------------------------------------------------------
/src/main/scala/project2_wordcounts/TestMain.scala:
--------------------------------------------------------------------------------
 1 | package project2_wordcounts
 2 | 
 3 | import scala.collection.mutable.Map
 4 | import scala.collection.mutable.HashSet
 5 | import org.apache.log4j.{Level, LogManager, Logger}
 6 | import org.apache.spark.SparkConf
 7 | import kafka.serializer.StringDecoder
 8 | import org.apache.kafka.common.serialization.StringDeserializer
 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
10 | import org.apache.spark.streaming._
11 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
12 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils}
13 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
14 | import project2_wordcounts.streamingWC.service.{MysqlService, SegmentService}
15 | import project2_wordcounts.streamingWC.utils.{BroadcastWrapper, Conf}
16 | import spray.json._
17 | 
18 | //主程序类
19 | object TestMain extends Serializable {
20 | 
21 |   @transient lazy val log = LogManager.getRootLogger
22 | 
23 |   def main(args: Array[String]) {
24 |     Logger.getLogger("org").setLevel(Level.ERROR) //配置日志
25 |     val sparkConf = new SparkConf().setAppName("WordFreqConsumer")
26 |       .setMaster(Conf.master)
27 |       //.set("spark.executor.memory", Conf.executorMem)
28 |       //.set("spark.cores.max", Conf.coresMax)
29 |       //.set("spark.local.dir", Conf.localDir)
30 |       .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
31 |       //.set("spark.streaming.kafka.maxRatePerPartition", Conf.perMaxRate) //设定对目标topic每个partition每秒钟拉取的数据条数
32 |     val ssc = new StreamingContext(sparkConf, Seconds(Conf.interval))
33 |     ssc.checkpoint("./chpoint") // window函数和有状态操作一定要设置
34 |     // 创建直联
35 |     // auto.offset.reset 值:
36 |     //          earliest:当各分区下有已提交的offset时，从提交的offset开始消费；无提交的offset时，从头开始消费
37 |     //          latest:当各分区下有已提交的offset时，从提交的offset开始消费；无提交的offset时，消费新产生的该分区下的数据
38 |     //          none: topic各分区都存在已提交的offset时，从offset后开始消费；只要有一个分区不存在已提交的offset，则抛出异常
39 |     val topicsSet = Conf.topics.split(",").toSet
40 |     val kafkaParams = scala.collection.immutable.Map[String, Object]("bootstrap.servers" -> Conf.brokers,
41 |              "key.deserializer" -> classOf[StringDeserializer],
42 |              "value.deserializer" -> classOf[StringDeserializer],
43 |              "auto.offset.reset" -> "latest",
44 |               "group.id" -> Conf.group,
45 |               "enable.auto.commit" -> (true: java.lang.Boolean))
46 |     /*
47 |     kafka本地策略（  LocationStrategies ):
48 |     1. PreferConsistent : 在可用的 executors 上均匀分布分区
49 |     2. PreferBrokers: 如果 executor 与 Kafka 的代理节点在同一台物理机上，使用 PreferBrokers，会更倾向于在该节点上安排 KafkaLeader 对应的分区,以减少数据的网络传输。
50 |     3. PreferFixed： 如果发生分区之间数据负载倾斜，使用 PreferFixed。可以指定分区和主机之间的映射（任何未指定的分区将使用相同的位置）
51 |      */
52 |     //    Subscribe:主题名固定,   SubscribePattern:主题名由正则表示    ,   Assign:固定主题和分区
53 |     val kafkaDirectStream = KafkaUtils.createDirectStream[String, String](
54 |       ssc,
55 |       PreferConsistent,
56 |       Subscribe[String, String](topicsSet, kafkaParams)
57 |     )
58 |     log.warn(s"初始化联接完成***>>>topic:${Conf.topics}   group:${Conf.group} localDir:${Conf.localDir} brokers:${Conf.brokers}")
59 | 
60 |     kafkaDirectStream.cache     ///缓存
61 |     //kafkaDirectStream.print()
62 |     //读取  用户词典库，加载成广播变量
63 |     val words = BroadcastWrapper[(Long, HashSet[String])](ssc, (System.currentTimeMillis, MysqlService.getUserWords))
64 | 
65 |     //经过分词得到新的stream,               _指的是  kafka中的 (k,v)
66 |     //        repartition:重分区
67 |     val segmentedStream = kafkaDirectStream.map(_.value).repartition(10).transform(rdd => {
68 |       if (System.currentTimeMillis - words.value._1 > Conf.updateFreq) {   //   更新频率  300000 //5min
69 |         words.update((System.currentTimeMillis, MysqlService.getUserWords), true)
70 |         log.warn("[BroadcastWrapper] 用户词典中单词更新了 ")
71 |       }
72 |       //   rdd中是消息( 语句)        调用分词服务，并传递  待争分的语句及用户词典
73 |       rdd.flatMap(record => SegmentService.mapSegment(record, words.value._2))
74 |     })
75 | 
76 |     //  ( word, 1 )
77 | 
78 |     //以entity_timestamp_beeword为key,统计本batch内各个key的计数
79 |     val countedStream = segmentedStream.reduceByKey(_ + _)
80 | 
81 |     // countedStream  (单词，总和)
82 |     countedStream.foreachRDD(MysqlService.save(  _  ))
83 | 
84 |     ssc.start()
85 |     ssc.awaitTermination()
86 |   }
87 | }
88 | 


--------------------------------------------------------------------------------
/src/main/scala/project2_wordcounts/db.sql:
--------------------------------------------------------------------------------
 1 | -- ----------------------------
 2 | --  Table structure for `user_words`
 3 | -- ----------------------------
 4 | DROP TABLE IF EXISTS `user_words`;
 5 | CREATE TABLE `user_words` (
 6 |   `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增id',
 7 |   `word` varchar(100) NOT NULL COMMENT '统计关键词',
 8 |   `add_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '添加时间',
 9 |   PRIMARY KEY (`id`)
10 | ) ENGINE=InnoDB AUTO_INCREMENT=27 DEFAULT CHARSET=utf8;
11 | 
12 | -- ----------------------------
13 | --  Records of `user_words`
14 | -- ----------------------------
15 | BEGIN;
16 | INSERT INTO `user_words` VALUES ('1', '实战', '2018-07-31 21:28:26'), ('2', '书籍', '2018-07-31 21:28:32'), ('3', '中国', '2018-07-31 21:28:38'), ('4', '哈哈', '2018-07-31 21:28:44'), ('5', '你好', '2018-07-31 21:28:49'), ('6', '中华人民共和国', '2018-07-31 21:29:03'), ('7', '嘚瑟', '2018-07-31 21:29:16'), ('8', '炮', '2018-07-31 21:30:24'), ('9', '高兴', '2018-07-31 21:30:27'), ('10', '森森', '2018-07-31 21:30:37'), ('11', '穷', '2018-07-31 21:30:44'), ('12', '甜蜜', '2018-07-31 21:31:34'), ('13', '虾', '2018-07-31 21:31:37'), ('14', '狗', '2018-07-31 21:31:47'), ('15', '胆浆', '2018-07-31 21:32:58'), ('16', '氦', '2018-07-31 21:34:02'), ('17', '狼', '2018-07-31 21:34:08'), ('18', '玖', '2018-07-31 21:35:52'), ('19', '洽关', '2018-07-31 21:36:10'), ('20', '悠田', '2018-07-31 21:36:23'), ('21', '僵弛', '2018-07-31 21:36:57'), ('22', '老客', '2018-07-31 21:37:09'), ('23', '王羽', '2018-07-31 21:37:50'), ('24', '酒', '2018-07-31 21:38:21'), ('25', '涨', '2018-07-31 21:38:36'), ('26', '盔', '2018-07-31 21:39:22');
17 | COMMIT;
18 | 
19 | -- ----------------------------
20 | --  Table structure for `word_count_201808`
21 | -- ----------------------------
22 | DROP TABLE IF EXISTS `word_count_201808`;
23 | CREATE TABLE `word_count_201808` (
24 |   `id` int(11) NOT NULL AUTO_INCREMENT,
25 |   `word` varchar(64) NOT NULL,
26 |   `count` int(11) DEFAULT '0',
27 |   `date` date NOT NULL,
28 |   PRIMARY KEY (`id`),
29 |   UNIQUE KEY `word` (`word`,`date`)
30 | ) ENGINE=InnoDB AUTO_INCREMENT=3584 DEFAULT CHARSET=utf8;
31 | 
32 | -- ----------------------------
33 | --  Records of `word_count_201808`
34 | -- ----------------------------
35 | BEGIN;
36 | INSERT INTO `word_count_201808` VALUES ('1', '穷', '22', '2018-08-05'), ('2', '炮', '7', '2018-08-05'), ('3', '酒', '24', '2018-08-05'), ('4', '狗', '44', '2018-08-05'), ('6', '玖', '34', '2018-08-05'), ('7', '虾', '63', '2018-08-05'), ('8', '盔', '32', '2018-08-05'), ('10', '狼', '32', '2018-08-05'), ('11', '涨', '11', '2018-08-05'), ('14', '氦', '71', '2018-08-05'), ('278', '狼', '651', '2018-08-19'), ('279', '穷', '270', '2018-08-19'), ('280', '虾', '688', '2018-08-19'), ('281', '玖', '466', '2018-08-19'), ('286', '盔', '491', '2018-08-19'), ('287', '氦', '968', '2018-08-19'), ('291', '涨', '202', '2018-08-19'), ('293', '炮', '133', '2018-08-19'), ('297', '狗', '504', '2018-08-19'), ('308', '酒', '296', '2018-08-19'), ('555', '甜蜜', '1', '2018-08-19'), ('1770', '哈哈', '1', '2018-08-19'), ('3583', '森森', '1', '2018-08-19');
37 | COMMIT;
38 | 
39 | -- ----------------------------
40 | --  Table structure for `word_count_201811`
41 | -- ----------------------------
42 | DROP TABLE IF EXISTS `word_count_201811`;
43 | CREATE TABLE `word_count_201811` (
44 |   `id` int(11) NOT NULL AUTO_INCREMENT,
45 |   `word` varchar(64) NOT NULL,
46 |   `count` int(11) DEFAULT '0',
47 |   `date` date NOT NULL,
48 |   PRIMARY KEY (`id`),
49 |   UNIQUE KEY `word` (`word`,`date`)
50 | ) ENGINE=InnoDB AUTO_INCREMENT=87 DEFAULT CHARSET=utf8;
51 | 
52 | -- ----------------------------
53 | --  Records of `word_count_201811`
54 | -- ----------------------------
55 | BEGIN;
56 | INSERT INTO `word_count_201811` VALUES ('1', '穷', '10', '2018-11-18'), ('2', '虾', '26', '2018-11-18'), ('3', '氦', '33', '2018-11-18'), ('7', '狼', '24', '2018-11-18'), ('8', '狗', '16', '2018-11-18'), ('9', '盔', '29', '2018-11-18'), ('11', '涨', '7', '2018-11-18'), ('12', '玖', '24', '2018-11-18'), ('14', '酒', '8', '2018-11-18'), ('46', '炮', '6', '2018-11-18'), ('86', '书籍', '1', '2018-11-18');
57 | COMMIT;


--------------------------------------------------------------------------------
/src/main/scala/project2_wordcounts/jiebaService/__pycache__/jieba.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/src/main/scala/project2_wordcounts/jiebaService/__pycache__/jieba.cpython-36.pyc


--------------------------------------------------------------------------------
/src/main/scala/project2_wordcounts/jiebaService/jieba.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | import jieba    #导入包
 4 | cut = jieba.cut
 5 | # bottle 是一个http 的开发工具
 6 | from bottle import route,run
 7 | 
 8 | # sentence :待拆分的字符串    中华人民共和国  ->
 9 | #返回值:  以空格分隔的   字符串
10 | def token(sentence):
11 |     seg_list = list(   cut(sentence)   )
12 |     return " ".join(seg_list)
13 | 
14 | #路由设置
15 | @route('/token/:sentence')
16 | def index(sentence):
17 |     print(  "====",sentence )
18 |     result = token(sentence)
19 |     return "{\"ret\":0, \"msg\":\"OK\", \"terms\":\"%s\"}" % result
20 | 
21 | #相当于  java 中的   main
22 | 
23 | if __name__ == "__main__":
24 |     #以      http://localhost:8282/token/今天是星期天       访问
25 |     run(host="localhost",port=8282)


--------------------------------------------------------------------------------
/src/main/scala/project2_wordcounts/streamingWC/dao/MysqlPool.scala:
--------------------------------------------------------------------------------
 1 | package project2_wordcounts.streamingWC.dao
 2 | 
 3 | import java.sql.Connection
 4 | 
 5 | import com.alibaba.druid.pool.DruidDataSource
 6 | import com.mchange.v2.c3p0.ComboPooledDataSource
 7 | import org.apache.log4j.LogManager
 8 | import project2_wordcounts.streamingWC.utils.Conf
 9 | 
10 | /**
11 |  * Mysql连接池类
12 |  *
13 |  */
14 | class MysqlPool extends Serializable {   //可序列化
15 |   // 瞬态 的属性
16 |   @transient lazy val log = LogManager.getLogger(this.getClass)
17 | 
18 |   private val cpds: ComboPooledDataSource = new ComboPooledDataSource(true)
19 |   //private val dds=new DruidDataSource(  )
20 | 
21 |   private val conf = Conf.mysqlConfig    // 参数配置
22 |   try {
23 | //    dds.setUsername(conf.get("username").getOrElse("root"))
24 | //    dds.setPassword(conf.get("password").getOrElse("a"))
25 | //    dds.setUrl(conf.get("url").getOrElse("jdbc:mysql://localhost:3306/word_freq?useUnicode=true&amp;characterEncoding=UTF-8"))
26 | //    dds.setDriverClassName("com.mysql.jdbc.Driver")
27 | //    dds.setInitialSize(   3 )
28 | //    dds.setMaxActive(200)
29 |     cpds.setJdbcUrl(conf.get("url").getOrElse("jdbc:mysql://localhost:3306/word_freq?useUnicode=true&amp;characterEncoding=UTF-8"));
30 |     cpds.setDriverClass("com.mysql.jdbc.Driver");
31 |     cpds.setUser(conf.get("username").getOrElse("root"));
32 |     cpds.setPassword(conf.get("password").getOrElse("a"))
33 |     cpds.setInitialPoolSize(3)
34 |     cpds.setMaxPoolSize(Conf.maxPoolSize)
35 |     cpds.setMinPoolSize(Conf.minPoolSize)
36 |     cpds.setAcquireIncrement(5)
37 |     cpds.setMaxStatements(180)
38 |     /* 最大空闲时间,25000秒内未使用则连接被丢弃。若为0则永不丢弃。Default: 0 */
39 |     cpds.setMaxIdleTime(25000)
40 |     // 检测连接配置
41 |     cpds.setPreferredTestQuery("select id from user_words limit 1")
42 |     cpds.setIdleConnectionTestPeriod(18000)
43 |   } catch {
44 |     case e: Exception =>
45 |       log.error("[MysqlPoolError]", e)
46 |   }
47 | 
48 |   def getConnection: Connection = {
49 |     try {
50 |       return cpds.getConnection();
51 |     } catch {
52 |       case e: Exception =>
53 |         log.error("[MysqlPoolGetConnectionError]", e)
54 |         null
55 |     }
56 |   }
57 | }
58 | //单例模型:  构建方法私有化，对外提供唯一创建的方法
59 | object MysqlManager {
60 |   var mysqlManager: MysqlPool = _
61 | 
62 |   def getMysqlManager: MysqlPool = {
63 |     synchronized {
64 |       if (mysqlManager == null) {
65 |         mysqlManager = new MysqlPool
66 |       }
67 |     }
68 |     mysqlManager
69 |   }
70 | 
71 |   def main(args: Array[String]): Unit = {
72 |        val pool=MysqlManager.getMysqlManager
73 |        val con=pool.getConnection
74 |       println(  con )
75 |   }
76 | }
77 | 


--------------------------------------------------------------------------------
/src/main/scala/project2_wordcounts/streamingWC/service/MysqlService.scala:
--------------------------------------------------------------------------------
 1 | package project2_wordcounts.streamingWC.service
 2 | 
 3 | import org.apache.log4j.LogManager
 4 | import org.apache.spark.rdd.RDD
 5 | import project2_wordcounts.streamingWC.dao.MysqlManager
 6 | import project2_wordcounts.streamingWC.utils.TimeParse
 7 | import scala.collection.mutable
 8 | 
 9 | /**
10 |  * 对数据库的存取操作。
11 |      1) 查询user_words表，取出所有用户词典
12 |      2) 统计 用户词典表中的词汇出现的次数，按月生成表格保存 。
13 |  */
14 | object MysqlService extends Serializable {
15 | 
16 |   @transient lazy val log = LogManager.getLogger(this.getClass)
17 | 
18 |   /*
19 |   DStream  -> n 个RDD   -> 应用函数  切分 sentends  -> 调用分词  -》  RDD[    (String,1 )  ]     ->  reduce   ->  (String, count)  ->   调用   service来存
20 |    */
21 |   //关于数据库的操作，请参考:
22 |   //  http://spark.apache.org/docs/latest/streaming-programming-guide.html#output-operations-on-dstreams
23 |   // 中关于  foreachRDD 设计模式
24 |   def save(rdd: RDD[(String, Int)]) = {
25 |     if (!rdd.isEmpty) {
26 |       //按分区循环RDD, 这样一个分区的所有数据只要一个Connection操作即可.
27 |       rdd.foreachPartition(partitionRecords => {
28 |         val preTime = System.currentTimeMillis
29 |         //从连接池中获取一个连接
30 |         val conn = MysqlManager.getMysqlManager.getConnection
31 |         val statement = conn.createStatement
32 |         try {
33 |           conn.setAutoCommit(false)
34 |           partitionRecords.foreach(record => {
35 |             log.info("待操作的记录>>>>>>>" + record)
36 |             val createTime = System.currentTimeMillis()  //系统时间的hao秒
37 |             //按月建立一张新表存储数据:   按单词和时间进行区分，如单词和时间相同，则update, 不同则插入。
38 |             var sql = s"CREATE TABLE if not exists `word_count_${TimeParse.timeStamp2String(createTime, "yyyyMM")}`(`id` int(11) NOT NULL AUTO_INCREMENT,`word` varchar(64) NOT NULL,`count` int(11) DEFAULT '0',`date` date NOT NULL, PRIMARY KEY (`id`), UNIQUE KEY `word` (`word`,`date`) ) ENGINE=InnoDB  DEFAULT CHARSET=utf8;"
39 |             statement.addBatch(sql)   //将sql语句添加到批
40 |             sql = s"insert into word_count_${TimeParse.timeStamp2String(createTime, "yyyyMM")} (word, count, date) " +
41 |               s"values ('${record._1}',${record._2},'${TimeParse.timeStamp2String(createTime, "yyyy-MM-dd")}') " +
42 |               s"on duplicate key update count=count+values(count) "      //   在更新表格中每个词的统计值时，用 on duplicate key进行词频数量的累加.
43 |             log.info(   "sql:"+ sql )
44 |             //   word列为  unique列
45 |               // 如果在INSERT语句末尾指定了ON DUPLICATE KEY UPDATE，并且插入行后会导致在一个UNIQUE索引或PRIMARY KEY中出现重复值，则执行旧行UPDATE；如果不会导致唯一值列重复的问题，则插入新行
46 |             statement.addBatch(sql)
47 |             log.warn(s"[记录添加的批处理操作成功] record: ${record._1}, ${record._2}")
48 | 
49 |           })
50 |           statement.executeBatch    //执行批处理   -> 当一个 RDD中的数据量太大   batch存不下  mysql 缓存存不下，调整batch的批的大小
51 |           conn.commit
52 |           log.warn(s"[保存的批处理操作完成] 耗时: ${System.currentTimeMillis - preTime}")
53 |         } catch {
54 |           case e: Exception =>
55 |             log.error("[保存的批处理操作失败]", e)
56 |         } finally {
57 |           conn.setAutoCommit(true)
58 |           statement.close()
59 |           conn.close()
60 |         }
61 |       })
62 |     }
63 |   }
64 | 
65 |   /**
66 |    * 加载用户词典
67 |    */
68 |   def getUserWords(): mutable.HashSet[String] = {
69 |     val preTime = System.currentTimeMillis
70 |     val sql = "select distinct(word) from user_words"    //distinct 去重
71 |     val conn = MysqlManager.getMysqlManager.getConnection
72 |     val statement = conn.createStatement
73 |     try {
74 |       val rs = statement.executeQuery(sql)
75 |       val words = mutable.HashSet[String]()
76 |       while (rs.next) {
77 |         words += rs.getString("word")
78 |       }
79 |       log.warn(s"[loadSuccess] load user words from db count: ${words.size}\ttime elapsed: ${System.currentTimeMillis - preTime}")
80 |       words
81 |     } catch {
82 |       case e: Exception =>
83 |         log.error("[loadError] error: ", e)
84 |         mutable.HashSet[String]()
85 |     } finally {
86 |       statement.close()
87 |       conn.close()
88 |     }
89 |   }
90 | 
91 | 
92 |   def main(args: Array[String]): Unit = {
93 |        val set= MysqlService.getUserWords()
94 |         print(  set )
95 |   }
96 | }
97 | 


--------------------------------------------------------------------------------
/src/main/scala/project2_wordcounts/streamingWC/service/SegmentService.scala:
--------------------------------------------------------------------------------
  1 | package project2_wordcounts.streamingWC.service
  2 | 
  3 | import java.net.URLEncoder
  4 | import java.nio.charset.StandardCharsets
  5 | 
  6 | import org.apache.log4j.{Level, LogManager, Logger}
  7 | import project2_wordcounts.streamingWC.utils.Conf
  8 | import scalaj.http._
  9 | import spray.json._
 10 | import spray.json.DefaultJsonProtocol._
 11 | 
 12 | import scala.collection.mutable.HashSet
 13 | import scala.collection.mutable.Map
 14 | 
 15 | /*
 16 | 访问分词服务的业务类
 17 |  */
 18 | object SegmentService extends Serializable {
 19 |   @transient lazy val log = LogManager.getLogger(this.getClass)
 20 | 
 21 |   def main(args: Array[String]): Unit = {
 22 |     Logger.getLogger("org").setLevel(Level.ALL) //配置日志
 23 |     //单次请求测试
 24 |     //val segments=SegmentService.segment("http://localhost:8282/token/","今天是星期天,今天天气是不错")
 25 |     //print(  segments )
 26 |     //如有网络抖动的情况下，重新联接三次.
 27 |     //val segments = SegmentService.retry(3)(SegmentService.segment("http://localhost:8282/token/", "今天是星期天,今天天气是不错"))
 28 |     //print(segments)
 29 |     //3. 对拆分出来的词在用户词典表中 查找，并计数
 30 |     val record="今天是星期天,今天天气是不错"
 31 |     val wordDic=new HashSet[String]()
 32 |     wordDic.add( "今天")
 33 |     wordDic.add("星期天")
 34 |     val map=mapSegment(   record, wordDic)
 35 |     print( map )
 36 |   }
 37 | 
 38 |   /**
 39 |    * 将sentence发给分词服务器，获取返回的切分好的中文词汇，然后再到 userDict 查是否有这个词，如果有，则统计数据量.
 40 |    * 在Dstream中调用，传入记录内容和词典，通过retry(3)(segment(postUrl, record))实现失败重试3次方案，根据分词结果和词典，进行词典指定词的词频统计，并以Map[Word,count]返回
 41 |    *
 42 |    * @return
 43 |    */
 44 |   def mapSegment(record: String, wordDic: HashSet[String]): Map[String, Int] = {
 45 |     val preTime = System.currentTimeMillis
 46 |     val keyCount = Map[String, Int]()
 47 |     if (record == "" || record.isEmpty()) {
 48 |       log.warn(s"待切分语句为空: ${record}")
 49 |       keyCount
 50 |     } else {
 51 |       val postUrl = Conf.segmentorHost + "/token/"
 52 |       try {
 53 |         val wordsSet = retry(3)(segment(postUrl, record)) // 失败重试3次
 54 |         log.info(s"[拆分成功] 记录: ${record}\t耗时: ${System.currentTimeMillis - preTime}")
 55 |         // 进行词语统计
 56 |         //val keyCount = Map[String, Int]()
 57 |         for (word <- wordDic) {
 58 |           if (wordsSet.contains(word))
 59 |             keyCount += word -> 1
 60 |         }
 61 |         log.info(s"[keyCountSuccess] words size: ${wordDic.size} (entitId_createTime_word_language, 1):\n${keyCount.mkString("\n")}")
 62 |         keyCount
 63 |       } catch {
 64 |         case e: Exception => {
 65 |           log.error(s"[mapSegmentApiError] mapSegment error\tpostUrl: ${postUrl}${record}", e)
 66 |           keyCount
 67 |         }
 68 |       }
 69 |     }
 70 |   }
 71 | 
 72 |   /*
 73 |       根据传入的 url和 content发送请求，并解析返回的结果，将分词结果以HashSet形式返回
 74 |       参数: url:  http://localhost:8282/token/
 75 |             content:   今天是星期天
 76 |       返回值:   从jsonr的terms中取,按" "切分，形成数组 HashSet   ->  scala集合分两类: mutable,immutable
 77 |                 HashSet: mutable
 78 |    */
 79 |   def segment(url: String, content: String): HashSet[String] = {
 80 |     val timer = System.currentTimeMillis()
 81 |     //地址栏的参数编码
 82 |     val c = URLEncoder.encode(content, StandardCharsets.UTF_8.toString)
 83 |     log.info("发送的请求为:" + url + "\t" + content + "\t" + c)
 84 |     //var response = Http(url + content).header("Charset", "UTF-8").charset("UTF-8").asString   //发送请求，得到响应
 85 |     var response = Http(url + c).header("Charset", "UTF-8").charset("UTF-8").asString //发送请求，得到响应
 86 |     log.info("响应为:" + response.code + "\t内容:" + response.body.toString)
 87 | 
 88 |     val dur = System.currentTimeMillis() - timer
 89 |     if (dur > 20) // 输出耗时较长的请求
 90 |       log.warn(s"[longVisit]>>>>>> api: ${url}${content}\ttimer: ${dur}")
 91 | 
 92 |     val words = HashSet[String]()
 93 |     response.code match { //匹配响应码
 94 |       case 200 => {
 95 |         //获取响应的结果，进行匹配
 96 |         response.body.parseJson.asJsObject.getFields("ret", "msg", "terms") match {
 97 |           case Seq(JsNumber(ret), JsString(msg), JsString(terms)) => {
 98 |             if (ret.toInt != 0) { //解析结果为空.
 99 |               log.error(s"[segmentRetError] vist api: ${url}?content=${content}\tsegment error: ${msg}")
100 |               words
101 |             } else {
102 |               //解析到了 单词组，则切分
103 |               val tokens = terms.split(" ")   // Array
104 |               tokens.foreach(token => {
105 |                 words += token
106 |               })
107 |               words
108 |             }
109 |           }
110 |           case _ => words
111 |         }
112 |       }
113 |       case _ => { //响应码为其它则异常,返回空words
114 |         log.error(s"[segmentResponseError] vist api: ${url}?content=${content}\tresponse code: ${response.code}")
115 |         // [segmentResponseError] vist api: http://localhost:8282/content=xxxx\tresponse code: 500
116 |         words
117 |       }
118 |     }
119 |   }
120 | 
121 |   /**
122 |    * 重试函数:  实现对函数 fn的n次重复调用。 在http请求时，会出现网络错误，这样这个函数就可以按要求的次数重新发送请求，如超出n次，则抛出异常
123 |    *
124 |    * @param n
125 |    * @param fn
126 |    * @return
127 |    */
128 |   @annotation.tailrec
129 |   def retry[T](n: Int)(fn: => T): T = {
130 |     /*
131 |     scala.util.Try 的结构与 Either 相似，Try 是一个 sealed 抽象类，具有两个子类，分别是 Succuss(x) 和 Failure(exception)。  模式匹配(  Option: Some(x)/None )
132 |     Succuss会保存正常的返回值。 Failure 总是保存 Throwable 类型的值。
133 |      */
134 |     util.Try {
135 |       fn
136 |     } match { //利用scala中的Try函数:
137 |       case util.Success(x) => {
138 |         log.info(s"第${4-n}次请求")
139 |         x
140 |       }
141 |       case _ if n > 1 => {   // _ 代表任意类型及任意值
142 |         log.warn(s"[重试第 ${4 - n}次]")
143 |         retry(n - 1)(fn)     // 递归
144 |       }
145 |       case util.Failure(e) => {
146 |         log.error(s"[segError] 尝试调用API失败了三次", e)
147 |         throw e
148 |       }
149 |     }
150 |   }
151 | }
152 | 


--------------------------------------------------------------------------------
/src/main/scala/project2_wordcounts/streamingWC/utils/BroadcastWrapper.scala:
--------------------------------------------------------------------------------
 1 | package project2_wordcounts.streamingWC.utils
 2 | 
 3 | 
 4 | import java.io.{ObjectInputStream, ObjectOutputStream}
 5 | import org.apache.spark.broadcast.Broadcast
 6 | import org.apache.spark.streaming.StreamingContext
 7 | import scala.reflect.ClassTag
 8 | 
 9 | /*
10 |    在集群节点间进行数据传输会有大量序列化和反序列化操作， 通过引入broadcast对该变量进行广播。
11 |    广播后的变量，会保证每个Executor的内存中，只驻留一份变量副本，而Executor中的task执行时共享该Executor中的那份变量副本。
12 |    这样的话，可以大大减少变量副本的数量，从而减少网络传输的性能开销，并减少对Executor内存的占用开销，降低GC的频率。
13 | 
14 | 
15 |    @transient ：  瞬态化，即这个变量不序列化
16 |  */
17 | case class BroadcastWrapper[T: ClassTag](
18 |                                           @transient private val ssc: StreamingContext,
19 |                                           @transient private val _v: T) {
20 |   // 创建广播变量
21 |   @transient private var v = ssc.sparkContext.broadcast(_v)
22 | 
23 |   /**
24 |    * 广播变量是只读的，可以利用spark的unpersist()，它按照LRU( lease Recently used)最近最久没有使用原则删除老数据。
25 |    *
26 |    * @param newValue
27 |    * @param blocking
28 |    */
29 |   def update(newValue: T, blocking: Boolean = false): Unit = {
30 |     v.unpersist(blocking) //删除缓存
31 |     v = ssc.sparkContext.broadcast(newValue)
32 |   }
33 | 
34 |   def value: T = v.value //对外提供一个函数用于访问这个广播变量 , 体现了封装
35 | 
36 |   private def writeObject(out: ObjectOutputStream): Unit = {
37 |     out.writeObject(v)
38 |   }
39 | 
40 |   private def readObject(in: ObjectInputStream): Unit = {
41 |     v = in.readObject().asInstanceOf[Broadcast[T]]
42 |   }
43 | }
44 | 


--------------------------------------------------------------------------------
/src/main/scala/project2_wordcounts/streamingWC/utils/Conf.scala:
--------------------------------------------------------------------------------
 1 | package project2_wordcounts.streamingWC.utils
 2 | 
 3 | /*
 4 | 系统的配置
 5 |  */
 6 | object Conf {
 7 | 
 8 |   val nGram = 3    //分词器单位长度的参数
 9 |   val updateFreq = 300000 //5min
10 | 
11 |   // 分词服务 api
12 |   val segmentorHost = "http://localhost:8282"
13 | 
14 |   // spark 参数
15 |   val master = "local[*]"
16 |   val localDir = "./tmp"
17 |   val perMaxRate = "5"    // 设定对目标topic每个partition每秒钟拉取的数据条数
18 |   val interval = 3 // seconds
19 |   val executorMem = "1G"    //   内存数/executor
20 |   val coresMax = "3"     //总共最多几个核
21 | 
22 |   // kafka configuration
23 |   val brokers = "localhost:9092,localhost:9093,localhost:9094"
24 |   val zk = "localhost:2181"
25 |   val group = "wordFreqGroup"
26 |   val topics = "comments"
27 | 
28 |   // mysql configuration
29 |   val mysqlConfig = Map("url" -> "jdbc:mysql://localhost:3306/word_freq?characterEncoding=UTF-8", "username" -> "root", "password" -> "a")
30 |   val maxPoolSize = 5
31 |   val minPoolSize = 2
32 | }
33 | 


--------------------------------------------------------------------------------
/src/main/scala/project2_wordcounts/streamingWC/utils/TimeParse.scala:
--------------------------------------------------------------------------------
 1 | package project2_wordcounts.streamingWC.utils
 2 | 
 3 | import org.joda.time.{ DateTimeZone, _ }
 4 | 
 5 | /*
 6 | 时间转换工具类
 7 |  */
 8 | object TimeParse extends Serializable {
 9 |   /*
10 |   将字符串类型的时间戳转为 指定时间格式字符串
11 |    */
12 |   def timeStamp2String(timeStamp: String, format: String): String = {
13 |     val ts = timeStamp.toLong * 1000;
14 |     new DateTime(ts).toDateTime.toString(format)
15 |   }
16 | 
17 |   /*
18 |   将Long类型的时间戳转为 指定时间格式字符串
19 |    */
20 |   def timeStamp2String(timeStamp: Long, format: String): String = {
21 |     new DateTime(timeStamp).toDateTime.toString(format)
22 |   }
23 | }
24 | 


--------------------------------------------------------------------------------
/src/main/scala/project2_wordcounts/wordSeqGenerator/ConsumerTest.scala:
--------------------------------------------------------------------------------
 1 | package project2_wordcounts.wordSeqGenerator
 2 | 
 3 | import java.util.concurrent._
 4 | import java.util.{Collections, Properties}
 5 | import kafka.utils.Logging
 6 | import org.apache.kafka.clients.consumer.{ConsumerConfig, KafkaConsumer}
 7 | import scala.collection.JavaConversions._
 8 | 
 9 | /*
10 | 用于测试(本身这个项目中使用的是spark streaming来完成消费端的功能的)
11 |      请测试WordSeqProducer是否成功发送的消费者程序
12 |  */
13 | 
14 | object ConsumerTest extends App { //继承自App，所以不用写 main方法 即这个object为程序入口
15 |   val topic = "comments" //主题名
16 |   val brokers = "localhost:9092,localhost:9093,localhost:9094" //kafka服务器地址
17 |   val groupId = "yc74streaming" //组编号
18 |   val example = new ConsumerTest(brokers, groupId, topic)
19 |   example.run() //运行消费端
20 | }
21 | 
22 | 
23 | class ConsumerTest(val brokers: String,
24 |                    val groupId: String,
25 |                    val topic: String) extends Logging {
26 | 
27 |   val props = createConsumerConfig(brokers, groupId)
28 |   val consumer = new KafkaConsumer[String, String](props)
29 | 
30 |   def shutdown() = {
31 |     if (consumer != null)
32 |       consumer.close();
33 |   }
34 |   //消费者的参数
35 |   def createConsumerConfig(brokers: String, groupId: String): Properties = {
36 |     val props = new Properties()
37 |     props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
38 |     props.put(ConsumerConfig.GROUP_ID_CONFIG, groupId)
39 |     props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true") //是否自动提交 消息偏移量
40 |     props.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "1000") //自动提交的时间间隔
41 |     props.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, "30000") //会话超时时间
42 |     //反序列化工具类
43 |     props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer")
44 |     props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer")
45 |     props
46 |   }
47 | 
48 |   def run() = {
49 |     //订阅主题
50 |     consumer.subscribe(Collections.singletonList(this.topic))
51 |     //启动线程池   :  newSingleThreadExecutor一个单线程化的线程池，它只会用唯一的工作线程来执行任务，保证所有任务按照指定顺序(FIFO, LIFO, 优先级)执行
52 |     //newCachedThreadPool:一个可缓存线程池，如果线程池长度超过处理需要，可灵活回收空闲线程，若无可回收，则新建线程
53 |     //newFixedThreadPool: 一个定长线程池，可控制线程最大并发数，超出的线程会在队列中等待
54 |     // newScheduledThreadPool: 一个定长线程池，支持定时及周期性任务执行
55 |     Executors.newSingleThreadExecutor.execute(new Runnable {
56 |       override def run(): Unit = {
57 |         while (true) {
58 |           //1秒拉取一次数据
59 |           val records = consumer.poll(1000)
60 | 
61 |           for (record <- records) {
62 |             System.out.println("Received message: (" + record.key() + ", " + record.value() + ") at offset " + record.offset()+" at partition "+record.partition())
63 |           }
64 |         }
65 |       }
66 |     })
67 |   }
68 | }


--------------------------------------------------------------------------------
/src/main/scala/project2_wordcounts/wordSeqGenerator/WordSeqProducer.scala:
--------------------------------------------------------------------------------
 1 | package project2_wordcounts.wordSeqGenerator
 2 | 
 3 | import java.util.Properties
 4 | import scala.util.Random
 5 | import org.apache.kafka.clients.producer.KafkaProducer
 6 | import org.apache.kafka.clients.producer.ProducerRecord
 7 | 
 8 | /*
 9 | 模拟类用于连接kafka,并向其推送数据.
10 | 后面可以用 pachong 来代替.
11 | 启动与关闭:
12 |         1. 先启 kafka自带的 zk(端口2181): bin/zookeeper-server-start.sh config/zookeeper.properties 1>/dev/null 2>&1 &
13 |         2. 再启动 kafka(端口9092):
14 |               bin/kafka-server-start.sh config/server.properties  &
15 |               bin/kafka-server-start.sh config/server-1.properties  &
16 |               bin/kafka-server-start.sh config/server-2.properties  &
17 | 
18 |         3. 关闭:  sh kafka-server-stop.sh
19 |                   sh zookeeper-server-stop.sh
20 | 
21 |    创建主题: bin/kafka-topics.sh --create --zookeeper localhost:2181  --replication-factor 3  --partitions 3 --topic  comments
22 |    主题列表:  bin/kafka-topics.sh --list --zookeeper localhost:2181
23 |    查看主题中消息详情: bin/kafka-topics.sh --describe --zookeeper localhost:2181    --topic comments
24 |    发送消息: bin/kafka-console-producer.sh --broker-list localhost:9092,localhost:9093,localhost:9094 --topic comments
25 |    消费消息:
26 |      bin/kafka-console-consumer.sh --bootstrap-server localhost:9092,localhost:9093,localhost:9094  --topic   comments  --from-beginning
27 |  */
28 | object WordSeqProducer extends App {
29 |   val events = 10 //生成评论条数
30 |   val topic = "comments"   //主题名
31 |   val brokers = "localhost:9092,localhost:9093,localhost:9094"  // kafka brokers
32 | 
33 |   val rnd = new Random()    //用于随机地生成文本。
34 | 
35 |   val props = new Properties()   //kafka producer参数
36 |   props.put("bootstrap.servers", brokers)
37 |   props.put("client.id", "wordFreqGenerator")
38 |   props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
39 |   props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
40 | 
41 |   val producer = new KafkaProducer[String, String](props)
42 |   val t = System.currentTimeMillis()   // 获取系统当前时间，用于性能检测
43 |   // 读取汉字字典
44 |   val source = scala.io.Source.fromFile("data/hanzi.txt")
45 |   val lines = try source.mkString finally source.close()
46 |   //控制生成 events 条消息
47 |   for (nEvents <- Range(0, events)) {
48 |     // 生成模拟评论数据(user, comment)
49 |     val sb = new StringBuilder()
50 |     //每条消息最多200个字符
51 |     for (ind <- Range(   0, rnd.nextInt(200)    )) {
52 |       sb += lines.charAt(     rnd.nextInt(lines.length())    )
53 |     }
54 |     val userName = "user_" + rnd.nextInt(100)    // 主题   键        值
55 |     val data = new ProducerRecord[String, String](topic, userName, sb.toString())
56 |     producer.send(data)
57 |   }
58 |   System.out.println("每秒可以发送消息: " + events * 1000 / (System.currentTimeMillis() - t))
59 |   producer.close()
60 | }
61 | 


--------------------------------------------------------------------------------
/streaming74projects.iml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <module type="JAVA_MODULE" version="4">
3 |   <component name="FacetManager">
4 |     <facet type="Python" name="Python">
5 |       <configuration sdkName="" />
6 |     </facet>
7 |   </component>
8 | </module>


--------------------------------------------------------------------------------
/target/classes/LogGenerator.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/LogGenerator.class


--------------------------------------------------------------------------------
/target/classes/project1_logAnalysis/LogAnalysis$$anonfun$main$1$$typecreator3$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project1_logAnalysis/LogAnalysis$$anonfun$main$1$$typecreator3$1.class


--------------------------------------------------------------------------------
/target/classes/project1_logAnalysis/LogAnalysis$$anonfun$main$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project1_logAnalysis/LogAnalysis$$anonfun$main$1.class


--------------------------------------------------------------------------------
/target/classes/project1_logAnalysis/LogAnalysis$.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project1_logAnalysis/LogAnalysis$.class


--------------------------------------------------------------------------------
/target/classes/project1_logAnalysis/LogAnalysis.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project1_logAnalysis/LogAnalysis.class


--------------------------------------------------------------------------------
/target/classes/project1_logAnalysis/Record$.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project1_logAnalysis/Record$.class


--------------------------------------------------------------------------------
/target/classes/project1_logAnalysis/Record.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project1_logAnalysis/Record.class


--------------------------------------------------------------------------------
/target/classes/project1_logAnalysis/db.sql:
--------------------------------------------------------------------------------
 1 | create database log_analysis;
 2 | 
 3 | --需求1: 筛选出所有的 error,利用spark sql 转存数据库 
 4 | --   id       level       method      content
 5 | use log_analysis;
 6 | 
 7 | CREATE TABLE `important_logs` (
 8 |   `id` int(11) NOT NULL AUTO_INCREMENT,
 9 |   `log_level` varchar(255) NOT NULL,
10 |   `method` varchar(255) NOT NULL,
11 |   `content` varchar(500) NOT NULL,
12 |   PRIMARY KEY (`id`)
13 | ) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=utf8;
14 | 
15 | 
16 | --需求二: 累计这个级别 日志出现的总次数,不需要表，只直接输出
17 | 
18 | --需求三: 在以 1秒为批处理时间间隔, 这个级别在过去的3个时间窗口内，每两个 slide intervals的次数，输出.
19 | 


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/jiebaService/jieba.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | import jieba    #导入包
 4 | cut = jieba.cut
 5 | # bottle 是一个http 的开发工具
 6 | from bottle import route,run
 7 | 
 8 | # sentence :待拆分的字符串    中华人民共和国  ->
 9 | #返回值:  以空格分隔的   字符串
10 | def token(sentence):
11 |     seg_list = list(   cut(sentence)   )
12 |     return " ".join(seg_list)
13 | 
14 | #路由设置
15 | @route('/token/:sentence')
16 | def index(sentence):
17 |     print(  "====",sentence )
18 |     result = token(sentence)
19 |     return "{\"ret\":0, \"msg\":\"OK\", \"terms\":\"%s\"}" % result
20 | 
21 | #相当于  java 中的   main
22 | 
23 | if __name__ == "__main__":
24 |     #以      http://localhost:8282/token/今天是星期天       访问
25 |     run(host="localhost",port=8282)


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/streamingWC/dao/MysqlManager$.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/streamingWC/dao/MysqlManager$.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/streamingWC/dao/MysqlManager.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/streamingWC/dao/MysqlManager.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/streamingWC/dao/MysqlPool$$anonfun$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/streamingWC/dao/MysqlPool$$anonfun$1.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/streamingWC/dao/MysqlPool$$anonfun$2.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/streamingWC/dao/MysqlPool$$anonfun$2.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/streamingWC/dao/MysqlPool$$anonfun$3.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/streamingWC/dao/MysqlPool$$anonfun$3.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/streamingWC/dao/MysqlPool.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/streamingWC/dao/MysqlPool.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/streamingWC/utils/BroadcastWrapper$.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/streamingWC/utils/BroadcastWrapper$.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/streamingWC/utils/BroadcastWrapper.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/streamingWC/utils/BroadcastWrapper.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/streamingWC/utils/Conf$.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/streamingWC/utils/Conf$.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/streamingWC/utils/Conf.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/streamingWC/utils/Conf.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/streamingWC/utils/TimeParse$.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/streamingWC/utils/TimeParse$.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/streamingWC/utils/TimeParse.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/streamingWC/utils/TimeParse.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/wordSeqGenerator/ConsumerTest$$anon$1$$anonfun$run$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/wordSeqGenerator/ConsumerTest$$anon$1$$anonfun$run$1.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/wordSeqGenerator/ConsumerTest$$anon$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/wordSeqGenerator/ConsumerTest$$anon$1.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/wordSeqGenerator/ConsumerTest$.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/wordSeqGenerator/ConsumerTest$.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/wordSeqGenerator/ConsumerTest$delayedInit$body.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/wordSeqGenerator/ConsumerTest$delayedInit$body.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/wordSeqGenerator/ConsumerTest.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/wordSeqGenerator/ConsumerTest.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/wordSeqGenerator/WordSeqProducer$$anonfun$1$$anonfun$apply$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/wordSeqGenerator/WordSeqProducer$$anonfun$1$$anonfun$apply$1.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/wordSeqGenerator/WordSeqProducer$$anonfun$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/wordSeqGenerator/WordSeqProducer$$anonfun$1.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/wordSeqGenerator/WordSeqProducer$.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/wordSeqGenerator/WordSeqProducer$.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/wordSeqGenerator/WordSeqProducer$delayedInit$body.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/wordSeqGenerator/WordSeqProducer$delayedInit$body.class


--------------------------------------------------------------------------------
/target/classes/project2_wordcounts/wordSeqGenerator/WordSeqProducer.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangyingchengqi/sparkStreaming_projects/d901219fe96017f868ec0377519c652e320657de/target/classes/project2_wordcounts/wordSeqGenerator/WordSeqProducer.class


--------------------------------------------------------------------------------