├── .gitignore ├── KBQA └── patternREfO │ ├── data │ ├── actorName.txt │ ├── get_dict.txt │ └── movieName.txt │ ├── get_dict.sh │ ├── query.py │ └── utils │ ├── __init__.py │ ├── rules.py │ └── word_tagging.py ├── README.md ├── ie ├── craw │ ├── baidu_baike │ │ ├── baidu_baike │ │ │ ├── __init__.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ └── baidu_baike.py │ │ └── scrapy.cfg │ ├── craw_all_baidu │ │ ├── baidu_baike │ │ │ ├── __init__.py │ │ │ ├── commands │ │ │ │ ├── __init__.py │ │ │ │ └── crawlall.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ ├── setup.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── baidu_baike-10.py │ │ │ │ ├── baidu_baike-2.py │ │ │ │ ├── baidu_baike-3.py │ │ │ │ ├── baidu_baike-4.py │ │ │ │ ├── baidu_baike-5.py │ │ │ │ ├── baidu_baike-6.py │ │ │ │ ├── baidu_baike-7.py │ │ │ │ ├── baidu_baike-8.py │ │ │ │ ├── baidu_baike-9.py │ │ │ │ └── baidu_baike.py │ │ ├── creat_mysql.md │ │ └── scrapy.cfg │ ├── craw_all_hudong │ │ ├── craw_all_hudong │ │ │ ├── __init__.py │ │ │ ├── commands │ │ │ │ ├── __init__.py │ │ │ │ └── crawlall.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ ├── setup.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ └── hudong_baike.py │ │ ├── creat_mysql.md │ │ └── scrapy.cfg │ ├── craw_without_spider │ │ ├── mysql │ │ │ ├── creat_sql.txt │ │ │ └── help_mysql.txt │ │ └── utils │ │ │ ├── basic_info.py │ │ │ ├── craw.py │ │ │ └── kg_movie_movie.sql │ ├── hudong_baike │ │ ├── hudong_baike │ │ │ ├── __init__.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ └── hudong_baike.py │ │ └── scrapy.cfg │ ├── news_spider │ │ ├── news │ │ │ └── __init__ │ │ ├── news_spider │ │ │ ├── __init__.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ └── huxiu_spider.py │ │ ├── readme.md │ │ └── scrapy.cfg │ └── weixin_spider │ │ ├── scrapy.cfg │ │ └── weixin_spider │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ ├── __init__.py │ │ └── weixin_spiders.py ├── deepdive │ ├── app.ddlog │ ├── db.url │ ├── deepdive.conf │ ├── input │ │ └── __init__.py │ ├── start_posql.sh │ └── udf │ │ ├── __init__.py │ │ ├── baidu_baike │ │ ├── baidu_baike │ │ │ ├── __init__.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ └── baidu_baike.py │ │ └── scrapy.cfg │ │ ├── extract_play_features.py │ │ ├── get_actor_movie.py │ │ ├── map_actor_mention.py │ │ ├── map_movie_mention.py │ │ ├── map_play_candidate.py │ │ ├── nlp_markup.sh │ │ ├── supervise_play.py │ │ └── trans.py ├── re_cnn_att │ ├── clean.py │ ├── data │ │ └── __init__.py │ ├── gen_re_from_baidu.py │ └── word2vec.py └── struct_to_rdf │ ├── baidu2neo4j │ ├── __init__.py │ ├── clean.py │ ├── cleanFile.py │ ├── gen_disambi_infobox.py │ ├── get_subject.py │ ├── header_file │ │ ├── disambi_headers.csv │ │ ├── disambi_infobox_header.csv │ │ ├── disambi_redirect_header.csv │ │ ├── disambi_subject_header.csv │ │ ├── redirect_header.csv │ │ ├── subject_header.csv │ │ ├── title_disambi_header.csv │ │ └── title_header.csv │ └── remove_disambi.py │ └── movie_actor │ ├── clean_actor.py │ ├── 
clean_mysql.py │ ├── complete_mysql.py │ ├── get_ttl.bat │ ├── get_ttl.sh │ ├── kg_demo_mapping_baidu_baike.ttl │ └── kg_movie_tultle.owl ├── img ├── actor_movie_genre.png ├── baike.png ├── example_REfO_KBQA.png ├── example_d2rq.png └── example_elastic_ss.png ├── knowledge_fusion └── silk │ ├── .idea │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ ├── modules.xml │ ├── silk.iml │ └── workspace.xml │ ├── batch_link.py │ └── run.py ├── requirement.text └── semantic_search └── elasticsearch ├── data └── __init__.py ├── query.py └── utils ├── __init__.py ├── build_dict.py ├── get_ac_attr.py ├── get_json.py ├── get_total_val.py ├── insert.py ├── query_cmd.sh └── views.py /.gitignore: -------------------------------------------------------------------------------- 1 | KBQA/actorName.txt 2 | KBQA/movieName.txt 3 | *.json 4 | *.pyc 5 | semantic_search/elasticsearch/data/*.txt 6 | semantic_search/elasticsearch/data/attr_ac.pkl 7 | ie/deepdive/udf/bazaar/* 8 | ie/deepdive/*.txt 9 | ie/deepdive/run/* 10 | ie/deepdive/*.csv 11 | *.txt 12 | *.csv 13 | ie/craw/craw_all_baidu/craws/* 14 | ie/re_cnn_att/data/*.csv 15 | ie/re_cnn_att/data/*.txt 16 | ie/re_cnn_att/data/*.pkl 17 | ie/re_cnn_att/data/*.json 18 | ie/re_cnn_att/thirdpart/* 19 | -------------------------------------------------------------------------------- /KBQA/patternREfO/data/get_dict.txt: -------------------------------------------------------------------------------- 1 | use baidu_baike; 2 | select actor_chName from actor into outfile '/var/lib/mysql-files/baidu_actorName.txt'; 3 | select movie_chName from movie into outfile '/var/lib/mysql-files/baidu_movieName.txt'; 4 | use hudong_baike; 5 | select actor_chName from actor into outfile '/var/lib/mysql-files/hudong_actorName.txt'; 6 | select movie_chName from movie into outfile '/var/lib/mysql-files/hudong_movieName.txt'; 7 | -------------------------------------------------------------------------------- /KBQA/patternREfO/get_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Export dict for movie and actor in hudong and baidu DB; 4 | # You need change the user and pwd for your own DB; 5 | mysql -uroot -pnlp < ./data/get_dict.txt 6 | 7 | sudo cp /var/lib/mysql-files/*Name.txt . 
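# Note on the export above (hedged): MySQL's INTO OUTFILE can only write under the
# server's secure_file_priv directory, which is commonly /var/lib/mysql-files on
# Debian/Ubuntu installs -- that is why the dumps land there and need sudo to copy out.
# The steps below merge the baidu/hudong exports, de-duplicate them, and append a jieba
# POS tag so word_tagging.py can load the files via jieba.load_userdict(), giving
# dictionary entries such as "周星驰 nr".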
8 | 9 | cat baidu_actorName.txt hudong_actorName.txt | sort -u > actorTmp.txt 10 | cat baidu_movieName.txt hudong_movieName.txt | sort -u > movieTmp.txt 11 | # Append the "nr" (person name) and "nz" (proper noun) POS tags for jieba 12 | awk '{print $0 " nr"}' actorTmp.txt > actorName.txt 13 | awk '{print $0 " nz"}' movieTmp.txt > movieName.txt 14 | 15 | # Remove the intermediate files 16 | rm baidu_*Name.txt hudong_*Name.txt actorTmp.txt movieTmp.txt 17 | -------------------------------------------------------------------------------- /KBQA/patternREfO/query.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from SPARQLWrapper import SPARQLWrapper, JSON 5 | from utils.word_tagging import Tagger 6 | from utils.rules import customize_rules 7 | 8 | if __name__ == "__main__": 9 | print("init...........") 10 | sparql_base = SPARQLWrapper("http://localhost:3030/kg_movie/query") 11 | tagger = Tagger(['data/actorName.txt', 'data/movieName.txt']) 12 | rules = customize_rules() 13 | print("done \n") 14 | 15 | while True: 16 | print("Please input your question: ") 17 | default_question = input() 18 | seg_list = tagger.get_word_objects(default_question) 19 | 20 | for rule in rules: 21 | query = rule.apply(seg_list) 22 | if query: 23 | sparql_base.setQuery(query) 24 | sparql_base.setReturnFormat(JSON) 25 | results = sparql_base.query().convert() 26 | 27 | if not results["results"]["bindings"]: 28 | print("No answer found :(") 29 | continue 30 | for result in results["results"]["bindings"]: 31 | print("Result: ", result["x0"]["value"]) 32 | -------------------------------------------------------------------------------- /KBQA/patternREfO/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/KBQA/patternREfO/utils/__init__.py -------------------------------------------------------------------------------- /KBQA/patternREfO/utils/rules.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import re 5 | from refo import finditer, Predicate, Star, Any 6 | 7 | # SPARQL config; the base ":" IRI below is an assumption (the kgdemo namespace used in the original tutorial) and must match the namespace of the generated RDF data 8 | SPARQL_PREAMBLE = u""" 9 | PREFIX : <http://www.kgdemo.com#> 10 | PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 11 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 12 | """ 13 | 14 | SPARQL_TEM = u"{preamble}\n" + \ 15 | u"SELECT DISTINCT {select} WHERE {{\n" + \ 16 | u"{expression}\n" + \ 17 | u"}}\n" 18 | 19 | INDENT = " " 20 | 21 | class W(Predicate): 22 | """object-oriented regex for words""" 23 | def __init__(self, token=".*", pos=".*"): 24 | self.token = re.compile(token + "$") 25 | self.pos = re.compile(pos + "$") 26 | super(W, self).__init__(self.match) 27 | 28 | def match(self, word): 29 | m1 = self.token.match(word.token) 30 | m2 = self.pos.match(word.pos) 31 | return m1 and m2 32 | 33 | class Rule(object): 34 | def __init__(self, condition=None, action=None): 35 | assert condition and action 36 | self.condition = condition 37 | self.action = action 38 | 39 | def apply(self, sentence): 40 | matches = [] 41 | for m in finditer(self.condition, sentence): 42 | i, j = m.span() 43 | matches.extend(sentence[i:j]) 44 | if __name__ == '__main__': 45 | pass 46 | return self.action(matches) 47 | 48 | def who_is_question(x): 49 | select = u"?x0" 50 | 51 | sparql = None 52 | for w in x: 53 | if w.pos == "nr" or w.pos == "x": 54 | e = u" ?a :actor_chName '{person}'. 
\n \ 55 | ?a :actor_bio ?x0".format(person=w.token) 56 | 57 | sparql = SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, 58 | select=select, 59 | expression=INDENT + e) 60 | break 61 | return sparql 62 | 63 | def where_is_from_question(x): 64 | select = u"?x0" 65 | 66 | sparql = None 67 | for w in x: 68 | if w.pos == "nr" or w.pos == "x" or w.pos == "nrt": 69 | e = u" ?a :actor_chName '{person}'.\n \ 70 | ?a :actor_birthPlace ?x0".format(person=w.token) 71 | 72 | sparql = SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, 73 | select=select, 74 | expression=INDENT + e) 75 | break 76 | return sparql 77 | 78 | 79 | def movie_intro_question(x): 80 | select = u"?x0" 81 | 82 | sparql = None 83 | for w in x: 84 | if w.pos == "nz": 85 | e = u" ?a :movie_chName '{person}'. \n \ 86 | ?a :movie_bio ?x0".format(person=w.token) 87 | 88 | sparql = SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, 89 | select=select, 90 | expression=INDENT + e) 91 | break 92 | return sparql 93 | 94 | def customize_rules(): 95 | # some rules for matching 96 | # TODO: customize your own rules here 97 | person = (W(pos="nr") | W(pos="x") | W(pos="nrt") | W(pos="nz")) 98 | movie = (W(pos="nz")) 99 | place = (W("出生地") | W("出生")) 100 | intro = (W("简介") | W(pos="介绍")) 101 | 102 | rules = [ 103 | 104 | Rule(condition=W(pos="r") + W("是") + person | \ 105 | person + W("是") + W(pos="r"), 106 | action=who_is_question), 107 | 108 | Rule(condition=person + Star(Any(), greedy=False) + place + Star(Any(), greedy=False), 109 | action=where_is_from_question), 110 | 111 | Rule(condition=movie + Star(Any(), greedy=False) + intro + Star(Any(), greedy=False) , 112 | action=movie_intro_question) 113 | 114 | ] 115 | return rules 116 | -------------------------------------------------------------------------------- /KBQA/patternREfO/utils/word_tagging.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | """ 4 | 5 | @author: SimmerChan 6 | 7 | @contact: hsl7698590@gmail.com 8 | 9 | @file: word_tagging.py 10 | 11 | @time: 2017/12/20 15:31 12 | 13 | @desc: 定义Word类的结构;定义Tagger类,实现自然语言转为Word对象的方法。 14 | 15 | """ 16 | import jieba 17 | import jieba.posseg as pseg 18 | 19 | 20 | class Word(object): 21 | def __init__(self, token, pos): 22 | self.token = token 23 | self.pos = pos 24 | 25 | 26 | class Tagger: 27 | def __init__(self, dict_paths): 28 | # TODO 加载外部词典 29 | for p in dict_paths: 30 | jieba.load_userdict(p) 31 | 32 | def get_word_objects(self, sentence): 33 | """ 34 | Get :class:WOrd(token, pos) 35 | """ 36 | return [Word(bytes.decode(word.encode('utf-8')), tag) for word, tag in pseg.cut(sentence)] 37 | 38 | if __name__ == '__main__': 39 | tagger = Tagger(['../data/actorName.txt', '../data/movieName.txt']) 40 | while True: 41 | s = input() 42 | print("tagger.get_word_objects(s): ", tagger.get_word_objects(s)) 43 | for i in tagger.get_word_objects(s): 44 | print(i.token, i.pos) 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | **knowledge graph,从零开始构建知识图谱,涵盖基础知识、构建理论、构建实战,从理论到实现。** 3 | 4 | ## 一、基础知识 5 | 1. [知识图谱基础 之 一.知识图谱基本概念](https://www.ljjyy.com/archives/2019/11/100629.html) 6 | 2. [知识图谱基础 之 二.知识表示与知识建模](https://www.ljjyy.com/archives/2019/11/100605.html) 7 | 3. [知识图谱基础 之 三.知识抽取](https://www.ljjyy.com/archives/2019/11/100606.html) 8 | 4. [知识图谱基础 之 四.知识挖掘](https://www.ljjyy.com/archives/2019/11/100607.html) 9 | 5. 
[知识图谱基础 之 五.知识存储](https://www.ljjyy.com/archives/2019/11/100608.html) 10 | 6. [知识图谱基础 之 六.知识融合](https://www.ljjyy.com/archives/2019/11/100609.html) 11 | 7. [知识图谱基础 之 七.知识推理](https://www.ljjyy.com/archives/2019/11/100610.html) 12 | 8. [知识图谱基础 之 八.语义搜索](https://www.ljjyy.com/archives/2019/11/100611.html) 13 | 9. [知识图谱基础 之 九.知识问答](https://www.ljjyy.com/archives/2019/11/100612.html) 14 | 15 | ## 二、论文方面(构建理论) 16 | 17 | 论文主要推荐两篇文章 18 | 19 | 1. 清华大学杨玉基的“[一种准确而高效的领域知识图谱构建方法](http://www.doc88.com/p-9979131856838.html)”。讲述了怎么通过4步进行半自动话的构建领域知识图谱,参考价值极大,步骤清晰。 20 | 21 | 2. 华东理工大学胡芳槐的博士论文“[基于多种数据源的中文知识图谱构建方法研究](http://www.doc88.com/p-0784652186719.html)”,这篇文章讲了怎么通过多数据源去构建通用知识图谱和行业知识图谱,比较详细的介绍了一些构建技术,具备一定参考价值。 22 | 23 | ## 三、博客方面(构建实战) 24 | 25 | 《从零开始学习知识图谱》系列文章,通过实战码代码,一步一步教你怎么构建一个电影领域知识图谱及百科知识图谱。 26 | 1. [从零开始学习知识图谱(一):电影知识图谱构建 1.半结构化数据的获取](https://www.ljjyy.com/archives/2019/10/100591.html) 27 | 2. [从零开始学习知识图谱(二):电影知识图谱构建 2.结构化数据到RDF以及基于Apache jena交互](https://www.ljjyy.com/archives/2019/10/100592.html) 28 | 3. [从零开始学习知识图谱(三):电影知识图谱构建 3.基于REfO的简单知识问答](https://www.ljjyy.com/archives/2019/10/100593.html) 29 | 4. [从零开始学习知识图谱(四):电影知识图谱构建 4.基于ElasticSearch的简单语义搜索](https://www.ljjyy.com/archives/2019/10/100594.html) 30 | 5. [从零开始学习知识图谱(五):电影知识图谱构建 5.基于Deepdive非结构化文本关系抽取](https://www.ljjyy.com/archives/2019/10/100595.html) 31 | 6. [从零开始学习知识图谱(六):电影知识图谱构建 6.将关系型数据存入图数据库Neo4j](https://www.ljjyy.com/archives/2019/10/100596.html) 32 | 7. [从零开始学习知识图谱(七):百科知识图谱构建 1.百科类知识抽取](https://www.ljjyy.com/archives/2019/10/100597.html) 33 | 8. [从零开始学习知识图谱(八):百科知识图谱构建 2.数据清洗及存入图数据库Neo4j](https://www.ljjyy.com/archives/2019/10/100598.html) 34 | 9. [从零开始学习知识图谱(九):百科知识图谱构建 3.基于TensorFlow神经网络关系抽取的数据集构建(使用OpenNRE)](https://www.ljjyy.com/archives/2019/10/100599.html) 35 | 10. [从零开始学习知识图谱(十):百科知识图谱构建 4.结构化数据到RDF](https://www.ljjyy.com/archives/2019/10/100600.html) 36 | 11. [从零开始学习知识图谱(十一):百科知识图谱构建 5.Jena使用及SPARQL查询](https://www.ljjyy.com/archives/2019/10/100601.html) 37 | 12. [从零开始学习知识图谱(十二):百科知识图谱构建 6.基于Silk知识融合](https://www.ljjyy.com/archives/2019/10/100602.html) 38 | 13. 
[从零开始学习知识图谱(十三):百科知识图谱构建 7.基于Silk批量知识融合](https://www.ljjyy.com/archives/2019/10/100603.html) 39 | 40 | -------------------------------------------------------------------------------- /ie/craw/baidu_baike/baidu_baike/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/baidu_baike/baidu_baike/__init__.py -------------------------------------------------------------------------------- /ie/craw/baidu_baike/baidu_baike/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BaiduBaikeItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # Actor 15 | # 包含演员相关属性 16 | actor_id = scrapy.Field() 17 | actor_bio = scrapy.Field() 18 | actor_chName = scrapy.Field() 19 | actor_foreName = scrapy.Field() 20 | actor_nationality = scrapy.Field() 21 | actor_constellation = scrapy.Field() 22 | actor_birthPlace = scrapy.Field() 23 | actor_birthDay = scrapy.Field() 24 | actor_repWorks = scrapy.Field() 25 | actor_achiem = scrapy.Field() 26 | actor_brokerage = scrapy.Field() 27 | 28 | # movie 29 | # 电影相关属性 30 | movie_id = scrapy.Field() 31 | movie_bio = scrapy.Field() 32 | movie_chName = scrapy.Field() 33 | movie_foreName = scrapy.Field() 34 | movie_prodTime = scrapy.Field() 35 | movie_prodCompany = scrapy.Field() 36 | movie_director = scrapy.Field() 37 | movie_screenwriter = scrapy.Field() 38 | movie_genre = scrapy.Field() 39 | movie_star = scrapy.Field() 40 | movie_length = scrapy.Field() 41 | movie_rekeaseTime = scrapy.Field() 42 | movie_language = scrapy.Field() 43 | movie_achiem = scrapy.Field() 44 | -------------------------------------------------------------------------------- /ie/craw/baidu_baike/baidu_baike/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import random 10 | 11 | 12 | class WeixinSpiderSpiderMiddleware(object): 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, dict or Item objects. 
36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Response, dict 44 | # or Item objects. 45 | pass 46 | 47 | def process_start_requests(self, start_requests, spider): 48 | # Called with the start requests of the spider, and works 49 | # similarly to the process_spider_output() method, except 50 | # that it doesn’t have a response associated. 51 | 52 | # Must return only requests (not items). 53 | for r in start_requests: 54 | yield r 55 | 56 | def spider_opened(self, spider): 57 | spider.logger.info('Spider opened: %s' % spider.name) 58 | 59 | 60 | class RandomUserAgent: 61 | def __init__(self, agents): 62 | self.agents = [ 63 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 64 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0. 30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 65 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 66 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 67 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 68 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 69 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 70 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5" 71 | ] 72 | 73 | @classmethod 74 | def from_crawler(cls, crawler): 75 | # 获取settings的USER_AGENT列表并返回 76 | return cls(crawler.settings.getlist('USER_AGENTS')) 77 | 78 | def process_request(self, request, spider): 79 | # 随机设置Request报头header的User-Agent 80 | request.headers.setdefault('User-Agent', random.choice(self.agents)) 81 | 82 | 83 | # 添加代理 84 | 85 | class ProxyMiddleWare(object): 86 | proxy_list = [ 87 | "http://58.87.89.234:3128", 88 | "http://139.201.202.140:53281", 89 | "http://27.37.123.30:9000", 90 | "http://218.67.82.146:36709", 91 | "http://222.222.169.60:53281", 92 | "http://120.33.247.233:46884", 93 | "http://114.215.18.7:3128", 94 | "http://112.74.94.142:3128", 95 | "http://122.72.18.34:80", 96 | "http://36.33.25.123:808", 97 | "http://123.138.89.133:9999", 98 | "http://111.231.192.61:8080", 99 | "http://59.41.202.228:53281", 100 | "http://222.241.14.187:8888", 101 | "http://61.155.164.106:3128", 102 | "http://27.40.156.43:61234", 103 | "http://14.29.84.50:8080", 104 | "http://116.25.100.62:9797", 105 | "http://58.21.183.144:80", 106 | "http://14.221.166.205:9000", 107 | "http://115.231.50.10:53281", 108 | "http://120.34.205.40:808", 109 | "http://123.139.56.238:9999", 110 | "http://113.116.170.232:9000", 111 | "http://116.17.236.36:808", 112 | "http://114.232.163.73:34837", 113 | "http://171.35.103.37:808", 114 | "http://27.46.51.232:9797", 115 | "http://223.247.255.207:24714", 116 | "http://223.241.117.179:8010", 117 | "http://222.186.12.102:57624"] 118 | 119 | 120 | def process_request(self, request, spider): 121 | # if not request.meta['proxies']: 122 | ip = random.choice(self.proxy_list) 123 | request.meta['proxy'] = ip 
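    # The free proxies hard-coded above go stale quickly; if most requests start
    # failing, refresh the list or disable this middleware in DOWNLOADER_MIDDLEWARES.
    # One possible extension (a sketch, not part of the original class) is to swap in
    # a different proxy and retry when a download fails, using Scrapy's standard
    # downloader-middleware hook:
    #
    #     def process_exception(self, request, exception, spider):
    #         # returning the request re-schedules it with the newly chosen proxy
    #         request.meta['proxy'] = random.choice(self.proxy_list)
    #         return request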
124 | 125 | -------------------------------------------------------------------------------- /ie/craw/baidu_baike/baidu_baike/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | #import sys 12 | #from importlib import reload 13 | 14 | #reload(sys) 15 | #sys.setdefaultencoding('utf-8') 16 | 17 | import pymysql 18 | from pymysql import connections 19 | from baidu_baike import settings 20 | 21 | class BaiduBaikePipeline(object): 22 | def __init__(self): 23 | # 初始化并连接到mysql数据库 24 | self.conn = pymysql.connect( 25 | host=settings.HOST_IP, 26 | port=settings.PORT, 27 | user=settings.USER, 28 | passwd=settings.PASSWD, 29 | db=settings.DB_NAME, 30 | charset='utf8mb4', 31 | use_unicode=True 32 | ) 33 | self.cursor = self.conn.cursor() 34 | 35 | def process_item(self, item, spider): 36 | # process info for actor 37 | actor_chName = str(item['actor_chName']).encode('utf-8') 38 | actor_foreName = str(item['actor_foreName']).encode('utf-8') 39 | movie_chName = str(item['movie_chName']).encode('utf-8') 40 | movie_foreName = str(item['movie_foreName']).encode('utf-8') 41 | 42 | if (item['actor_chName'] != None or item['actor_foreName'] != None) and item['movie_chName'] == None: 43 | actor_bio = str(item['actor_bio']).encode('utf-8') 44 | actor_nationality = str(item['actor_nationality']).encode('utf-8') 45 | actor_constellation = str(item['actor_constellation']).encode('utf-8') 46 | actor_birthPlace = str(item['actor_birthPlace']).encode('utf-8') 47 | actor_birthDay = str(item['actor_birthDay']).encode('utf-8') 48 | actor_repWorks = str(item['actor_repWorks']).encode('utf-8') 49 | actor_achiem = str(item['actor_achiem']).encode('utf-8') 50 | actor_brokerage = str(item['actor_brokerage']).encode('utf-8') 51 | 52 | self.cursor.execute("SELECT actor_chName FROM actor;") 53 | actorList = self.cursor.fetchall() 54 | if (actor_chName,) not in actorList : 55 | # get the nums of actor_id in table actor 56 | self.cursor.execute("SELECT MAX(actor_id) FROM actor") 57 | result = self.cursor.fetchall()[0] 58 | if None in result: 59 | actor_id = 1 60 | else: 61 | actor_id = result[0] + 1 62 | sql = """ 63 | INSERT INTO actor(actor_id, actor_bio, actor_chName, actor_foreName, actor_nationality, actor_constellation, actor_birthPlace, actor_birthDay, actor_repWorks, actor_achiem, actor_brokerage ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 64 | """ 65 | self.cursor.execute(sql, (actor_id, actor_bio, actor_chName, actor_foreName, actor_nationality, actor_constellation, actor_birthPlace, actor_birthDay, actor_repWorks, actor_achiem, actor_brokerage )) 66 | self.conn.commit() 67 | else: 68 | print("#" * 20, "Got a duplict actor!!", actor_chName) 69 | elif (item['movie_chName'] != None or item['movie_foreName'] != None) and item['actor_chName'] == None: 70 | movie_bio = str(item['movie_bio']).encode('utf-8') 71 | movie_prodTime = str(item['movie_prodTime']).encode('utf-8') 72 | movie_prodCompany = str(item['movie_prodCompany']).encode('utf-8') 73 | movie_director = str(item['movie_director']).encode('utf-8') 74 | movie_screenwriter = str(item['movie_screenwriter']).encode('utf-8') 75 | movie_genre = 
str(item['movie_genre']).encode('utf-8') 76 | movie_star = str(item['movie_star']).encode('utf-8') 77 | movie_length = str(item['movie_length']).encode('utf-8') 78 | movie_rekeaseTime = str(item['movie_rekeaseTime']).encode('utf-8') 79 | movie_language = str(item['movie_language']).encode('utf-8') 80 | movie_achiem = str(item['movie_achiem']).encode('utf-8') 81 | 82 | self.cursor.execute("SELECT movie_chName FROM movie;") 83 | movieList = self.cursor.fetchall() 84 | if (movie_chName,) not in movieList : 85 | self.cursor.execute("SELECT MAX(movie_id) FROM movie") 86 | result = self.cursor.fetchall()[0] 87 | if None in result: 88 | movie_id = 1 89 | else: 90 | movie_id = result[0] + 1 91 | sql = """ 92 | INSERT INTO movie( movie_id, movie_bio, movie_chName, movie_foreName, movie_prodTime, movie_prodCompany, movie_director, movie_screenwriter, movie_genre, movie_star, movie_length, movie_rekeaseTime, movie_language, movie_achiem ) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 93 | """ 94 | self.cursor.execute(sql, ( movie_id, movie_bio, movie_chName, movie_foreName, movie_prodTime, movie_prodCompany, movie_director, movie_screenwriter, movie_genre, movie_star, movie_length, movie_rekeaseTime, movie_language, movie_achiem )) 95 | self.conn.commit() 96 | else: 97 | print("Got a duplict movie!!", movie_chName) 98 | else: 99 | print("Skip this page because wrong category!! ") 100 | return item 101 | def close_spider(self, spider): 102 | self.conn.close() 103 | -------------------------------------------------------------------------------- /ie/craw/baidu_baike/baidu_baike/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for baidu_baike project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'baidu_baike' 13 | 14 | SPIDER_MODULES = ['baidu_baike.spiders'] 15 | NEWSPIDER_MODULE = 'baidu_baike.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'baidu_baike (+http://www.yourdomain.com)' 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | # DOWNLOAD_DELAY = 3 30 | # import random 31 | # DOWNLOAD_DELAY = random.randint(0, 1) 32 | # The download delay setting will honor only one of: 33 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | # CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | # COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | # TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | # DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | # } 47 | 48 | # Enable or disable spider middlewares 49 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 50 | # SPIDER_MIDDLEWARES = { 51 | # 'baidu_baike.middlewares.BaiduBaikeSpiderMiddleware': 543, 52 | # } 53 | 54 | # Enable or disable downloader middlewares 55 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 56 | # DOWNLOADER_MIDDLEWARES = { 57 | # 'baidu_baike.middlewares.BaiduBaikeDownloaderMiddleware': 543, 58 | # } 59 | DOWNLOADER_MIDDLEWARES = { 60 | 'baidu_baike.middlewares.RandomUserAgent': 10, 61 | 'baidu_baike.middlewares.ProxyMiddleWare': 100, 62 | } 63 | # Enable or disable extensions 64 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 65 | # EXTENSIONS = { 66 | # 'scrapy.extensions.telnet.TelnetConsole': None, 67 | # } 68 | 69 | # Configure item pipelines 70 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 71 | # ITEM_PIPELINES = { 72 | # 'baidu_baike.pipelines.BaiduBaikePipeline': 300, 73 | # } 74 | ITEM_PIPELINES = { 75 | 'baidu_baike.pipelines.BaiduBaikePipeline': 300, 76 | } 77 | 78 | # HOST_IP = 'localhost' 79 | # PORT = 3306 80 | # USER = 'root' 81 | # PASSWD = 'root' 82 | # DB_NAME = 'kg_movie' 83 | HOST_IP = 'localhost' 84 | PORT = 3306 85 | USER = 'root' 86 | PASSWD = 'root' 87 | DB_NAME = 'baidu_baike' 88 | # Enable and configure the AutoThrottle extension (disabled by default) 89 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 90 | # AUTOTHROTTLE_ENABLED = True 91 | # The initial download delay 92 | # AUTOTHROTTLE_START_DELAY = 5 93 | # The maximum download delay to be set in case of high latencies 94 | # AUTOTHROTTLE_MAX_DELAY = 60 95 | # The average number of requests Scrapy should be sending in parallel to 96 | # each remote server 97 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 98 | # Enable showing throttling 
stats for every response received: 99 | # AUTOTHROTTLE_DEBUG = False 100 | 101 | # Enable and configure HTTP caching (disabled by default) 102 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 103 | # HTTPCACHE_ENABLED = True 104 | # HTTPCACHE_EXPIRATION_SECS = 0 105 | # HTTPCACHE_DIR = 'httpcache' 106 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 107 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 108 | -------------------------------------------------------------------------------- /ie/craw/baidu_baike/baidu_baike/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ie/craw/baidu_baike/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = baidu_baike.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = baidu_baike 12 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/craw_all_baidu/baidu_baike/__init__.py -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/craw_all_baidu/baidu_baike/commands/__init__.py -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/commands/crawlall.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from scrapy.commands import ScrapyCommand 5 | from scrapy.crawler import CrawlerRunner 6 | from scrapy.exceptions import UsageError 7 | from scrapy.utils.project import get_project_settings 8 | from scrapy.crawler import Crawler 9 | from scrapy.utils.conf import arglist_to_dict 10 | 11 | class Command(ScrapyCommand): 12 | 13 | requires_project = True 14 | 15 | def syntax(self): 16 | return '[options]' 17 | 18 | def short_desc(self): 19 | return 'Runs all of the spiders' 20 | 21 | def add_options(self, parser): 22 | ScrapyCommand.add_options(self, parser) 23 | parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", 24 | help="set spider argument (may be repeated)") 25 | parser.add_option("-o", "--output", metavar="FILE", 26 | help="dump scraped items into FILE (use - for stdout)") 27 | parser.add_option("-t", "--output-format", metavar="FORMAT", 28 | help="format to use for dumping items with -o") 29 | 30 | def process_options(self, args, opts): 31 | ScrapyCommand.process_options(self, args, opts) 32 | try: 33 | opts.spargs = arglist_to_dict(opts.spargs) 34 | except ValueError: 35 | raise UsageError("Invalid -a 
value, use -a NAME=VALUE", print_help=False) 36 | 37 | def run(self, args, opts): 38 | #settings = get_project_settings() 39 | 40 | spider_loader = self.crawler_process.spider_loader 41 | for spidername in args or spider_loader.list(): 42 | print("*********cralall spidername************" + spidername) 43 | self.crawler_process.crawl(spidername, **opts.spargs) 44 | 45 | self.crawler_process.start() 46 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BaiduBaikeItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | title_id = scrapy.Field() 16 | abstract = scrapy.Field() 17 | infobox = scrapy.Field() 18 | subject = scrapy.Field() 19 | disambi = scrapy.Field() 20 | redirect = scrapy.Field() 21 | curLink = scrapy.Field() 22 | interPic = scrapy.Field() 23 | interLink = scrapy.Field() 24 | exterLink = scrapy.Field() 25 | relateLemma = scrapy.Field() 26 | all_text = scrapy.Field() 27 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import random 10 | 11 | class WeixinSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | class RandomUserAgent: 59 | def __init__(self, agents): 60 | self.agents =[ 61 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 62 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0. 30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 63 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 64 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 65 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 66 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 67 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 68 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5" 69 | ] 70 | 71 | @classmethod 72 | def from_crawler(cls,crawler): 73 | # 获取settings的USER_AGENT列表并返回 74 | return cls(crawler.settings.getlist('USER_AGENTS')) 75 | def process_request(self, request, spider): 76 | # 随机设置Request报头header的User-Agent 77 | request.headers.setdefault('User-Agent', random.choice(self.agents)) 78 | 79 | # 添加代理 80 | 81 | class ProxyMiddleWare(object): 82 | proxy_list=[ 83 | "http://58.87.89.234:3128", 84 | "http://139.201.202.140:53281", 85 | "http://27.37.123.30:9000", 86 | "http://218.67.82.146:36709", 87 | "http://222.222.169.60:53281", 88 | "http://120.33.247.233:46884", 89 | "http://114.215.18.7:3128", 90 | "http://112.74.94.142:3128", 91 | "http://122.72.18.34:80", 92 | "http://36.33.25.123:808", 93 | "http://123.138.89.133:9999", 94 | "http://111.231.192.61:8080", 95 | "http://59.41.202.228:53281", 96 | "http://222.241.14.187:8888", 97 | "http://61.155.164.106:3128", 98 | "http://27.40.156.43:61234", 99 | "http://14.29.84.50:8080", 100 | "http://116.25.100.62:9797", 101 | "http://58.21.183.144:80", 102 | "http://14.221.166.205:9000", 103 | "http://115.231.50.10:53281", 104 | "http://120.34.205.40:808", 105 | "http://123.139.56.238:9999", 106 | "http://113.116.170.232:9000", 107 | "http://116.17.236.36:808", 108 | "http://114.232.163.73:34837", 109 | "http://171.35.103.37:808", 110 | "http://27.46.51.232:9797", 111 | "http://223.247.255.207:24714", 112 | "http://223.241.117.179:8010", 113 | "http://222.186.12.102:57624"] 114 | 115 | def process_request(self,request,spider): 116 | # if not request.meta['proxies']: 117 | ip = random.choice(self.proxy_list) 118 | request.meta['proxy'] = ip 119 | 120 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | 12 | import pymysql 13 | from pymysql import connections 14 | from baidu_baike import 
settings 15 | 16 | class BaiduBaikePipeline(object): 17 | def __init__(self): 18 | self.conn = pymysql.connect( 19 | host=settings.HOST_IP, 20 | port=settings.PORT, 21 | user=settings.USER, 22 | passwd=settings.PASSWD, 23 | db=settings.DB_NAME, 24 | charset='utf8mb4', 25 | use_unicode=True 26 | ) 27 | self.cursor = self.conn.cursor() 28 | 29 | def process_item(self, item, spider): 30 | # process info for actor 31 | title = str(item['title']).encode('utf-8') 32 | title_id = str(item['title_id']).encode('utf-8') 33 | abstract = str(item['abstract']).encode('utf-8') 34 | infobox = str(item['infobox']).encode('utf-8') 35 | subject = str(item['subject']).encode('utf-8') 36 | disambi = str(item['disambi']).encode('utf-8') 37 | redirect = str(item['redirect']).encode('utf-8') 38 | curLink = str(item['curLink']).encode('utf-8') 39 | interPic = str(item['interPic']).encode('utf-8') 40 | interLink = str(item['interLink']).encode('utf-8') 41 | exterLink = str(item['exterLink']).encode('utf-8') 42 | relateLemma = str(item['relateLemma']).encode('utf-8') 43 | all_text = str(item['all_text']).encode('utf-8') 44 | 45 | # self.cursor.execute("SELECT disambi FROM lemmas;") 46 | # disambi_list = self.cursor.fetchall() 47 | # if (disambi,) not in disambi_list : 48 | self.cursor.execute("SELECT MAX(title_id) FROM lemmas") 49 | result = self.cursor.fetchall()[0] 50 | if None in result: 51 | title_id = 1 52 | else: 53 | title_id = result[0] + 1 54 | sql = """ 55 | INSERT INTO lemmas(title, title_id, abstract, infobox, subject, disambi, redirect, curLink, interPic, interLink, exterLink, relateLemma, all_text ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 56 | """ 57 | try: 58 | # disambi_list = self.cursor.fetchall() 59 | # if (disambi, ) in disambi_list: 60 | # print ("result: ", disambi) 61 | self.cursor.execute(sql, (title, title_id, abstract, infobox, subject, disambi, redirect, curLink, interPic, interLink, exterLink, relateLemma, all_text )) 62 | self.conn.commit() 63 | # self.cursor.execute("SELECT disambi FROM lemmas" ) 64 | except Exception as e: 65 | print("#"*20, "\nAn error when insert into mysql!!\n") 66 | print("curLink: ", curLink, "\n") 67 | print(e, "\n", "#"*20) 68 | try: 69 | all_text = str('None').encode('utf-8').encode('utf-8') 70 | self.cursor.execute(sql, (title, title_id, abstract, infobox, subject, disambi, redirect, curLink, interPic, interLink, exterLink, relateLemma, all_text )) 71 | self.conn.commit() 72 | except Exception as f: 73 | print("Error without all_text!!!") 74 | return item 75 | 76 | def close_spider(self, spider): 77 | self.conn.close() 78 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for baidu_baike project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'baidu_baike' 13 | 14 | SPIDER_MODULES = ['baidu_baike.spiders'] 15 | NEWSPIDER_MODULE = 'baidu_baike.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'baidu_baike (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | CONCURRENT_REQUESTS = 300 26 | DOWNLOAD_TIMEOUT=30 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | #import random 32 | #DOWNLOAD_DELAY = random.randint(0, 1) 33 | # The download delay setting will honor only one of: 34 | CONCURRENT_REQUESTS_PER_DOMAIN = 100 35 | CONCURRENT_REQUESTS_PER_IP = 100 36 | 37 | # Disable cookies (enabled by default) 38 | COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'baidu_baike.middlewares.BaiduBaikeSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'baidu_baike.middlewares.BaiduBaikeDownloaderMiddleware': 543, 59 | #} 60 | DOWNLOADER_MIDDLEWARES = { 61 | 'baidu_baike.middlewares.RandomUserAgent': 10, 62 | 'baidu_baike.middlewares.ProxyMiddleWare': 100, 63 | } 64 | # Enable or disable extensions 65 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 72 | #ITEM_PIPELINES = { 73 | # 'baidu_baike.pipelines.BaiduBaikePipeline': 300, 74 | #} 75 | ITEM_PIPELINES = { 76 | 'baidu_baike.pipelines.BaiduBaikePipeline': 300, 77 | } 78 | 79 | HOST_IP = 'localhost' 80 | PORT = 3306 81 | USER = 'root' 82 | PASSWD = 'root' 83 | DB_NAME = 'baidu_duplicate' 84 | 85 | # Enable and configure the AutoThrottle extension (disabled by default) 86 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 87 | #AUTOTHROTTLE_ENABLED = True 88 | # The initial download delay 89 | #AUTOTHROTTLE_START_DELAY = 5 90 | # The maximum download delay to be set in case of high latencies 91 | #AUTOTHROTTLE_MAX_DELAY = 60 92 | # The average number of requests Scrapy should be sending in parallel to 93 | # each remote server 94 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 95 | # Enable showing throttling stats for every response received: 96 | #AUTOTHROTTLE_DEBUG = False 97 | 98 | # Enable and configure HTTP caching 
(disabled by default) 99 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 100 | #HTTPCACHE_ENABLED = True 101 | #HTTPCACHE_EXPIRATION_SECS = 0 102 | #HTTPCACHE_DIR = 'httpcache' 103 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 104 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 105 | 106 | COMMANDS_MODULE = 'baidu_baike.commands' 107 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from setuptools import setup, find_packages 5 | 6 | setup(name='scrapy-mymodule', 7 | entry_points={ 8 | 'scrapy.commands': [ 9 | 'crawlall=baidu_baike.commands:crawlall', 10 | ], 11 | }, 12 | ) 13 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/spiders/baidu_baike-2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # coding=utf-8 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | 9 | from baidu_baike.items import BaiduBaikeItem 10 | from scrapy.utils.log import configure_logging 11 | import scrapy 12 | from scrapy.crawler import CrawlerRunner 13 | from twisted.internet import reactor 14 | from scrapy.http import Request 15 | from bs4 import BeautifulSoup 16 | import re 17 | import urllib 18 | import json 19 | 20 | class BaiduBaikeSpider(scrapy.Spider, object): 21 | name = 'baidu2' 22 | allowed_domains = ["baike.baidu.com"] 23 | start_urls = ['https://baike.baidu.com/item/%E5%91%A8%E6%98%9F%E9%A9%B0/169917?fr=aladdin'] 24 | 25 | def _get_from_findall(self, tag_list): 26 | result = [] 27 | for slist in tag_list: 28 | tmp = slist.get_text() 29 | result.append(tmp) 30 | return result 31 | 32 | def parse(self, response): 33 | # tooooo ugly,,,, but can not use defaultdict 34 | item = BaiduBaikeItem() 35 | for sub_item in [ 'title', 'title_id', 'abstract', 'infobox', 'subject', 'disambi', 'redirect', 'curLink', 'interPic', 'interLink', 'exterLink', 'relateLemma']: 36 | item[sub_item] = None 37 | 38 | mainTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()").extract() 39 | subTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()").extract() 40 | redirect_name = response.xpath("//span[@class='viewTip-fromTitle']/text()").extract() 41 | try: 42 | item['title'] = ' '.join(mainTitle) 43 | except: 44 | item['title'] = None 45 | try: 46 | item['disambi'] = ' '.join(mainTitle + subTitle) 47 | except: 48 | item['disambi'] = None 49 | try: 50 | item['redirect'] = ' '.join(redirect_name) 51 | except: 52 | item['redirect'] = None 53 | try: 54 | item['curLink'] = str(response.url) 55 | except: 56 | item['curLink'] = None 57 | 58 | soup = BeautifulSoup(response.text, 'lxml') 59 | summary_node = soup.find("div", class_ = "lemma-summary") 60 | try: 61 | item['abstract'] = 
summary_node.get_text().replace("\n"," ") 62 | except: 63 | item['abstract'] = None 64 | 65 | page_category = response.xpath("//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract() 66 | page_category = [l.strip() for l in page_category] 67 | try: 68 | item['subject'] = ','.join(page_category) 69 | except: 70 | item['subject'] = None 71 | 72 | # Get infobox 73 | all_basicInfo_Item = soup.find_all("dt", class_="basicInfo-item name") 74 | basic_item = self._get_from_findall(all_basicInfo_Item) 75 | basic_item = [s.strip().replace('\n', ' ') for s in basic_item] 76 | all_basicInfo_value = soup.find_all("dd", class_ = "basicInfo-item value" ) 77 | basic_value = self._get_from_findall(all_basicInfo_value) 78 | basic_value = [s.strip().replace(u'收起', '') for s in basic_value] 79 | info_dict = {} 80 | for i, info in enumerate(basic_item): 81 | info_dict[info] = basic_value[i] 82 | try: 83 | item['infobox'] = json.dumps(info_dict) 84 | except: 85 | item['infobox'] = None 86 | 87 | # Get inter picture 88 | selector = scrapy.Selector(response) 89 | img_path = selector.xpath("//img[@class='picture']/@src").extract() 90 | try: 91 | item['interPic'] = ','.join(img_path) 92 | except: 93 | item['interPic'] = None 94 | 95 | inter_links_dict = {} 96 | soup = BeautifulSoup(response.text, 'lxml') 97 | inter_links = soup.find_all('a', href=re.compile(r"/item/")) 98 | for link in inter_links: 99 | new_url = link["href"] 100 | url_name = link.get_text() 101 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 102 | inter_links_dict[url_name] = new_full_url 103 | try: 104 | item['interLink'] = json.dumps(inter_links_dict) 105 | except: 106 | item['interLink'] = None 107 | 108 | exter_links_dict = {} 109 | soup = BeautifulSoup(response.text, 'lxml') 110 | exterLink_links = soup.find_all('a', href=re.compile(r"/redirect/")) 111 | for link in exterLink_links: 112 | new_url = link["href"] 113 | url_name = link.get_text() 114 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 115 | exter_links_dict[url_name] = new_full_url 116 | try: 117 | item['exterLink'] = json.dumps(exter_links_dict) 118 | except: 119 | item['exterLink'] = None 120 | 121 | all_para = soup.find_all('div',class_="para") 122 | all_text = [para.get_text() for para in all_para] 123 | try: 124 | item['all_text'] = ' '.join(all_text) 125 | except: 126 | item['all_text'] = None 127 | 128 | yield item 129 | 130 | soup = BeautifulSoup(response.text, 'lxml') 131 | links = soup.find_all('a', href=re.compile(r"/item/")) 132 | for link in links: 133 | new_url = link["href"] 134 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 135 | yield scrapy.Request(new_full_url, callback=self.parse) 136 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/spiders/baidu_baike-3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # coding=utf-8 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | 9 | from baidu_baike.items import BaiduBaikeItem 10 | from scrapy.utils.log import configure_logging 11 | import scrapy 12 | from scrapy.crawler import CrawlerRunner 13 | from twisted.internet import reactor 14 | from scrapy.http import Request 15 | from bs4 import BeautifulSoup 16 | import re 17 | import urllib 18 | import json 19 | 20 | class BaiduBaikeSpider(scrapy.Spider, object): 21 | 
name = 'baidu3' 22 | allowed_domains = ["baike.baidu.com"] 23 | start_urls = ['https://baike.baidu.com/item/%E4%B8%83%E5%B0%8F%E7%A6%8F'] 24 | 25 | def _get_from_findall(self, tag_list): 26 | result = [] 27 | for slist in tag_list: 28 | tmp = slist.get_text() 29 | result.append(tmp) 30 | return result 31 | 32 | def parse(self, response): 33 | # tooooo ugly,,,, but can not use defaultdict 34 | item = BaiduBaikeItem() 35 | for sub_item in [ 'title', 'title_id', 'abstract', 'infobox', 'subject', 'disambi', 'redirect', 'curLink', 'interPic', 'interLink', 'exterLink', 'relateLemma']: 36 | item[sub_item] = None 37 | 38 | mainTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()").extract() 39 | subTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()").extract() 40 | redirect_name = response.xpath("//span[@class='viewTip-fromTitle']/text()").extract() 41 | try: 42 | item['title'] = ' '.join(mainTitle) 43 | except: 44 | item['title'] = None 45 | try: 46 | item['disambi'] = ' '.join(mainTitle + subTitle) 47 | except: 48 | item['disambi'] = None 49 | try: 50 | item['redirect'] = ' '.join(redirect_name) 51 | except: 52 | item['redirect'] = None 53 | try: 54 | item['curLink'] = str(response.url) 55 | except: 56 | item['curLink'] = None 57 | 58 | soup = BeautifulSoup(response.text, 'lxml') 59 | summary_node = soup.find("div", class_ = "lemma-summary") 60 | try: 61 | item['abstract'] = summary_node.get_text().replace("\n"," ") 62 | except: 63 | item['abstract'] = None 64 | 65 | page_category = response.xpath("//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract() 66 | page_category = [l.strip() for l in page_category] 67 | try: 68 | item['subject'] = ','.join(page_category) 69 | except: 70 | item['subject'] = None 71 | 72 | # Get infobox 73 | all_basicInfo_Item = soup.find_all("dt", class_="basicInfo-item name") 74 | basic_item = self._get_from_findall(all_basicInfo_Item) 75 | basic_item = [s.strip().replace('\n', ' ') for s in basic_item] 76 | all_basicInfo_value = soup.find_all("dd", class_ = "basicInfo-item value" ) 77 | basic_value = self._get_from_findall(all_basicInfo_value) 78 | basic_value = [s.strip().replace(u'收起', '') for s in basic_value] 79 | info_dict = {} 80 | for i, info in enumerate(basic_item): 81 | info_dict[info] = basic_value[i] 82 | try: 83 | item['infobox'] = json.dumps(info_dict) 84 | except: 85 | item['infobox'] = None 86 | 87 | # Get inter picture 88 | selector = scrapy.Selector(response) 89 | img_path = selector.xpath("//img[@class='picture']/@src").extract() 90 | try: 91 | item['interPic'] = ','.join(img_path) 92 | except: 93 | item['interPic'] = None 94 | 95 | inter_links_dict = {} 96 | soup = BeautifulSoup(response.text, 'lxml') 97 | inter_links = soup.find_all('a', href=re.compile(r"/item/")) 98 | for link in inter_links: 99 | new_url = link["href"] 100 | url_name = link.get_text() 101 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 102 | inter_links_dict[url_name] = new_full_url 103 | try: 104 | item['interLink'] = json.dumps(inter_links_dict) 105 | except: 106 | item['interLink'] = None 107 | 108 | exter_links_dict = {} 109 | soup = BeautifulSoup(response.text, 'lxml') 110 | exterLink_links = soup.find_all('a', href=re.compile(r"/redirect/")) 111 | for link in exterLink_links: 112 | new_url = link["href"] 113 | url_name = link.get_text() 114 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 115 | exter_links_dict[url_name] = new_full_url 116 | try: 117 | 
item['exterLink'] = json.dumps(exter_links_dict) 118 | except: 119 | item['exterLink'] = None 120 | 121 | all_para = soup.find_all('div',class_="para") 122 | all_text = [para.get_text() for para in all_para] 123 | try: 124 | item['all_text'] = ' '.join(all_text) 125 | except: 126 | item['all_text'] = None 127 | 128 | yield item 129 | 130 | soup = BeautifulSoup(response.text, 'lxml') 131 | links = soup.find_all('a', href=re.compile(r"/item/")) 132 | for link in links: 133 | new_url = link["href"] 134 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 135 | yield scrapy.Request(new_full_url, callback=self.parse) 136 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/spiders/baidu_baike-4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # coding=utf-8 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | 9 | from baidu_baike.items import BaiduBaikeItem 10 | from scrapy.utils.log import configure_logging 11 | import scrapy 12 | from scrapy.crawler import CrawlerRunner 13 | from twisted.internet import reactor 14 | from scrapy.http import Request 15 | from bs4 import BeautifulSoup 16 | import re 17 | import urllib 18 | import json 19 | 20 | class BaiduBaikeSpider(scrapy.Spider, object): 21 | name = 'baidu4' 22 | allowed_domains = ["baike.baidu.com"] 23 | start_urls = ['https://baike.baidu.com/item/%E9%AB%98%E6%A3%98%E9%BE%99'] 24 | 25 | def _get_from_findall(self, tag_list): 26 | result = [] 27 | for slist in tag_list: 28 | tmp = slist.get_text() 29 | result.append(tmp) 30 | return result 31 | 32 | def parse(self, response): 33 | # tooooo ugly,,,, but can not use defaultdict 34 | item = BaiduBaikeItem() 35 | for sub_item in [ 'title', 'title_id', 'abstract', 'infobox', 'subject', 'disambi', 'redirect', 'curLink', 'interPic', 'interLink', 'exterLink', 'relateLemma']: 36 | item[sub_item] = None 37 | 38 | mainTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()").extract() 39 | subTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()").extract() 40 | redirect_name = response.xpath("//span[@class='viewTip-fromTitle']/text()").extract() 41 | try: 42 | item['title'] = ' '.join(mainTitle) 43 | except: 44 | item['title'] = None 45 | try: 46 | item['disambi'] = ' '.join(mainTitle + subTitle) 47 | except: 48 | item['disambi'] = None 49 | try: 50 | item['redirect'] = ' '.join(redirect_name) 51 | except: 52 | item['redirect'] = None 53 | try: 54 | item['curLink'] = str(response.url) 55 | except: 56 | item['curLink'] = None 57 | 58 | soup = BeautifulSoup(response.text, 'lxml') 59 | summary_node = soup.find("div", class_ = "lemma-summary") 60 | try: 61 | item['abstract'] = summary_node.get_text().replace("\n"," ") 62 | except: 63 | item['abstract'] = None 64 | 65 | page_category = response.xpath("//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract() 66 | page_category = [l.strip() for l in page_category] 67 | try: 68 | item['subject'] = ','.join(page_category) 69 | except: 70 | item['subject'] = None 71 | 72 | # Get infobox 73 | all_basicInfo_Item = soup.find_all("dt", class_="basicInfo-item name") 74 | basic_item = self._get_from_findall(all_basicInfo_Item) 75 | basic_item = [s.strip().replace('\n', ' ') for s in basic_item] 76 | all_basicInfo_value = soup.find_all("dd", class_ = "basicInfo-item value" ) 77 | 
basic_value = self._get_from_findall(all_basicInfo_value) 78 | basic_value = [s.strip().replace(u'收起', '') for s in basic_value] 79 | info_dict = {} 80 | for i, info in enumerate(basic_item): 81 | info_dict[info] = basic_value[i] 82 | try: 83 | item['infobox'] = json.dumps(info_dict) 84 | except: 85 | item['infobox'] = None 86 | 87 | # Get inter picture 88 | selector = scrapy.Selector(response) 89 | img_path = selector.xpath("//img[@class='picture']/@src").extract() 90 | try: 91 | item['interPic'] = ','.join(img_path) 92 | except: 93 | item['interPic'] = None 94 | 95 | inter_links_dict = {} 96 | soup = BeautifulSoup(response.text, 'lxml') 97 | inter_links = soup.find_all('a', href=re.compile(r"/item/")) 98 | for link in inter_links: 99 | new_url = link["href"] 100 | url_name = link.get_text() 101 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 102 | inter_links_dict[url_name] = new_full_url 103 | try: 104 | item['interLink'] = json.dumps(inter_links_dict) 105 | except: 106 | item['interLink'] = None 107 | 108 | exter_links_dict = {} 109 | soup = BeautifulSoup(response.text, 'lxml') 110 | exterLink_links = soup.find_all('a', href=re.compile(r"/redirect/")) 111 | for link in exterLink_links: 112 | new_url = link["href"] 113 | url_name = link.get_text() 114 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 115 | exter_links_dict[url_name] = new_full_url 116 | try: 117 | item['exterLink'] = json.dumps(exter_links_dict) 118 | except: 119 | item['exterLink'] = None 120 | 121 | all_para = soup.find_all('div',class_="para") 122 | all_text = [para.get_text() for para in all_para] 123 | try: 124 | item['all_text'] = ' '.join(all_text) 125 | except: 126 | item['all_text'] = None 127 | 128 | yield item 129 | 130 | soup = BeautifulSoup(response.text, 'lxml') 131 | links = soup.find_all('a', href=re.compile(r"/item/")) 132 | for link in links: 133 | new_url = link["href"] 134 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 135 | yield scrapy.Request(new_full_url, callback=self.parse) 136 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/spiders/baidu_baike-5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # coding=utf-8 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | 9 | from baidu_baike.items import BaiduBaikeItem 10 | from scrapy.utils.log import configure_logging 11 | import scrapy 12 | from scrapy.crawler import CrawlerRunner 13 | from twisted.internet import reactor 14 | from scrapy.http import Request 15 | from bs4 import BeautifulSoup 16 | import re 17 | import urllib 18 | import json 19 | 20 | class BaiduBaikeSpider(scrapy.Spider, object): 21 | name = 'baidu5' 22 | allowed_domains = ["baike.baidu.com"] 23 | start_urls = ['https://baike.baidu.com/item/剑龙'] 24 | 25 | def _get_from_findall(self, tag_list): 26 | result = [] 27 | for slist in tag_list: 28 | tmp = slist.get_text() 29 | result.append(tmp) 30 | return result 31 | 32 | def parse(self, response): 33 | # tooooo ugly,,,, but can not use defaultdict 34 | item = BaiduBaikeItem() 35 | for sub_item in [ 'title', 'title_id', 'abstract', 'infobox', 'subject', 'disambi', 'redirect', 'curLink', 'interPic', 'interLink', 'exterLink', 'relateLemma']: 36 | item[sub_item] = None 37 | 38 | mainTitle = 
response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()").extract() 39 | subTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()").extract() 40 | redirect_name = response.xpath("//span[@class='viewTip-fromTitle']/text()").extract() 41 | try: 42 | item['title'] = ' '.join(mainTitle) 43 | except: 44 | item['title'] = None 45 | try: 46 | item['disambi'] = ' '.join(mainTitle + subTitle) 47 | except: 48 | item['disambi'] = None 49 | try: 50 | item['redirect'] = ' '.join(redirect_name) 51 | except: 52 | item['redirect'] = None 53 | try: 54 | item['curLink'] = str(response.url) 55 | except: 56 | item['curLink'] = None 57 | 58 | soup = BeautifulSoup(response.text, 'lxml') 59 | summary_node = soup.find("div", class_ = "lemma-summary") 60 | try: 61 | item['abstract'] = summary_node.get_text().replace("\n"," ") 62 | except: 63 | item['abstract'] = None 64 | 65 | page_category = response.xpath("//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract() 66 | page_category = [l.strip() for l in page_category] 67 | try: 68 | item['subject'] = ','.join(page_category) 69 | except: 70 | item['subject'] = None 71 | 72 | # Get infobox 73 | all_basicInfo_Item = soup.find_all("dt", class_="basicInfo-item name") 74 | basic_item = self._get_from_findall(all_basicInfo_Item) 75 | basic_item = [s.strip().replace('\n', ' ') for s in basic_item] 76 | all_basicInfo_value = soup.find_all("dd", class_ = "basicInfo-item value" ) 77 | basic_value = self._get_from_findall(all_basicInfo_value) 78 | basic_value = [s.strip().replace(u'收起', '') for s in basic_value] 79 | info_dict = {} 80 | for i, info in enumerate(basic_item): 81 | info_dict[info] = basic_value[i] 82 | try: 83 | item['infobox'] = json.dumps(info_dict) 84 | except: 85 | item['infobox'] = None 86 | 87 | # Get inter picture 88 | selector = scrapy.Selector(response) 89 | img_path = selector.xpath("//img[@class='picture']/@src").extract() 90 | try: 91 | item['interPic'] = ','.join(img_path) 92 | except: 93 | item['interPic'] = None 94 | 95 | inter_links_dict = {} 96 | soup = BeautifulSoup(response.text, 'lxml') 97 | inter_links = soup.find_all('a', href=re.compile(r"/item/")) 98 | for link in inter_links: 99 | new_url = link["href"] 100 | url_name = link.get_text() 101 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 102 | inter_links_dict[url_name] = new_full_url 103 | try: 104 | item['interLink'] = json.dumps(inter_links_dict) 105 | except: 106 | item['interLink'] = None 107 | 108 | exter_links_dict = {} 109 | soup = BeautifulSoup(response.text, 'lxml') 110 | exterLink_links = soup.find_all('a', href=re.compile(r"/redirect/")) 111 | for link in exterLink_links: 112 | new_url = link["href"] 113 | url_name = link.get_text() 114 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 115 | exter_links_dict[url_name] = new_full_url 116 | try: 117 | item['exterLink'] = json.dumps(exter_links_dict) 118 | except: 119 | item['exterLink'] = None 120 | 121 | all_para = soup.find_all('div',class_="para") 122 | all_text = [para.get_text() for para in all_para] 123 | try: 124 | item['all_text'] = ' '.join(all_text) 125 | except: 126 | item['all_text'] = None 127 | 128 | yield item 129 | 130 | soup = BeautifulSoup(response.text, 'lxml') 131 | links = soup.find_all('a', href=re.compile(r"/item/")) 132 | for link in links: 133 | new_url = link["href"] 134 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 135 | yield scrapy.Request(new_full_url, callback=self.parse) 136 
| -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/spiders/baidu_baike-6.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # coding=utf-8 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | 9 | from baidu_baike.items import BaiduBaikeItem 10 | from scrapy.utils.log import configure_logging 11 | import scrapy 12 | from scrapy.crawler import CrawlerRunner 13 | from twisted.internet import reactor 14 | from scrapy.http import Request 15 | from bs4 import BeautifulSoup 16 | import re 17 | import urllib 18 | import json 19 | 20 | class BaiduBaikeSpider(scrapy.Spider, object): 21 | name = 'baidu6' 22 | allowed_domains = ["baike.baidu.com"] 23 | start_urls = ['https://baike.baidu.com/item/%E5%89%91%E9%BE%99/6817480#viewPageContent'] 24 | 25 | def _get_from_findall(self, tag_list): 26 | result = [] 27 | for slist in tag_list: 28 | tmp = slist.get_text() 29 | result.append(tmp) 30 | return result 31 | 32 | def parse(self, response): 33 | # tooooo ugly,,,, but can not use defaultdict 34 | item = BaiduBaikeItem() 35 | for sub_item in [ 'title', 'title_id', 'abstract', 'infobox', 'subject', 'disambi', 'redirect', 'curLink', 'interPic', 'interLink', 'exterLink', 'relateLemma']: 36 | item[sub_item] = None 37 | 38 | mainTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()").extract() 39 | subTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()").extract() 40 | redirect_name = response.xpath("//span[@class='viewTip-fromTitle']/text()").extract() 41 | try: 42 | item['title'] = ' '.join(mainTitle) 43 | except: 44 | item['title'] = None 45 | try: 46 | item['disambi'] = ' '.join(mainTitle + subTitle) 47 | except: 48 | item['disambi'] = None 49 | try: 50 | item['redirect'] = ' '.join(redirect_name) 51 | except: 52 | item['redirect'] = None 53 | try: 54 | item['curLink'] = str(response.url) 55 | except: 56 | item['curLink'] = None 57 | 58 | soup = BeautifulSoup(response.text, 'lxml') 59 | summary_node = soup.find("div", class_ = "lemma-summary") 60 | try: 61 | item['abstract'] = summary_node.get_text().replace("\n"," ") 62 | except: 63 | item['abstract'] = None 64 | 65 | page_category = response.xpath("//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract() 66 | page_category = [l.strip() for l in page_category] 67 | try: 68 | item['subject'] = ','.join(page_category) 69 | except: 70 | item['subject'] = None 71 | 72 | # Get infobox 73 | all_basicInfo_Item = soup.find_all("dt", class_="basicInfo-item name") 74 | basic_item = self._get_from_findall(all_basicInfo_Item) 75 | basic_item = [s.strip().replace('\n', ' ') for s in basic_item] 76 | all_basicInfo_value = soup.find_all("dd", class_ = "basicInfo-item value" ) 77 | basic_value = self._get_from_findall(all_basicInfo_value) 78 | basic_value = [s.strip().replace(u'收起', '') for s in basic_value] 79 | info_dict = {} 80 | for i, info in enumerate(basic_item): 81 | info_dict[info] = basic_value[i] 82 | try: 83 | item['infobox'] = json.dumps(info_dict) 84 | except: 85 | item['infobox'] = None 86 | 87 | # Get inter picture 88 | selector = scrapy.Selector(response) 89 | img_path = selector.xpath("//img[@class='picture']/@src").extract() 90 | try: 91 | item['interPic'] = ','.join(img_path) 92 | except: 93 | item['interPic'] = None 94 | 95 | inter_links_dict = {} 96 | soup = 
BeautifulSoup(response.text, 'lxml') 97 | inter_links = soup.find_all('a', href=re.compile(r"/item/")) 98 | for link in inter_links: 99 | new_url = link["href"] 100 | url_name = link.get_text() 101 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 102 | inter_links_dict[url_name] = new_full_url 103 | try: 104 | item['interLink'] = json.dumps(inter_links_dict) 105 | except: 106 | item['interLink'] = None 107 | 108 | exter_links_dict = {} 109 | soup = BeautifulSoup(response.text, 'lxml') 110 | exterLink_links = soup.find_all('a', href=re.compile(r"/redirect/")) 111 | for link in exterLink_links: 112 | new_url = link["href"] 113 | url_name = link.get_text() 114 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 115 | exter_links_dict[url_name] = new_full_url 116 | try: 117 | item['exterLink'] = json.dumps(exter_links_dict) 118 | except: 119 | item['exterLink'] = None 120 | 121 | all_para = soup.find_all('div',class_="para") 122 | all_text = [para.get_text() for para in all_para] 123 | try: 124 | item['all_text'] = ' '.join(all_text) 125 | except: 126 | item['all_text'] = None 127 | 128 | yield item 129 | 130 | soup = BeautifulSoup(response.text, 'lxml') 131 | links = soup.find_all('a', href=re.compile(r"/item/")) 132 | for link in links: 133 | new_url = link["href"] 134 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 135 | yield scrapy.Request(new_full_url, callback=self.parse) 136 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/spiders/baidu_baike-8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # coding=utf-8 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | 9 | from baidu_baike.items import BaiduBaikeItem 10 | from scrapy.utils.log import configure_logging 11 | import scrapy 12 | from scrapy.crawler import CrawlerRunner 13 | from twisted.internet import reactor 14 | from scrapy.http import Request 15 | from bs4 import BeautifulSoup 16 | import re 17 | import urllib 18 | import json 19 | 20 | class BaiduBaikeSpider(scrapy.Spider, object): 21 | name = 'baidu8' 22 | allowed_domains = ["baike.baidu.com"] 23 | start_urls = ['https://baike.baidu.com/item/%E8%9E%BA%E6%97%8B%E8%8A%A6%E8%8D%9F'] 24 | 25 | def _get_from_findall(self, tag_list): 26 | result = [] 27 | for slist in tag_list: 28 | tmp = slist.get_text() 29 | result.append(tmp) 30 | return result 31 | 32 | def parse(self, response): 33 | # tooooo ugly,,,, but can not use defaultdict 34 | item = BaiduBaikeItem() 35 | for sub_item in [ 'title', 'title_id', 'abstract', 'infobox', 'subject', 'disambi', 'redirect', 'curLink', 'interPic', 'interLink', 'exterLink', 'relateLemma']: 36 | item[sub_item] = None 37 | 38 | mainTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()").extract() 39 | subTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()").extract() 40 | redirect_name = response.xpath("//span[@class='viewTip-fromTitle']/text()").extract() 41 | try: 42 | item['title'] = ' '.join(mainTitle) 43 | except: 44 | item['title'] = None 45 | try: 46 | item['disambi'] = ' '.join(mainTitle + subTitle) 47 | except: 48 | item['disambi'] = None 49 | try: 50 | item['redirect'] = ' '.join(redirect_name) 51 | except: 52 | item['redirect'] = None 53 | try: 54 | item['curLink'] = str(response.url) 55 | except: 56 | 
item['curLink'] = None 57 | 58 | soup = BeautifulSoup(response.text, 'lxml') 59 | summary_node = soup.find("div", class_ = "lemma-summary") 60 | try: 61 | item['abstract'] = summary_node.get_text().replace("\n"," ") 62 | except: 63 | item['abstract'] = None 64 | 65 | page_category = response.xpath("//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract() 66 | page_category = [l.strip() for l in page_category] 67 | try: 68 | item['subject'] = ','.join(page_category) 69 | except: 70 | item['subject'] = None 71 | 72 | # Get infobox 73 | all_basicInfo_Item = soup.find_all("dt", class_="basicInfo-item name") 74 | basic_item = self._get_from_findall(all_basicInfo_Item) 75 | basic_item = [s.strip().replace('\n', ' ') for s in basic_item] 76 | all_basicInfo_value = soup.find_all("dd", class_ = "basicInfo-item value" ) 77 | basic_value = self._get_from_findall(all_basicInfo_value) 78 | basic_value = [s.strip().replace(u'收起', '') for s in basic_value] 79 | info_dict = {} 80 | for i, info in enumerate(basic_item): 81 | info_dict[info] = basic_value[i] 82 | try: 83 | item['infobox'] = json.dumps(info_dict) 84 | except: 85 | item['infobox'] = None 86 | 87 | # Get inter picture 88 | selector = scrapy.Selector(response) 89 | img_path = selector.xpath("//img[@class='picture']/@src").extract() 90 | try: 91 | item['interPic'] = ','.join(img_path) 92 | except: 93 | item['interPic'] = None 94 | 95 | inter_links_dict = {} 96 | soup = BeautifulSoup(response.text, 'lxml') 97 | inter_links = soup.find_all('a', href=re.compile(r"/item/")) 98 | for link in inter_links: 99 | new_url = link["href"] 100 | url_name = link.get_text() 101 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 102 | inter_links_dict[url_name] = new_full_url 103 | try: 104 | item['interLink'] = json.dumps(inter_links_dict) 105 | except: 106 | item['interLink'] = None 107 | 108 | exter_links_dict = {} 109 | soup = BeautifulSoup(response.text, 'lxml') 110 | exterLink_links = soup.find_all('a', href=re.compile(r"/redirect/")) 111 | for link in exterLink_links: 112 | new_url = link["href"] 113 | url_name = link.get_text() 114 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 115 | exter_links_dict[url_name] = new_full_url 116 | try: 117 | item['exterLink'] = json.dumps(exter_links_dict) 118 | except: 119 | item['exterLink'] = None 120 | 121 | all_para = soup.find_all('div',class_="para") 122 | all_text = [para.get_text() for para in all_para] 123 | try: 124 | item['all_text'] = ' '.join(all_text) 125 | except: 126 | item['all_text'] = None 127 | 128 | yield item 129 | 130 | soup = BeautifulSoup(response.text, 'lxml') 131 | links = soup.find_all('a', href=re.compile(r"/item/")) 132 | for link in links: 133 | new_url = link["href"] 134 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 135 | yield scrapy.Request(new_full_url, callback=self.parse) 136 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/creat_mysql.md: -------------------------------------------------------------------------------- 1 | DROP DATABASE baidu_duplicate; 2 | 3 | CREATE DATABASE baidu_duplicate; 4 | 5 | USE baidu_duplicate; 6 | 7 | CREATE TABLE lemmas( title VARCHAR(100), title_id INT NOT NULL, abstract TEXT, infobox TEXT, subject VARCHAR(100), disambi VARCHAR(100), redirect VARCHAR(100), curLink TEXT, interPic TEXT, interLink TEXT, exterLink TEXT, relateLemma TEXT, all_text TEXT, PRIMARY KEY(title_id)); 8 | 9 | ALTER TABLE lemmas 
CONVERT TO CHARACTER SET utf8 COLLATE utf8_general_ci; 10 | 11 | ALTER table lemmas ADD INDEX title_index(title); 12 | 13 | ALTER table lemmas ADD INDEX subject_index(subject); 14 | 15 | ALTER table lemmas ADD INDEX disambi_index(disambi); 16 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = baidu_baike.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = baidu_baike 12 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/craw_all_hudong/craw_all_hudong/__init__.py -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/craw_all_hudong/craw_all_hudong/commands/__init__.py -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/commands/crawlall.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from scrapy.commands import ScrapyCommand 5 | from scrapy.crawler import CrawlerRunner 6 | from scrapy.exceptions import UsageError 7 | from scrapy.utils.project import get_project_settings 8 | from scrapy.crawler import Crawler 9 | from scrapy.utils.conf import arglist_to_dict 10 | 11 | class Command(ScrapyCommand): 12 | 13 | requires_project = True 14 | 15 | def syntax(self): 16 | return '[options]' 17 | 18 | def short_desc(self): 19 | return 'Runs all of the spiders' 20 | 21 | def add_options(self, parser): 22 | ScrapyCommand.add_options(self, parser) 23 | parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", 24 | help="set spider argument (may be repeated)") 25 | parser.add_option("-o", "--output", metavar="FILE", 26 | help="dump scraped items into FILE (use - for stdout)") 27 | parser.add_option("-t", "--output-format", metavar="FORMAT", 28 | help="format to use for dumping items with -o") 29 | 30 | def process_options(self, args, opts): 31 | ScrapyCommand.process_options(self, args, opts) 32 | try: 33 | opts.spargs = arglist_to_dict(opts.spargs) 34 | except ValueError: 35 | raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False) 36 | 37 | def run(self, args, opts): 38 | #settings = get_project_settings() 39 | 40 | spider_loader = self.crawler_process.spider_loader 41 | for spidername in args or spider_loader.list(): 42 | print("*********cralall spidername************" + spidername) 43 | self.crawler_process.crawl(spidername, **opts.spargs) 44 | 45 | self.crawler_process.start() 46 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/items.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class CrawAllHudongItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | title_id = scrapy.Field() 16 | abstract = scrapy.Field() 17 | infobox = scrapy.Field() 18 | subject = scrapy.Field() 19 | disambi = scrapy.Field() 20 | redirect = scrapy.Field() 21 | curLink = scrapy.Field() 22 | interPic = scrapy.Field() 23 | interLink = scrapy.Field() 24 | exterLink = scrapy.Field() 25 | relateLemma = scrapy.Field() 26 | all_text = scrapy.Field() 27 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import random 10 | 11 | class HuDongSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | class RandomUserAgent: 59 | def __init__(self, agents): 60 | self.agents =[ 61 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 62 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0. 
30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 63 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 64 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 65 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 66 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 67 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 68 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5" 69 | ] 70 | 71 | @classmethod 72 | def from_crawler(cls,crawler): 73 | # 获取settings的USER_AGENT列表并返回 74 | return cls(crawler.settings.getlist('USER_AGENTS')) 75 | def process_request(self, request, spider): 76 | # 随机设置Request报头header的User-Agent 77 | request.headers.setdefault('User-Agent', random.choice(self.agents)) 78 | 79 | # 添加代理 80 | 81 | class ProxyMiddleWare(object): 82 | proxy_list=[ 83 | "http://58.87.89.234:3128", 84 | "http://139.201.202.140:53281", 85 | "http://27.37.123.30:9000", 86 | "http://218.67.82.146:36709", 87 | "http://222.222.169.60:53281", 88 | "http://120.33.247.233:46884", 89 | "http://114.215.18.7:3128", 90 | "http://112.74.94.142:3128", 91 | "http://122.72.18.34:80", 92 | "http://36.33.25.123:808", 93 | "http://123.138.89.133:9999", 94 | "http://111.231.192.61:8080", 95 | "http://59.41.202.228:53281", 96 | "http://222.241.14.187:8888", 97 | "http://61.155.164.106:3128", 98 | "http://27.40.156.43:61234", 99 | "http://14.29.84.50:8080", 100 | "http://116.25.100.62:9797", 101 | "http://58.21.183.144:80", 102 | "http://14.221.166.205:9000", 103 | "http://115.231.50.10:53281", 104 | "http://120.34.205.40:808", 105 | "http://123.139.56.238:9999", 106 | "http://113.116.170.232:9000", 107 | "http://116.17.236.36:808", 108 | "http://114.232.163.73:34837", 109 | "http://171.35.103.37:808", 110 | "http://27.46.51.232:9797", 111 | "http://223.247.255.207:24714", 112 | "http://223.241.117.179:8010", 113 | "http://222.186.12.102:57624"] 114 | 115 | def process_request(self,request,spider): 116 | # if not request.meta['proxies']: 117 | ip = random.choice(self.proxy_list) 118 | request.meta['proxy'] = ip 119 | 120 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | 12 | import pymysql 13 | from pymysql import connections 14 | from craw_all_hudong import settings 15 | 16 | class CrawAllHudongPipeline(object): 17 | def __init__(self): 18 | self.conn = pymysql.connect( 19 | host=settings.HOST_IP, 20 | port=settings.PORT, 21 | user=settings.USER, 22 | passwd=settings.PASSWD, 23 | db=settings.DB_NAME, 24 | charset='utf8mb4', 25 | use_unicode=True 26 | ) 27 | self.cursor = self.conn.cursor() 28 | 29 | def process_item(self, item, spider): 30 | # process info for actor 31 | title = str(item['title']).encode('utf-8') 32 | title_id = str(item['title_id']).encode('utf-8') 33 | abstract = 
str(item['abstract']).encode('utf-8') 34 | infobox = str(item['infobox']).encode('utf-8') 35 | subject = str(item['subject']).encode('utf-8') 36 | disambi = str(item['disambi']).encode('utf-8') 37 | redirect = str(item['redirect']).encode('utf-8') 38 | curLink = str(item['curLink']).encode('utf-8') 39 | interPic = str(item['interPic']).encode('utf-8') 40 | interLink = str(item['interLink']).encode('utf-8') 41 | exterLink = str(item['exterLink']).encode('utf-8') 42 | relateLemma = str(item['relateLemma']).encode('utf-8') 43 | all_text = str(item['all_text']).encode('utf-8') 44 | 45 | # self.cursor.execute("SELECT disambi FROM lemmas;") 46 | # disambi_list = self.cursor.fetchall() 47 | # if (disambi,) not in disambi_list : 48 | self.cursor.execute("SELECT MAX(title_id) FROM lemmas") 49 | result = self.cursor.fetchall()[0] 50 | if None in result: 51 | title_id = 1 52 | else: 53 | title_id = result[0] + 1 54 | sql = """ 55 | INSERT INTO lemmas(title, title_id, abstract, infobox, subject, disambi, redirect, curLink, interPic, interLink, exterLink, relateLemma, all_text ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 56 | """ 57 | try: 58 | # disambi_list = self.cursor.fetchall() 59 | # if (disambi, ) in disambi_list: 60 | # print ("result: ", disambi) 61 | self.cursor.execute(sql, (title, title_id, abstract, infobox, subject, disambi, redirect, curLink, interPic, interLink, exterLink, relateLemma, all_text )) 62 | self.conn.commit() 63 | # self.cursor.execute("SELECT disambi FROM lemmas" ) 64 | except Exception as e: 65 | print("#"*20, "\nAn error occurred while inserting into MySQL!\n") 66 | print("curLink: ", curLink, "\n") 67 | print(e, "\n", "#"*20) 68 | try: 69 | all_text = str('None').encode('utf-8') 70 | self.cursor.execute(sql, (title, title_id, abstract, infobox, subject, disambi, redirect, curLink, interPic, interLink, exterLink, relateLemma, all_text )) 71 | self.conn.commit() 72 | except Exception as f: 73 | print("Insert failed even after clearing all_text:", f) 74 | return item 75 | 76 | def close_spider(self, spider): 77 | self.conn.close() 78 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for craw_all_hudong project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'craw_all_hudong' 13 | 14 | SPIDER_MODULES = ['craw_all_hudong.spiders'] 15 | NEWSPIDER_MODULE = 'craw_all_hudong.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'craw_all_hudong (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | CONCURRENT_REQUESTS = 64 26 | DOWNLOAD_TIMEOUT=30 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | #import random 32 | #DOWNLOAD_DELAY = random.randint(0, 1) 33 | # The download delay setting will honor only one of: 34 | CONCURRENT_REQUESTS_PER_DOMAIN = 30 35 | CONCURRENT_REQUESTS_PER_IP = 30 36 | 37 | # Disable cookies (enabled by default) 38 | COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'craw_all_hudong.middlewares.hudongBaikeSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'craw_all_hudong.middlewares.hudongBaikeDownloaderMiddleware': 543, 59 | #} 60 | DOWNLOADER_MIDDLEWARES = { 61 | 'craw_all_hudong.middlewares.RandomUserAgent': 10, 62 | 'craw_all_hudong.middlewares.ProxyMiddleWare': 100, 63 | } 64 | # Enable or disable extensions 65 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 72 | #ITEM_PIPELINES = { 73 | # 'craw_all_hudong.pipelines.hudongBaikePipeline': 300, 74 | #} 75 | ITEM_PIPELINES = { 76 | 'craw_all_hudong.pipelines.CrawAllHudongPipeline': 300, 77 | } 78 | 79 | HOST_IP = 'localhost' 80 | PORT = 3306 81 | USER = 'root' 82 | PASSWD = 'root' 83 | DB_NAME = 'hudong_fenlei' 84 | 85 | # Enable and configure the AutoThrottle extension (disabled by default) 86 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 87 | #AUTOTHROTTLE_ENABLED = True 88 | # The initial download delay 89 | #AUTOTHROTTLE_START_DELAY = 5 90 | # The maximum download delay to be set in case of high latencies 91 | #AUTOTHROTTLE_MAX_DELAY = 60 92 | # The average number of requests Scrapy should be sending in parallel to 93 | # each remote server 94 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 95 | # Enable showing throttling stats for every response received: 96 | #AUTOTHROTTLE_DEBUG = False 97 | 
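# NOTE: HOST_IP, PORT, USER, PASSWD and DB_NAME above are imported directly by pipelines.py
# ("from craw_all_hudong import settings"), so they only need to be edited here; they are
# expected to match the `hudong_fenlei` database created in creat_mysql.md. A minimal,
# optional connection self-test (assumes pymysql is installed and MySQL runs locally):
#
#   python -c "import pymysql; pymysql.connect(host='localhost', port=3306, user='root', passwd='root', db='hudong_fenlei', charset='utf8mb4').close()"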
98 | # Enable and configure HTTP caching (disabled by default) 99 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 100 | #HTTPCACHE_ENABLED = True 101 | #HTTPCACHE_EXPIRATION_SECS = 0 102 | #HTTPCACHE_DIR = 'httpcache' 103 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 104 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 105 | 106 | COMMANDS_MODULE = 'craw_all_hudong.commands' 107 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from setuptools import setup, find_packages 5 | 6 | setup(name='scrapy-mymodule', 7 | entry_points={ 8 | 'scrapy.commands': [ 9 | 'crawlall=craw_all_hudong.commands:crawlall', 10 | ], 11 | }, 12 | ) 13 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/creat_mysql.md: -------------------------------------------------------------------------------- 1 | #DROP DATABASE hudong_duplicate; 2 | 3 | CREATE DATABASE hudong_fenlei; 4 | 5 | USE hudong_fenlei; 6 | 7 | CREATE TABLE lemmas( title VARCHAR(100), title_id INT NOT NULL, abstract TEXT, infobox TEXT, subject VARCHAR(500), disambi VARCHAR(100), redirect VARCHAR(100), curLink TEXT, interPic TEXT, interLink TEXT, exterLink TEXT, relateLemma TEXT, all_text TEXT, PRIMARY KEY(title_id)); 8 | 9 | ALTER TABLE lemmas CONVERT TO CHARACTER SET utf8 COLLATE utf8_general_ci; 10 | 11 | ALTER table lemmas ADD INDEX title_index(title); 12 | 13 | #ALTER table lemmas ADD INDEX subject_index(subject); 14 | 15 | ALTER table lemmas ADD INDEX disambi_index(disambi); 16 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = craw_all_hudong.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = craw_all_hudong 12 | -------------------------------------------------------------------------------- /ie/craw/craw_without_spider/mysql/creat_sql.txt: -------------------------------------------------------------------------------- 1 | # Commands to creat mysql database and tables. This database includes actors and films. 
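# Example (a sketch, kept commented out): once the tables below have been populated, an
# actor's films can be listed by joining through the actor_to_movie relation; '<actor name>'
# is a placeholder value:
#
#   SELECT m.movie_chName
#   FROM actor a
#   JOIN actor_to_movie am ON am.actor_id = a.actor_id
#   JOIN movie m ON m.movie_id = am.movie_id
#   WHERE a.actor_chName = '<actor name>';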
2 | 3 | # 演员 : ID, 简介, 中文名,外文名,国籍,星座,出生地,出生日期,代表作品,主要成就,经纪公司; 4 | # actor: actor_id, actor_bio, actor_chName, actor_foreName, actor_nationality, actor_constellation, actor_birthPlace, actor_birthDay, actor_repWorks, actor_achiem, actor_brokerage; 5 | 6 | # 电影作品:ID,简介,中文名,外文名,出品时间,出品公司,导演,编剧,类型,主演,片长,上映时间,对白语言,主要成就; 7 | # movie: movie_id, movie_bio, movie_chName, movie_foreName, movie_prodTime, movie_prodCompany, movie_director, movie_screenwriter, movie_genre, movie_star, movie_length, movie_rekeaseTime, movie_language, movie_achiem; 8 | 9 | # 电影类型:爱情,喜剧,动作,剧情,科幻,恐怖,动画,惊悚,犯罪,冒险,其他; 10 | # genre: genre_id, genre_name 11 | 12 | # 演员->电影: 演员ID, 电影ID; 13 | # actor_to_movie: actor_id, movie_id; 14 | 15 | # 电影-> 类型: 电影ID, 类型ID 16 | # movie_to_genre: movie_id, genre_id 17 | 18 | CREATE DATABASE kg_movie; 19 | USE kg_movie; 20 | 21 | CREATE TABLE actor( actor_id INT NOT NULL, actor_bio TEXT, actor_chName VARCHAR(100), actor_foreName VARCHAR(100), actor_nationality VARCHAR(100), actor_constellation VARCHAR(100), actor_birthPlace VARCHAR(100), actor_birthDay VARCHAR(100), actor_repWorks VARCHAR(100), actor_achiem TEXT, actor_brokerage VARCHAR(100), PRIMARY KEY(actor_id) ); 22 | 23 | CREATE TABLE movie( movie_id INT NOT NULL, movie_bio TEXT, movie_chName VARCHAR(100), movie_foreName VARCHAR(100), movie_prodTime VARCHAR(100), movie_prodCompany VARCHAR(100), movie_director VARCHAR(100), movie_screenwriter VARCHAR(100), movie_genre VARCHAR(100), movie_star VARCHAR(100), movie_length VARCHAR(100), movie_rekeaseTime VARCHAR(100), movie_language VARCHAR(100), movie_achiem TEXT, PRIMARY KEY(movie_id) ); 24 | 25 | CREATE TABLE actor_to_movie( actor_movie_id INT NOT NULL, actor_id INT NOT NULL, movie_id INT NOT NULL, PRIMARY KEY(actor_movie_id) ); 26 | 27 | CREATE TABLE genre ( genre_id INT NOT NULL, genre_name VARCHAR(100), PRIMARY KEY(genre_id) ); 28 | # Set char Set 29 | ALTER TABLE actor CONVERT TO CHARACTER SET utf8 COLLATE utf8_general_ci; 30 | ALTER TABLE movie CONVERT TO CHARACTER SET utf8 COLLATE utf8_general_ci; 31 | ALTER TABLE genre CONVERT TO CHARACTER SET utf8 COLLATE utf8_general_ci; 32 | 33 | INSERT INTO genre (genre_id, genre_name) VALUES (0, '爱情'), (1, '喜剧'), (2, '动作'), (3, '剧情'), (4, '科幻'), (5, '恐怖'), (6, '动画'), (7, '惊悚'), (8, '犯罪'), (9, '冒险'), (10, '其他'); 34 | 35 | CREATE TABLE movie_to_genre( movie_genre_id INT NOT NULL, movie_id INT NOT NULL, genre_id INT NOT NULL, PRIMARY KEY(movie_genre_id) ); 36 | -------------------------------------------------------------------------------- /ie/craw/craw_without_spider/mysql/help_mysql.txt: -------------------------------------------------------------------------------- 1 | # 修改mysql 中默认字符集到utf8 2 | 3 | ALTER TABLE table_name CONVERT TO CHARACTER SET utf8 COLLATE utf8_general_ci; 4 | 5 | 查看自己的字符集 6 | 7 | SHOW FULL COLUMNS FROM table_name; 8 | 9 | 更改表中某一属性的类型:把actor_achiem 变为TEXT 10 | 11 | ALTER TABLE actor CHANGE actor_achiem actor_achiem TEXT; 12 | -------------------------------------------------------------------------------- /ie/craw/craw_without_spider/utils/basic_info.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | ''' 5 | 包含各个表的属性定义等和程序逻辑无关的部分 6 | ''' 7 | 8 | insert_actor_command = 'INSERT INTO actor (actor_id, actor_bio, actor_chName, actor_foreName, actor_nationality, actor_constellation, actor_birthPlace, actor_birthDay, actor_repWorks, actor_achiem, actor_brokerage ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ' 9 | insert_movie_command = 
'INSERT INTO movie (movie_id, movie_bio, movie_chName, movie_foreName, movie_prodTime, movie_prodCompany, movie_director, movie_screenwriter, movie_genre, movie_star, movie_length, movie_rekeaseTime, movie_language, movie_achiem ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s ) ' 10 | insert_actor_movie_command = 'INSERT INTO actor_to_movie (actor_movie_id, actor_id, movie_id ) VALUES (%s, %s, %s ) ' 11 | insert_movie_genre_command = 'INSERT INTO movie_to_genre (movie_genre_id, movie_id, genre_id ) VALUES (%s, %s, %s ) ' # id 是整数,pymysql不支持%i %d这种,都用%s 12 | 13 | search_actor_id = 'SELECT actor_id FROM actor WHERE actor_chName= "%s" ' 14 | search_movie_id = 'SELECT movie_id FROM movie WHERE movie_chName= "%s" ' 15 | 16 | get_largest_amid = 'SELECT max(actor_movie_id) FROM actor_to_movie ' 17 | get_largest_mgid = 'SELECT max(movie_genre_id) FROM movie_to_genre ' 18 | 19 | actor_attr = { 20 | u'id' : int, 21 | u'简介': None, 22 | u'中文名': None, 23 | u'外文名': None, 24 | u'国籍': None, 25 | u'星座': None, 26 | u'出生地': None, 27 | u'出生日期': None, 28 | u'代表作品': None, 29 | u'主要成就' : None, 30 | u'经纪公司': None 31 | } 32 | actor_info = [u'id', u'简介', u'中文名', u'外文名', u'国籍', u'星座', u'出生地', u'出生日期', u'代表作品', u'主要成就', u'经纪公司'] 33 | 34 | 35 | movie_attr = { 36 | u'id' : int, 37 | u'简介': None, 38 | u'中文名': None, 39 | u'外文名': None, 40 | u'出品时间': None, 41 | u'出品公司': None, 42 | u'导演': None, 43 | u'编剧': None, 44 | u'类型': None, 45 | u'主演' : None, 46 | u'片长': None, 47 | u'上映时间': None, 48 | u'对白语言': None, 49 | u'主要成就': None 50 | } 51 | movie_info = [u'id', u'简介', u'中文名', u'外文名', u'出品时间', u'出品公司', u'导演', u'编剧', u'类型', u'主演', u'片长', u'上映时间', u'对白语言', u'主要成就' ] 52 | 53 | movie_genre = { 54 | u'爱情': 0, 55 | u'喜剧': 1, 56 | u'动作': 2, 57 | u'剧情': 3, 58 | u'科幻': 4, 59 | u'恐怖': 5, 60 | u'动画': 6, 61 | u'惊悚': 7, 62 | u'犯罪': 8, 63 | u'冒险': 9, 64 | u'其他': 10 65 | } 66 | 67 | 68 | -------------------------------------------------------------------------------- /ie/craw/hudong_baike/hudong_baike/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/hudong_baike/hudong_baike/__init__.py -------------------------------------------------------------------------------- /ie/craw/hudong_baike/hudong_baike/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class HudongBaikeItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # Actor 15 | actor_id = scrapy.Field() 16 | actor_bio = scrapy.Field() 17 | actor_chName = scrapy.Field() 18 | actor_foreName = scrapy.Field() 19 | actor_nationality = scrapy.Field() 20 | actor_constellation = scrapy.Field() 21 | actor_birthPlace = scrapy.Field() 22 | actor_birthDay = scrapy.Field() 23 | actor_repWorks = scrapy.Field() 24 | actor_achiem = scrapy.Field() 25 | actor_brokerage = scrapy.Field() 26 | 27 | # movie 28 | 29 | movie_id = scrapy.Field() 30 | movie_bio = scrapy.Field() 31 | movie_chName = scrapy.Field() 32 | movie_foreName = scrapy.Field() 33 | movie_prodTime = scrapy.Field() 34 | movie_prodCompany = scrapy.Field() 35 | movie_director = scrapy.Field() 36 | movie_screenwriter = scrapy.Field() 37 | movie_genre = scrapy.Field() 38 | movie_star = 
scrapy.Field() 39 | movie_length = scrapy.Field() 40 | movie_rekeaseTime = scrapy.Field() 41 | movie_language = scrapy.Field() 42 | movie_achiem = scrapy.Field() 43 | -------------------------------------------------------------------------------- /ie/craw/hudong_baike/hudong_baike/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import random 10 | 11 | class WeixinSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | class RandomUserAgent: 59 | def __init__(self, agents): 60 | self.agents =[ 61 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 62 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0. 
30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 63 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 64 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 65 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 66 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 67 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 68 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5" 69 | ] 70 | 71 | @classmethod 72 | def from_crawler(cls,crawler): 73 | # 获取settings的USER_AGENT列表并返回 74 | return cls(crawler.settings.getlist('USER_AGENTS')) 75 | def process_request(self, request, spider): 76 | # 随机设置Request报头header的User-Agent 77 | request.headers.setdefault('User-Agent', random.choice(self.agents)) 78 | 79 | # 添加代理 80 | 81 | class ProxyMiddleWare(object): 82 | proxy_list=[ 83 | "http://58.87.89.234:3128", 84 | "http://139.201.202.140:53281", 85 | "http://27.37.123.30:9000", 86 | "http://218.67.82.146:36709", 87 | "http://222.222.169.60:53281", 88 | "http://120.33.247.233:46884", 89 | "http://114.215.18.7:3128", 90 | "http://112.74.94.142:3128", 91 | "http://122.72.18.34:80", 92 | "http://36.33.25.123:808", 93 | "http://123.138.89.133:9999", 94 | "http://111.231.192.61:8080", 95 | "http://59.41.202.228:53281", 96 | "http://222.241.14.187:8888", 97 | "http://61.155.164.106:3128", 98 | "http://27.40.156.43:61234", 99 | "http://14.29.84.50:8080", 100 | "http://116.25.100.62:9797", 101 | "http://58.21.183.144:80", 102 | "http://14.221.166.205:9000", 103 | "http://115.231.50.10:53281", 104 | "http://120.34.205.40:808", 105 | "http://123.139.56.238:9999", 106 | "http://113.116.170.232:9000", 107 | "http://116.17.236.36:808", 108 | "http://114.232.163.73:34837", 109 | "http://171.35.103.37:808", 110 | "http://27.46.51.232:9797", 111 | "http://223.247.255.207:24714", 112 | "http://223.241.117.179:8010", 113 | "http://222.186.12.102:57624"] 114 | 115 | def process_request(self,request,spider): 116 | # if not request.meta['proxies']: 117 | ip = random.choice(self.proxy_list) 118 | request.meta['proxy'] = ip 119 | 120 | -------------------------------------------------------------------------------- /ie/craw/hudong_baike/hudong_baike/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import pymysql 12 | from hudong_baike import settings 13 | 14 | 15 | class HudongBaikePipeline(object): 16 | def __init__(self): 17 | self.conn = pymysql.connect( 18 | host=settings.HOST_IP, 19 | # port=settings.PORT, 20 | user=settings.USER, 21 | passwd=settings.PASSWD, 22 | db=settings.DB_NAME, 23 | charset='utf8mb4', 24 | use_unicode=True 25 | ) 26 | self.cursor = self.conn.cursor() 27 | 28 | def process_item(self, item, spider): 29 | # process info for actor 30 | actor_chName = str(item['actor_chName']).encode('utf-8') 31 | actor_foreName = str(item['actor_foreName']).encode('utf-8') 32 | movie_chName = 
str(item['movie_chName']).encode('utf-8') 33 | movie_foreName = str(item['movie_foreName']).encode('utf-8') 34 | 35 | if (item['actor_chName'] != None or item['actor_foreName'] != None) and item['movie_chName'] == None: 36 | actor_bio = str(item['actor_bio']).encode('utf-8') 37 | actor_nationality = str(item['actor_nationality']).encode('utf-8') 38 | actor_constellation = str(item['actor_constellation']).encode('utf-8') 39 | actor_birthPlace = str(item['actor_birthPlace']).encode('utf-8') 40 | actor_birthDay = str(item['actor_birthDay']).encode('utf-8') 41 | actor_repWorks = str(item['actor_repWorks']).encode('utf-8') 42 | actor_achiem = str(item['actor_achiem']).encode('utf-8') 43 | actor_brokerage = str(item['actor_brokerage']).encode('utf-8') 44 | 45 | self.cursor.execute("SELECT actor_chName FROM actor;") 46 | actorList = self.cursor.fetchall() 47 | if (actor_chName,) not in actorList: 48 | # get the nums of actor_id in table actor 49 | self.cursor.execute("SELECT MAX(actor_id) FROM actor") 50 | result = self.cursor.fetchall()[0] 51 | if None in result: 52 | actor_id = 1 53 | else: 54 | actor_id = result[0] + 1 55 | sql = """ 56 | INSERT INTO actor(actor_id, actor_bio, actor_chName, actor_foreName, actor_nationality, actor_constellation, actor_birthPlace, actor_birthDay, actor_repWorks, actor_achiem, actor_brokerage ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 57 | """ 58 | self.cursor.execute(sql, ( 59 | actor_id, actor_bio, actor_chName, actor_foreName, actor_nationality, actor_constellation, 60 | actor_birthPlace, actor_birthDay, actor_repWorks, actor_achiem, actor_brokerage)) 61 | self.conn.commit() 62 | else: 63 | print("#" * 20, "Got a duplict actor!!", actor_chName) 64 | elif (item['movie_chName'] != None or item['movie_foreName'] != None) and item['actor_chName'] == None: 65 | movie_bio = str(item['movie_bio']).encode('utf-8') 66 | movie_prodTime = str(item['movie_prodTime']).encode('utf-8') 67 | movie_prodCompany = str(item['movie_prodCompany']).encode('utf-8') 68 | movie_director = str(item['movie_director']).encode('utf-8') 69 | movie_screenwriter = str(item['movie_screenwriter']).encode('utf-8') 70 | movie_genre = str(item['movie_genre']).encode('utf-8') 71 | movie_star = str(item['movie_star']).encode('utf-8') 72 | movie_length = str(item['movie_length']).encode('utf-8') 73 | movie_rekeaseTime = str(item['movie_rekeaseTime']).encode('utf-8') 74 | movie_language = str(item['movie_language']).encode('utf-8') 75 | movie_achiem = str(item['movie_achiem']).encode('utf-8') 76 | 77 | self.cursor.execute("SELECT movie_chName FROM movie;") 78 | movieList = self.cursor.fetchall() 79 | if (movie_chName,) not in movieList: 80 | self.cursor.execute("SELECT MAX(movie_id) FROM movie") 81 | result = self.cursor.fetchall()[0] 82 | if None in result: 83 | movie_id = 1 84 | else: 85 | movie_id = result[0] + 1 86 | sql = """ 87 | INSERT INTO movie( movie_id, movie_bio, movie_chName, movie_foreName, movie_prodTime, movie_prodCompany, movie_director, movie_screenwriter, movie_genre, movie_star, movie_length, movie_rekeaseTime, movie_language, movie_achiem ) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 88 | """ 89 | self.cursor.execute(sql, ( 90 | movie_id, movie_bio, movie_chName, movie_foreName, movie_prodTime, movie_prodCompany, movie_director, 91 | movie_screenwriter, movie_genre, movie_star, movie_length, movie_rekeaseTime, movie_language, 92 | movie_achiem)) 93 | self.conn.commit() 94 | else: 95 | print("Got a duplict movie!!", movie_chName) 96 | else: 97 | 
print("Skip this page because wrong category!! ") 98 | return item 99 | 100 | def close_spider(self, spider): 101 | self.conn.close() 102 | -------------------------------------------------------------------------------- /ie/craw/hudong_baike/hudong_baike/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for hudong_baike project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'hudong_baike' 13 | 14 | SPIDER_MODULES = ['hudong_baike.spiders'] 15 | NEWSPIDER_MODULE = 'hudong_baike.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'hudong_baike (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | import random 32 | DOWNLOAD_DELAY = random.randint(0, 1) 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | #CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | #COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'hudong_baike.middlewares.HudongBaikeSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'hudong_baike.middlewares.HudongBaikeDownloaderMiddleware': 543, 59 | #} 60 | DOWNLOADER_MIDDLEWARES = { 61 | 'hudong_baike.middlewares.RandomUserAgent': 10, 62 | 'hudong_baike.middlewares.ProxyMiddleWare': 100, 63 | } 64 | # Enable or disable extensions 65 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 72 | #ITEM_PIPELINES = { 73 | # 'hudong_baike.pipelines.HudongBaikePipeline': 300, 74 | #} 75 | ITEM_PIPELINES = { 76 | 'hudong_baike.pipelines.HudongBaikePipeline': 300, 77 | } 78 | 79 | HOST_IP = 'localhost' 80 | PORT = '3306' 81 | USER = 'root' 82 | PASSWD = 'root' 83 | DB_NAME = 'kg_movie' 84 | 85 | # Enable and configure the AutoThrottle extension (disabled by default) 86 | # See 
https://doc.scrapy.org/en/latest/topics/autothrottle.html 87 | #AUTOTHROTTLE_ENABLED = True 88 | # The initial download delay 89 | #AUTOTHROTTLE_START_DELAY = 5 90 | # The maximum download delay to be set in case of high latencies 91 | #AUTOTHROTTLE_MAX_DELAY = 60 92 | # The average number of requests Scrapy should be sending in parallel to 93 | # each remote server 94 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 95 | # Enable showing throttling stats for every response received: 96 | #AUTOTHROTTLE_DEBUG = False 97 | 98 | # Enable and configure HTTP caching (disabled by default) 99 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 100 | #HTTPCACHE_ENABLED = True 101 | #HTTPCACHE_EXPIRATION_SECS = 0 102 | #HTTPCACHE_DIR = 'httpcache' 103 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 104 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 105 | -------------------------------------------------------------------------------- /ie/craw/hudong_baike/hudong_baike/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ie/craw/hudong_baike/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = hudong_baike.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = hudong_baike 12 | -------------------------------------------------------------------------------- /ie/craw/news_spider/news/__init__: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/news_spider/news/__init__ -------------------------------------------------------------------------------- /ie/craw/news_spider/news_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/news_spider/news_spider/__init__.py -------------------------------------------------------------------------------- /ie/craw/news_spider/news_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class NewsSpiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() # 标题 15 | descr = scrapy.Field() # 简述 16 | auth = scrapy.Field() # 作者 17 | post_time = scrapy.Field() # 发布时间 18 | main_news = scrapy.Field() # 新闻内容 19 | -------------------------------------------------------------------------------- /ie/craw/news_spider/news_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 
| # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class NewsSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /ie/craw/news_spider/news_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class NewsSpiderPipeline(object): 10 | 11 | def process_item(self, item, spider): 12 | self.news = open("./news/" + item["title"].strip()+ item["post_time"] + ".txt", "w") 13 | self.news.write(item["title"].encode("utf-8") + "\n") 14 | self.news.write(item["auth"].encode("utf-8") + "\n") 15 | self.news.write(item["post_time"].encode("utf-8") + "\n") 16 | self.news.write(item["descr"].encode("utf-8") + "\n") 17 | self.news.write(item["main_news"].encode("utf-8") + "\n") 18 | 19 | return item 20 | 21 | 22 | def spider_closed(self): 23 | self.news.close() 24 | -------------------------------------------------------------------------------- /ie/craw/news_spider/news_spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for news_spider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'news_spider' 13 | 14 | SPIDER_MODULES = ['news_spider.spiders'] 15 | NEWSPIDER_MODULE = 'news_spider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'news_spider (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = True 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'news_spider.middlewares.NewsSpiderSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'news_spider.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'news_spider.pipelines.NewsSpiderPipeline': 300, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 
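Note on the NewsSpiderPipeline above: it reassigns self.news on every item without closing the previous handle, writes .encode("utf-8") byte strings to a text-mode file (which only works under Python 2), and names its cleanup hook spider_closed rather than the close_spider method Scrapy actually calls on item pipelines. Below is a minimal Python 3 sketch of the same per-item file dump, not the original code; the field names come from items.py, while the filename sanitising is an added assumption.

# Sketch only: a Python 3 variant of NewsSpiderPipeline (assumptions noted inline).
import os
import re


class NewsSpiderPipeline(object):
    def open_spider(self, spider):
        # Make sure the output directory exists before the first item arrives.
        os.makedirs("./news", exist_ok=True)

    def process_item(self, item, spider):
        # Strip characters that are not valid in file names (added assumption,
        # the original concatenates title + post_time verbatim).
        safe_name = re.sub(r'[\\/:*?"<>|\s]+', "_",
                           item["title"].strip() + item["post_time"])
        path = os.path.join("./news", safe_name + ".txt")
        # Text mode with an explicit encoding; "with" closes the handle per item.
        with open(path, "w", encoding="utf-8") as news:
            for field in ("title", "auth", "post_time", "descr", "main_news"):
                news.write(item.get(field, "") + "\n")
        return item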
-------------------------------------------------------------------------------- /ie/craw/news_spider/news_spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ie/craw/news_spider/news_spider/spiders/huxiu_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import scrapy 5 | from news_spider.items import NewsSpiderItem 6 | 7 | class HuxiuSpider(scrapy.Spider): 8 | name = "huxiu" 9 | allowed_domains = ["huxiu.com"] 10 | start_urls = ["http://www.huxiu.com"] 11 | 12 | 13 | def parse(self, response): 14 | 15 | print "Start............................" 16 | self.desc = '' 17 | for sel in response.xpath('//div[@class="mod-b mod-art clearfix "]'): 18 | item = NewsSpiderItem() 19 | item['title'] = sel.xpath('./div/h2/a[@class="transition msubstr-row2"]/text()')[0].extract() 20 | self.desc = sel.xpath('./div[@class="mob-ctt index-article-list-yh"]/div[@class="mob-sub"]/text()')[0].extract() 21 | link = sel.xpath('./div/h2/a/@href')[0].extract() 22 | url = response.urljoin(link) 23 | 24 | yield scrapy.Request(url, callback=self.parse_article ) 25 | 26 | def parse_article(self, response): 27 | detail = response.xpath('//div[@class="article-wrap"]') 28 | item = NewsSpiderItem() 29 | item['title'] = detail.xpath('./h1[@class="t-h1"]/text()')[0].extract() 30 | item['auth'] = u"作者:" + detail.xpath('./div/span[@class="author-name"]/a/text()')[0].extract() 31 | item['post_time'] = u"发表时间:" + detail.xpath('./div/div[@class="column-link-box"]/span[@class="article-time pull-left"]/text()')[0].extract() 32 | item['descr'] = u"简述:" + self.desc + "\n" # 简述存在错误 33 | all_pars = detail.xpath('//div[@class="article-content-wrap"]//p/text()').extract() 34 | 35 | content = '' 36 | for par in all_pars: 37 | content = content + par + "\n" 38 | 39 | desc = item.get('main_news') 40 | if desc == None: 41 | item['main_news'] = content 42 | else: 43 | item['main_news'] = desc + content 44 | 45 | yield item 46 | 47 | 48 | -------------------------------------------------------------------------------- /ie/craw/news_spider/readme.md: -------------------------------------------------------------------------------- 1 | 虎嗅网爬虫 2 | 3 | 对于加载更多的内容部分爬取还没有完成 4 | -------------------------------------------------------------------------------- /ie/craw/news_spider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = news_spider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = news_spider 12 | -------------------------------------------------------------------------------- /ie/craw/weixin_spider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = weixin_spider.settings 8 | 9 | [deploy] 10 | #url = 
http://localhost:6800/ 11 | project = weixin_spider 12 | -------------------------------------------------------------------------------- /ie/craw/weixin_spider/weixin_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/weixin_spider/weixin_spider/__init__.py -------------------------------------------------------------------------------- /ie/craw/weixin_spider/weixin_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class WeixinSpiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() # 文章标题 15 | publishTime = scrapy.Field() # 发布时间 16 | publicName = scrapy.Field() # 公众号名字 17 | article = scrapy.Field() # 文章内容 18 | cite = scrapy.Field() # 文章引用来源 19 | -------------------------------------------------------------------------------- /ie/craw/weixin_spider/weixin_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import random 10 | 11 | class WeixinSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | class RandomUserAgent: 59 | def __init__(self, agents): 60 | self.agents =[ 61 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 62 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0. 30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 63 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 64 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 65 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 66 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 67 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 68 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5" 69 | ] 70 | 71 | @classmethod 72 | def from_crawler(cls,crawler): 73 | # 获取settings的USER_AGENT列表并返回 74 | return cls(crawler.settings.getlist('USER_AGENTS')) 75 | def process_request(self, request, spider): 76 | # 随机设置Request报头header的User-Agent 77 | request.headers.setdefault('User-Agent', random.choice(self.agents)) 78 | 79 | # 添加代理 80 | 81 | class ProxyMiddleWare(object): 82 | proxy_list=[ 83 | "http://58.87.89.234:3128", 84 | "http://139.201.202.140:53281", 85 | "http://27.37.123.30:9000", 86 | "http://218.67.82.146:36709", 87 | "http://222.222.169.60:53281", 88 | "http://120.33.247.233:46884", 89 | "http://114.215.18.7:3128", 90 | "http://112.74.94.142:3128", 91 | "http://122.72.18.34:80", 92 | "http://36.33.25.123:808", 93 | "http://123.138.89.133:9999", 94 | "http://111.231.192.61:8080", 95 | "http://59.41.202.228:53281", 96 | "http://222.241.14.187:8888", 97 | "http://61.155.164.106:3128", 98 | "http://27.40.156.43:61234", 99 | "http://14.29.84.50:8080", 100 | "http://116.25.100.62:9797", 101 | "http://58.21.183.144:80", 102 | "http://14.221.166.205:9000", 103 | "http://115.231.50.10:53281", 104 | "http://120.34.205.40:808", 105 | "http://123.139.56.238:9999", 106 | "http://113.116.170.232:9000", 107 | "http://116.17.236.36:808", 108 | "http://114.232.163.73:34837", 109 | "http://171.35.103.37:808", 110 | "http://27.46.51.232:9797", 111 | "http://223.247.255.207:24714", 112 | "http://223.241.117.179:8010", 113 | "http://222.186.12.102:57624"] 114 | 115 | def process_request(self,request,spider): 116 | # if not request.meta['proxies']: 117 | ip = random.choice(self.proxy_list) 118 | request.meta['proxy'] = ip 119 | 120 | -------------------------------------------------------------------------------- /ie/craw/weixin_spider/weixin_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymysql 9 | from pymysql import connections 10 | from weixin_spider import settings 11 | import sys 12 | reload(sys) 13 | sys.setdefaultencoding('utf-8') 14 | 15 | class WeixinSpiderPipeline(object): 16 | def 
__init__(self): 17 | self.conn = pymysql.connect( 18 | host=settings.HOST_IP, 19 | # port=settings.PORT, 20 | user=settings.USER, 21 | passwd=settings.PASSWD, 22 | db=settings.DB_NAME, 23 | charset='utf8mb4', 24 | use_unicode=True 25 | ) 26 | self.cursor = self.conn.cursor() 27 | 28 | def process_item(self, item, spider): 29 | title = str(item['title']).decode('utf-8') 30 | publishTime = str(item['publishTime']).decode('utf-8') 31 | article = str(item['article']).decode('utf-8') 32 | publicName = str(item['publicName']).decode('utf-8') 33 | cite = str(item['cite']).decode('utf-8') 34 | 35 | # 查询数据库,获取当前存在的文章标题,防止重复存入,但查表浪费时间 36 | self.cursor.execute("SELECT title FROM weixin_xiaoshuo;") 37 | titleList = self.cursor.fetchall() 38 | titleStr = ''.join(map(str, titleList)) 39 | 40 | self.cursor.execute("SELECT publicName FROM weixin_xiaoshuo;") 41 | nameList = self.cursor.fetchall() 42 | nameStr = ''.join(map(str, nameList)) 43 | 44 | if titleStr.find(title) == -1 and nameStr.find(publicName) == -1: 45 | # 执行SQL插入语句 46 | sql = """ 47 | INSERT INTO weixin_xiaoshuo( title, publishTime, article, publicName, cite) VALUES (%s, %s, %s, %s, %s) 48 | """ 49 | self.cursor.execute(sql, (title, publishTime, article, publicName, cite)) 50 | self.conn.commit() 51 | else: 52 | print "该文章已经存在于数据库中:", title.encode('utf-8') 53 | return item 54 | 55 | def close_spider(self, spider): 56 | self.conn.close() 57 | -------------------------------------------------------------------------------- /ie/craw/weixin_spider/weixin_spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for weixin_spider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'weixin_spider' 13 | 14 | SPIDER_MODULES = ['weixin_spider.spiders'] 15 | NEWSPIDER_MODULE = 'weixin_spider.spiders' 16 | 17 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 18 | 19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 20 | #USER_AGENT = 'weixin_spider (+http://www.yourdomain.com)' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | import random 33 | DOWNLOAD_DELAY = random.randint(2, 3) 34 | # The download delay setting will honor only one of: 35 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 36 | #CONCURRENT_REQUESTS_PER_IP = 16 37 | 38 | # Disable cookies (enabled by default) 39 | #COOKIES_ENABLED = False 40 | 41 | # Disable Telnet Console (enabled by default) 42 | #TELNETCONSOLE_ENABLED = False 43 | 44 | # Override the default request headers: 45 | #DEFAULT_REQUEST_HEADERS = { 46 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 47 | # 'Accept-Language': 'en', 48 | #} 49 | 50 | # Enable or disable spider middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 52 | #SPIDER_MIDDLEWARES = { 53 | # 'weixin_spider.middlewares.WeixinSpiderSpiderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable downloader middlewares 57 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 58 | DOWNLOADER_MIDDLEWARES = { 59 | 'weixin_spider.middlewares.RandomUserAgent': 10, 60 | 'weixin_spider.middlewares.ProxyMiddleWare': 100, 61 | } 62 | 63 | # Enable or disable extensions 64 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 65 | #EXTENSIONS = { 66 | # 'scrapy.extensions.telnet.TelnetConsole': None, 67 | #} 68 | 69 | # Configure item pipelines 70 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 71 | ITEM_PIPELINES = { 72 | 'weixin_spider.pipelines.WeixinSpiderPipeline': 300, 73 | } 74 | 75 | HOST_IP = 'localhost' 76 | PORT = '3306' 77 | USER = 'root' 78 | PASSWD = 'nlp' 79 | DB_NAME = 'weixin_xiaoshuo' 80 | 81 | # Enable and configure the AutoThrottle extension (disabled by default) 82 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 83 | #AUTOTHROTTLE_ENABLED = True 84 | # The initial download delay 85 | #AUTOTHROTTLE_START_DELAY = 5 86 | # The maximum download delay to be set in case of high latencies 87 | #AUTOTHROTTLE_MAX_DELAY = 60 88 | # The average number of requests Scrapy should be sending in parallel to 89 | # each remote server 90 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 91 | # Enable showing throttling stats for every response received: 92 | #AUTOTHROTTLE_DEBUG = False 93 | 94 | # Enable and configure HTTP caching (disabled by default) 95 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 96 | 
#HTTPCACHE_ENABLED = True 97 | #HTTPCACHE_EXPIRATION_SECS = 0 98 | #HTTPCACHE_DIR = 'httpcache' 99 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 100 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 101 | -------------------------------------------------------------------------------- /ie/craw/weixin_spider/weixin_spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ie/deepdive/db.url: -------------------------------------------------------------------------------- 1 | postgresql://localhost:5432/movie 2 | -------------------------------------------------------------------------------- /ie/deepdive/deepdive.conf: -------------------------------------------------------------------------------- 1 | deepdive.calibration.holdout_fraction:0.25 2 | deepdive.sampler.sampler_args: "-l 1000 -s 1 -i 1000 --alpha 0.01 --sample_evidence" 3 | -------------------------------------------------------------------------------- /ie/deepdive/input/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/deepdive/input/__init__.py -------------------------------------------------------------------------------- /ie/deepdive/start_posql.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | /etc/init.d/postgresql start 3 | -------------------------------------------------------------------------------- /ie/deepdive/udf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/deepdive/udf/__init__.py -------------------------------------------------------------------------------- /ie/deepdive/udf/baidu_baike/baidu_baike/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/deepdive/udf/baidu_baike/baidu_baike/__init__.py -------------------------------------------------------------------------------- /ie/deepdive/udf/baidu_baike/baidu_baike/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BaiduBaikeItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | article_id = scrapy.Field() 15 | articles = scrapy.Field() 16 | -------------------------------------------------------------------------------- /ie/deepdive/udf/baidu_baike/baidu_baike/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import random 10 | 11 | class WeixinSpiderSpiderMiddleware(object): 12 | # 
Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | class RandomUserAgent: 59 | def __init__(self, agents): 60 | self.agents =[ 61 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 62 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0. 
30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 63 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 64 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 65 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 66 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 67 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 68 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5" 69 | ] 70 | 71 | @classmethod 72 | def from_crawler(cls,crawler): 73 | # 获取settings的USER_AGENT列表并返回 74 | return cls(crawler.settings.getlist('USER_AGENTS')) 75 | def process_request(self, request, spider): 76 | # 随机设置Request报头header的User-Agent 77 | request.headers.setdefault('User-Agent', random.choice(self.agents)) 78 | 79 | # 添加代理 80 | 81 | class ProxyMiddleWare(object): 82 | proxy_list=[ 83 | "http://58.87.89.234:3128", 84 | "http://139.201.202.140:53281", 85 | "http://27.37.123.30:9000", 86 | "http://218.67.82.146:36709", 87 | "http://222.222.169.60:53281", 88 | "http://120.33.247.233:46884", 89 | "http://114.215.18.7:3128", 90 | "http://112.74.94.142:3128", 91 | "http://122.72.18.34:80", 92 | "http://36.33.25.123:808", 93 | "http://123.138.89.133:9999", 94 | "http://111.231.192.61:8080", 95 | "http://59.41.202.228:53281", 96 | "http://222.241.14.187:8888", 97 | "http://61.155.164.106:3128", 98 | "http://27.40.156.43:61234", 99 | "http://14.29.84.50:8080", 100 | "http://116.25.100.62:9797", 101 | "http://58.21.183.144:80", 102 | "http://14.221.166.205:9000", 103 | "http://115.231.50.10:53281", 104 | "http://120.34.205.40:808", 105 | "http://123.139.56.238:9999", 106 | "http://113.116.170.232:9000", 107 | "http://116.17.236.36:808", 108 | "http://114.232.163.73:34837", 109 | "http://171.35.103.37:808", 110 | "http://27.46.51.232:9797", 111 | "http://223.247.255.207:24714", 112 | "http://223.241.117.179:8010", 113 | "http://222.186.12.102:57624"] 114 | 115 | def process_request(self,request,spider): 116 | # if not request.meta['proxies']: 117 | ip = random.choice(self.proxy_list) 118 | request.meta['proxy'] = ip 119 | 120 | -------------------------------------------------------------------------------- /ie/deepdive/udf/baidu_baike/baidu_baike/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import pymysql 12 | from pymysql import connections 13 | from baidu_baike import settings 14 | 15 | 16 | class BaiduBaikePipeline(object): 17 | def __init__(self): 18 | self.article_file = open("articles.txt", "a+",encoding='utf-8') 19 | 20 | def process_item(self, item, spider): 21 | # process info for actor 22 | articles = bytes.decode(str(item['articles']).encode('utf-8')).replace("\n", " ") 23 | article_id = bytes.decode(str(item['article_id']).encode('utf-8')) 24 | 25 | self.article_file.write(article_id + "," + articles + "\n") 26 | 27 | def close_spider(self, spider): 28 | self.article_file.close() 29 | 
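In BaiduBaikePipeline above, bytes.decode(str(...).encode('utf-8')) is a round trip that returns the same string under Python 3, and any commas inside the article body are written with the same delimiter that separates article_id (udf/trans.py, later in this repo, compensates by rejoining everything after the first comma). The following is only a simplified sketch of an equivalent process_item, not the original method; self.article_file is assumed to be the handle opened in __init__ as in the original class.

# Sketch only: a simplified BaiduBaikePipeline.process_item.
def process_item(self, item, spider):
    # str() already yields text in Python 3; no encode/decode round trip needed.
    articles = str(item['articles']).replace("\n", " ")
    article_id = str(item['article_id'])
    # article_id sits before the first comma; any further commas belong to the
    # article body and are handled downstream (see udf/trans.py).
    self.article_file.write(article_id + "," + articles + "\n")
    return item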
-------------------------------------------------------------------------------- /ie/deepdive/udf/baidu_baike/baidu_baike/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for baidu_baike project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'baidu_baike' 13 | 14 | SPIDER_MODULES = ['baidu_baike.spiders'] 15 | NEWSPIDER_MODULE = 'baidu_baike.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'baidu_baike (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | #import random 32 | #DOWNLOAD_DELAY = random.randint(0, 1) 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | #CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | #COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'baidu_baike.middlewares.BaiduBaikeSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'baidu_baike.middlewares.BaiduBaikeDownloaderMiddleware': 543, 59 | #} 60 | DOWNLOADER_MIDDLEWARES = { 61 | 'baidu_baike.middlewares.RandomUserAgent': 10, 62 | 'baidu_baike.middlewares.ProxyMiddleWare': 100, 63 | } 64 | # Enable or disable extensions 65 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 72 | #ITEM_PIPELINES = { 73 | # 'baidu_baike.pipelines.BaiduBaikePipeline': 300, 74 | #} 75 | ITEM_PIPELINES = { 76 | 'baidu_baike.pipelines.BaiduBaikePipeline': 300, 77 | } 78 | 79 | # Enable and configure the AutoThrottle extension (disabled by default) 80 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 81 | #AUTOTHROTTLE_ENABLED = True 82 | # The initial download delay 83 | #AUTOTHROTTLE_START_DELAY = 5 84 | # The maximum download delay to be set in case of high latencies 85 | #AUTOTHROTTLE_MAX_DELAY = 60 86 | # The average number of 
requests Scrapy should be sending in parallel to 87 | # each remote server 88 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 89 | # Enable showing throttling stats for every response received: 90 | #AUTOTHROTTLE_DEBUG = False 91 | 92 | # Enable and configure HTTP caching (disabled by default) 93 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 94 | #HTTPCACHE_ENABLED = True 95 | #HTTPCACHE_EXPIRATION_SECS = 0 96 | #HTTPCACHE_DIR = 'httpcache' 97 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 98 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 99 | -------------------------------------------------------------------------------- /ie/deepdive/udf/baidu_baike/baidu_baike/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ie/deepdive/udf/baidu_baike/baidu_baike/spiders/baidu_baike.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | 9 | from baidu_baike.items import BaiduBaikeItem 10 | import scrapy 11 | from scrapy.http import Request 12 | from bs4 import BeautifulSoup 13 | import re 14 | import urllib 15 | 16 | class BaiduBaikeSpider(scrapy.Spider, object): 17 | name = 'baidu' 18 | allowed_domains = ["baike.baidu.com"] 19 | start_urls = ['https://baike.baidu.com/item/%E5%91%A8%E6%98%9F%E9%A9%B0/169917?fr=aladdin'] 20 | global article_id 21 | article_id = 0 22 | # start_urls = ['https://baike.baidu.com/item/%E4%B8%83%E5%B0%8F%E7%A6%8F'] 23 | 24 | def _get_from_findall(self, tag_list): 25 | result = [] 26 | 27 | for slist in tag_list: 28 | tmp = slist.get_text() 29 | result.append(tmp) 30 | return result 31 | 32 | def parse(self, response): 33 | global article_id 34 | page_category = response.xpath("//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract() 35 | page_category = [l.strip() for l in page_category] 36 | item = BaiduBaikeItem() 37 | 38 | item['article_id'] = article_id 39 | item['articles'] = '' 40 | 41 | if u'演员' in page_category or u'电影' in page_category: 42 | print("Get a actor/movie page") 43 | soup = BeautifulSoup(response.text, 'lxml') 44 | root_node = soup.find("div", class_ = "main_tab main_tab-defaultTab curTab") 45 | 46 | para_nodes = soup.find_all("div", class_="para") 47 | basic_item = self._get_from_findall(para_nodes) 48 | article_content = ' '.join(basic_item) 49 | article_content = article_content.replace("\n", " ") 50 | item['articles'] = str(article_content) 51 | article_id += 1 52 | yield item 53 | if article_id % 50 == 0: 54 | print("The nums of total articles up to: {}".format(article_id)) 55 | 56 | 57 | soup = BeautifulSoup(response.text, 'lxml') 58 | links = soup.find_all('a', href=re.compile(r"/item/")) 59 | for link in links: 60 | new_url = link["href"] 61 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 62 | yield scrapy.Request(new_full_url, callback=self.parse) 63 | -------------------------------------------------------------------------------- /ie/deepdive/udf/baidu_baike/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = baidu_baike.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = baidu_baike 12 | -------------------------------------------------------------------------------- /ie/deepdive/udf/extract_play_features.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from deepdive import * 5 | import ddlib 6 | 7 | @tsv_extractor 8 | @returns(lambda 9 | p1_id = "text", 10 | p2_id = "text", 11 | feature = "text", 12 | :[]) 13 | def extract( 14 | p1_id = "text", 15 | p2_id = "text", 16 | p1_begin_index = "int", 17 | p1_end_index = "int", 18 | p2_begin_index = "int", 19 | p2_end_index = "int", 20 | doc_id = "text", 21 | sent_index = "int", 22 | tokens = "text[]", 23 | lemmas = "text[]", 24 | pos_tags = "text[]", 25 | ner_tags = "text[]", 26 | dep_types = "text[]", 27 | dep_parents = "int[]", 28 | ): 29 | """ 30 | Uses DDLIB to generate features for the spouse relation. 31 | """ 32 | # Create a DDLIB sentence object, which is just a list of DDLIB Word objects 33 | sent = [] 34 | for i,t in enumerate(tokens): 35 | sent.append(ddlib.Word( 36 | begin_char_offset=None, 37 | end_char_offset=None, 38 | word=t, 39 | lemma=lemmas[i], 40 | pos=pos_tags[i], 41 | ner=ner_tags[i], 42 | dep_par=dep_parents[i] - 1, # Note that as stored from CoreNLP 0 is ROOT, but for DDLIB -1 is ROOT 43 | dep_label=dep_types[i])) 44 | 45 | # Create DDLIB Spans for the two person mentions 46 | p1_span = ddlib.Span(begin_word_id=p1_begin_index, length=(p1_end_index-p1_begin_index+1)) 47 | p2_span = ddlib.Span(begin_word_id=p2_begin_index, length=(p2_end_index-p2_begin_index+1)) 48 | 49 | # Generate the generic features using DDLIB 50 | for feature in ddlib.get_generic_features_relation(sent, p1_span, p2_span): 51 | yield [p1_id, p2_id, feature] 52 | -------------------------------------------------------------------------------- /ie/deepdive/udf/get_actor_movie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | try: 8 | import simplejson as json 9 | except: 10 | import json 11 | 12 | import pymysql 13 | from pymysql import connections 14 | from collections import defaultdict 15 | 16 | 17 | class connec_mysql(object): 18 | def __init__(self): 19 | self.conn = pymysql.connect( 20 | host='localhost', 21 | user='root', 22 | passwd='root', 23 | db='baidu_baike', 24 | charset='utf8mb4', 25 | use_unicode=True 26 | ) 27 | self.cursor = self.conn.cursor() 28 | 29 | def get_actor_movie(self, filename, out_name): 30 | outfile = open(out_name,'w',encoding='UTF-8') 31 | with open(filename) as f: 32 | lines = f.readlines() 33 | for line in lines: 34 | words = line.strip().split() 35 | if len(words) != 2: 36 | print("Got line with wrong fromat~") 37 | continue 38 | actor_id = words[0] 39 | movie_id = words[1] 40 | self.cursor.execute( 41 | "SELECT actor_chName, actor_foreName FROM actor WHERE actor_id = {}".format(actor_id)) 42 | actor_list = self.cursor.fetchall() 43 | actor_chName, actor_foreName = actor_list[0] 44 | self.cursor.execute( 45 | "SELECT 
movie_chName, movie_foreName FROM movie WHERE movie_id = {}".format(movie_id)) 46 | movie_list = self.cursor.fetchall() 47 | movie_chName, movie_foreName = movie_list[0] 48 | for item_actor in [actor_chName, actor_foreName]: 49 | for item_movie in [movie_chName, movie_foreName]: 50 | if item_actor not in ["None", ""] and item_movie not in ["None", ""]: 51 | outfile.write(item_actor + "," + item_movie + "\n") 52 | 53 | outfile.close() 54 | 55 | 56 | if __name__ == "__main__": 57 | connect_sql = connec_mysql() 58 | connect_sql.get_actor_movie("../input/actor_movie.txt", "../input/actor_movie_dbdata.csv") 59 | -------------------------------------------------------------------------------- /ie/deepdive/udf/map_actor_mention.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from deepdive import * 5 | import re 6 | 7 | @tsv_extractor 8 | @returns(lambda 9 | mention_id = "text", 10 | mention_text = "text", 11 | doc_id = "text", 12 | sentence_index = "int", 13 | begin_index = "int", 14 | end_index = "int", 15 | :[]) 16 | def extract( 17 | doc_id = "text", 18 | sentence_index = "int", 19 | tokens = "text[]", 20 | pos_tags = "text[]", 21 | ner_tags = "text[]", 22 | ): 23 | """ 24 | Finds phrases thar are continuous words with POS tags == MISC and NER tags == NN. 25 | We make this decision due to stanford parser got bad performance when recognizing actor. 26 | """ 27 | num_tokens = len(ner_tags) 28 | first_index = ( i for i in xrange(num_tokens) if ner_tags[i] == "PERSON" and pos_tags[i] == "NR" and (i == 0 or (ner_tags[i-1] != "PERSON" and pos_tags[i-1] != "NR" )) and re.match(u'^[\u4e00-\u9fa5\u3040-\u309f\u30a0-\u30ffa-zA-Z]+$', unicode(tokens[i], "utf-8")) != None) 29 | for begin_index in first_index: 30 | end_index = begin_index + 1 31 | while end_index < num_tokens and ner_tags[end_index] == "PERSON" and pos_tags[end_index] == "NR" and re.match(u'^[\u4e00-\u9fa5\u3040-\u309f\u30a0-\u30ffa-zA-Z]+$', unicode(tokens[end_index], "utf-8")) != None: 32 | end_index += 1 33 | end_index -= 1 34 | mention_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index) 35 | mention_text = "".join(map(lambda i: tokens[i], xrange(begin_index, end_index + 1))) 36 | 37 | yield [ 38 | mention_id, 39 | mention_text, 40 | doc_id, 41 | sentence_index, 42 | begin_index, 43 | end_index, 44 | ] 45 | 46 | -------------------------------------------------------------------------------- /ie/deepdive/udf/map_movie_mention.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from deepdive import * 5 | import re 6 | 7 | @tsv_extractor 8 | @returns(lambda 9 | mention_id = "text", 10 | mention_text = "text", 11 | doc_id = "text", 12 | sentence_index = "int", 13 | begin_index = "int", 14 | end_index = "int", 15 | :[]) 16 | def extract( 17 | doc_id = "text", 18 | sentence_index = "int", 19 | tokens = "text[]", 20 | pos_tags = "text[]", 21 | ner_tags = "text[]", 22 | ): 23 | """ 24 | Finds phrases thar are continuous words with POS tags == MISC and NER tags == NN. 25 | We make this decision due to stanford parser got bad performance when recognizing movie. 
26 | """ 27 | num_tokens = len(ner_tags) 28 | first_index = ( i for i in xrange(num_tokens) if ner_tags[i] == "MISC" and pos_tags[i] == "NN" and (i == 0 or (ner_tags[i-1] != "MISC" and pos_tags[i-1] != "NN" )) and re.match(u'^[\u4e00-\u9fa5\u3040-\u309f\u30a0-\u30ffa-zA-Z]+$', unicode(tokens[i], "utf-8")) != None) 29 | for begin_index in first_index: 30 | end_index = begin_index + 1 31 | while end_index < num_tokens and ner_tags[end_index] == "MISC" and pos_tags[end_index] == "NN" and re.match(u'^[\u4e00-\u9fa5\u3040-\u309f\u30a0-\u30ffa-zA-Z]+$', unicode(tokens[end_index], "utf-8")) != None: 32 | end_index += 1 33 | end_index -= 1 34 | mention_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index) 35 | mention_text = "".join(map(lambda i: tokens[i], xrange(begin_index, end_index + 1))) 36 | 37 | yield [ 38 | mention_id, 39 | mention_text, 40 | doc_id, 41 | sentence_index, 42 | begin_index, 43 | end_index, 44 | ] 45 | 46 | -------------------------------------------------------------------------------- /ie/deepdive/udf/map_play_candidate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from deepdive import * 4 | import re 5 | 6 | @tsv_extractor 7 | @returns(lambda 8 | p1_id = "text", 9 | p1_name = "text", 10 | p2_id = "text", 11 | p2_name = "text", 12 | :[]) 13 | def extract( 14 | p1_id = "text", 15 | p1_name = "text", 16 | p2_id = "text", 17 | p2_name = "text", 18 | ): 19 | if not(set(p1_name) <= set(p2_name) or set(p2_name) <= set(p1_name)): 20 | yield [ 21 | p1_id, 22 | p1_name, 23 | p2_id, 24 | p2_name, 25 | ] 26 | -------------------------------------------------------------------------------- /ie/deepdive/udf/nlp_markup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # A shell script that runs Bazaar/Parser over documents passed as input TSV lines 3 | # 4 | # $ deepdive env udf/nlp_markup.sh doc_id _ _ content _ 5 | ## 6 | set -euo pipefail 7 | cd "$(dirname "$0")" 8 | 9 | : ${BAZAAR_HOME:=$PWD/bazaar} 10 | [[ -x "$BAZAAR_HOME"/parser/target/start ]] || { 11 | echo "No Bazaar/Parser set up at: $BAZAAR_HOME/parser" 12 | exit 2 13 | } >&2 14 | 15 | [[ $# -gt 0 ]] || 16 | # default column order of input TSV 17 | set -- doc_id content 18 | 19 | # convert input tsv lines into JSON lines for Bazaar/Parser 20 | 21 | 22 | # start Bazaar/Parser to emit sentences TSV 23 | tsv2json "$@" | 24 | "$BAZAAR_HOME"/parser/run.sh -i json -k doc_id -v content 25 | -------------------------------------------------------------------------------- /ie/deepdive/udf/supervise_play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from deepdive import * 5 | import random 6 | from collections import namedtuple 7 | 8 | PlayLabel = namedtuple('PlayLabel', 'p1_id, p2_id, label, type') 9 | 10 | @tsv_extractor 11 | @returns(lambda 12 | p1_id = "text", 13 | p2_id = "text", 14 | label = "int", 15 | rule_id = "text", 16 | :[]) 17 | # heuristic rules for finding positive/negative examples of play relationship mentions 18 | def supervise( 19 | p1_id="text", p1_begin="int", p1_end="int", 20 | p2_id="text", p2_begin="int", p2_end="int", 21 | doc_id="text", sentence_index="int", sentence_text="text", 22 | tokens="text[]", lemmas="text[]", pos_tags="text[]", ner_tags="text[]", 23 | dep_types="text[]", dep_token_indexes="int[]", 24 | ): 25 | PLAY = frozenset(["出演", "主演", "参演", 
"友情出演", "饰演", "特别出演"]) 26 | 27 | COMMAS = frozenset([":", ":","1","2","3","4","5","6","7","8","9","0","、", ";", ";"]) 28 | MAX_DIST = 40 29 | 30 | # Common data objects 31 | intermediate_lemmas = lemmas[p1_end+1:p2_begin] 32 | intermediate_ner_tags = ner_tags[p1_end+1:p2_begin] 33 | tail_lemmas = lemmas[p2_end+1:] 34 | play = PlayLabel(p1_id=p1_id, p2_id=p2_id, label=None, type=None) 35 | 36 | if len(intermediate_lemmas) > MAX_DIST: 37 | yield play._replace(label=-1, type='neg:far_apart') 38 | 39 | if 'PERSON' in intermediate_ner_tags: 40 | yield play._replace(label=-1, type='neg:third_person_between') 41 | 42 | if len(COMMAS.intersection(intermediate_lemmas)) > 0: 43 | yield play._replace(label=-1, type='neg:中间有特殊符号') 44 | 45 | if len(PLAY.intersection(intermediate_lemmas)) > 0: 46 | yield play._replace(label=1, type='pos:A出演B') 47 | -------------------------------------------------------------------------------- /ie/deepdive/udf/trans.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | 5 | with open("baidu_baike/articles.txt",encoding='utf-8') as f: 6 | with open("./articles.csv", "w+",encoding='utf-8') as o: 7 | lines = f.readlines() 8 | for line in lines: 9 | words = line.strip().split(",") 10 | id = words[0] 11 | content = words[1:] 12 | text = ','.join(content) 13 | text = text.replace(",", ",") 14 | o.write(id + "," + text + "\n") 15 | 16 | -------------------------------------------------------------------------------- /ie/re_cnn_att/clean.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import re 4 | import commands 5 | 6 | 7 | class Clean(object): 8 | @staticmethod 9 | def clean_word(word, clean_level='others'): 10 | """ 11 | Remove symbols in words 12 | :word word with unicode 13 | :clean_level keep different symbols for disambi/title 14 | :return clean word 15 | """ 16 | word = word.strip() 17 | 18 | if clean_level == "title": 19 | word = word.strip().strip("\"").replace("\n", " ").replace("\"", "").strip(u"\\") 20 | elif clean_level == "subject": 21 | word = word.replace("\"", "").strip("\\").strip() 22 | elif clean_level == "redirect": 23 | word = word.strip("\"") 24 | elif clean_level == "disambi": 25 | word = re.sub( 26 | u"[,。、&∈*.↑【2—‘:“#> BFR·Z<bf≈j×~①Ⅲ⑤⑨÷〔!%》-』1→5=AE∧I/″▲;]ξaeφi}④⑧…─☆《『0В<D∪L±γ′TXλ:dh|③⑦~、℃'〉+」/】3〕Δ’;”?■CGΨ[=μ_cgβ㈧o{②⑥'⑩。\~\!\@\#\$\%\^\&\*\(\)\_\-\+\=\{\}\[\]\\\|\:\;\'\"\.\>\?\/\, \xa0\u00a0\u3000]", 27 | "", word) 28 | elif clean_level == 'others': 29 | word = re.sub( 30 | u"[,。、&∈*.↑【2—‘:“#> BFR·Z<bf≈j×~①Ⅲ⑤⑨÷〔!%)》-』1→5=AE∧I/″▲;]ξaeφi}④⑧…─☆(《『0В<D∪L±γ′TXλ:dh|③⑦~、℃'〉+」/】3〕Δ’;”?■CGΨ[=μ_cgβ㈧o{②⑥'⑩。\~\!\@\#\$\%\^\&\*\(\)\_\-\+\=\{\}\[\]\\\|\:\;\'\"\.\>\?\/\,\xa0\u00a0\u3000\r\n]", 31 | "", word) 32 | return word 33 | 34 | 35 | class ProcessFile(object): 36 | @staticmethod 37 | def get_line(self, filename): 38 | total_lines = commands.getoutput("sed -n '$=' {}".format(filename)) 39 | with open(filename, "r", encoding='utf-8') as inf: 40 | for line_num in range(total_lines): 41 | line = inf.readline().strip() 42 | yield line 43 | -------------------------------------------------------------------------------- /ie/re_cnn_att/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/re_cnn_att/data/__init__.py 
-------------------------------------------------------------------------------- /ie/re_cnn_att/word2vec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from gensim.models import word2vec 4 | from tqdm import tqdm 5 | import json 6 | import os 7 | import jieba 8 | from gen_re_from_baidu import LoadFile 9 | 10 | 11 | def cut_words(line): 12 | seg_line = " ".join(jieba.cut(line)) 13 | return seg_line 14 | 15 | 16 | def seg_file(infile="", outfile=""): 17 | with open(outfile, "w", encoding='utf-8') as ouf: 18 | for line in tqdm(LoadFile.readline(infile)): 19 | seg_line = cut_words(line) 20 | ouf.write(seg_line) 21 | 22 | 23 | def transfer_json(in_file_path, out_file_path): 24 | with open(in_file_path, "r", encoding='utf-8') as inf: 25 | ouf = open(out_file_path, "w", encoding='utf-8') 26 | word_embed_list = [] 27 | word_num, dim = inf.readline().strip().split() 28 | print("Total word_num: ", word_num, "\nWord dim: ", dim) 29 | for line_num in tqdm(range(int(word_num))): 30 | word_dict = {} 31 | words = inf.readline().strip().split() 32 | word_dict["word"] = words[0] 33 | word_dict["vec"] = eval("[" + ",".join(words[1:]) + "]") 34 | word_embed_list.append(word_dict) 35 | json.dump(word_embed_list, ouf) 36 | 37 | 38 | if __name__ == "__main__": 39 | seg_file(infile="data/6w_clean_disambi_text.csv", outfile="seg_6w_disambi_text.txt") 40 | os.system("C:\my\word2vec\word2vec -train seg_6w_disambi_text.txt -output word_vec.txt -size 50 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 0 -iter 3 -min-count 1 -hs 1") 41 | transfer_json("word_vec.txt", "word_vec.json") 42 | 43 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from clean import Clean 4 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/clean.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import re 4 | import commands 5 | 6 | class Clean(object): 7 | @staticmethod 8 | def clean_word(word, clean_level='others'): 9 | """ 10 | Remove symbols in words 11 | :word word with unicode 12 | :clean_level keep different symbols for disambi/title 13 | :return clean word 14 | """ 15 | word = word.strip() 16 | 17 | if clean_level == "title": 18 | word = word.strip().strip("\"").replace("\n", " ").replace("\"","").strip(u"\\") 19 | elif clean_level == "subject": 20 | word = word.replace("\"", "").strip("\\").strip() 21 | elif clean_level == "redirect": 22 | word = word.strip("\"") 23 | elif clean_level == "disambi": 24 | word = re.sub(u"[,。、&∈*.↑【2—‘:“#> BFR·Z<bf≈j×~①Ⅲ⑤⑨÷〔!%》-』1→5=AE∧I/″▲;]ξaeφi}④⑧…─☆《『0В<D∪L±γ′TXλ:dh|③⑦~、℃'〉+」/】3〕Δ’;”?■CGΨ[=μ_cgβ㈧o{②⑥'⑩。\~\!\@\#\$\%\^\&\*\(\)\_\-\+\=\{\}\[\]\\\|\:\;\'\"\.\>\?\/\, \xa0\u00a0\u3000\r\n]", "", word) 25 | elif clean_level == 'others': 26 | word = re.sub(u"[,。、&∈*.↑【2—‘:“#> BFR·Z<bf≈j×~①Ⅲ⑤⑨÷〔!%)》-』1→5=AE∧I/″▲;]ξaeφi}④⑧…─☆(《『0В<D∪L±γ′TXλ:dh|③⑦~、℃'〉+」/】3〕Δ’;”?■CGΨ[=μ_cgβ㈧o{②⑥'⑩。\~\!\@\#\$\%\^\&\*\(\)\_\-\+\=\{\}\[\]\\\|\:\;\'\"\.\>\?\/\,\xa0\u00a0\u3000\r\n]", "", word) 27 | return word 28 | 29 | class ProcessFile(object): 30 | @staticmethod 31 | def get_line(self, filename): 32 | total_lines = commands.getoutput("sed -n '$=' {}".format(filename)) 33 | with 
open(filename) as inf: 34 | for line_num in range(total_lines): 35 | line = inf.readline().strip() 36 | yield line 37 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/cleanFile.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from tqdm import tqdm 4 | from clean import Clean 5 | def clean_title_disambi(infile="title_disambi.csv", outfile="title_disambi_out.csv"): 6 | with open(infile, "r",encoding='utf-8') as inf: 7 | lines = inf.readlines() 8 | err_counts = 0 9 | with open(outfile, "w",encoding='utf-8') as ouf: 10 | for line in tqdm(lines): 11 | words = line.strip().split("\",\"") 12 | if len(words) != 2: 13 | err_counts += 1 14 | continue 15 | title = Clean.clean_word(words[0], clean_level='title') 16 | disambi = Clean.clean_word(words[1], clean_level='disambi') 17 | ouf.write("\"" + title + "\",\"" + disambi + "\"\r\n") 18 | print("err_counts for disambi_redirect: ", err_counts) 19 | 20 | 21 | def clean_disambi_redirect(infile="disambi_redirect.csv", outfile="disambi_redirect_out.csv"): 22 | with open(infile, "r",encoding='utf-8') as inf: 23 | lines = inf.readlines() 24 | err_counts = 0 25 | with open(outfile, "w",encoding='utf-8') as ouf: 26 | for line in tqdm(lines): 27 | words = line.strip().split("\",\"") 28 | if len(words) != 2: 29 | err_counts += 1 30 | continue 31 | disambi = Clean.clean_word(words[0], clean_level='disambi') 32 | redirect = Clean.clean_word(words[1], clean_level='redirect') 33 | ouf.write("\"" + disambi + "\",\"" + redirect + "\"\r\n") 34 | print("err_counts for disambi_redirect: ", err_counts) 35 | 36 | 37 | def clean_disambi_subject(infile="disambi_subject.csv", outfile="disambi_subject_out.csv"): 38 | with open(infile, "r",encoding='utf-8') as inf: 39 | lines = inf.readlines() 40 | err_counts = 0 41 | with open(outfile, "w",encoding='utf-8') as ouf: 42 | for line in tqdm(lines): 43 | words = line.strip().split("\",\"") 44 | if len(words) != 2: 45 | err_counts += 1 46 | continue 47 | disambi = Clean.clean_word(words[0], clean_level='disambi') 48 | subject = Clean.clean_word(words[1], clean_level='subject') 49 | ouf.write("\"" + disambi + "\",\"" + subject + "\"\r\n") 50 | print("err_counts for disambi_redirect: ", err_counts) 51 | 52 | 53 | if __name__ == '__main__': 54 | clean_title_disambi(infile="./410_baidu/410_title_disambi.csv", outfile="./410_baidu/410_title_disambi_out.csv") 55 | clean_disambi_redirect(infile="./410_baidu/410_disambi_redirect.csv", outfile="./410_baidu/410_disambi_redirect_out.csv") 56 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/gen_disambi_infobox.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import re 4 | import json 5 | import re 6 | from tqdm import tqdm 7 | from clean import Clean 8 | 9 | def get_word_list(filename): 10 | with open(filename, "r",encoding='utf-8') as inf: 11 | lines = inf.readlines() 12 | # print "type line: ", type(lines[0].encode("utf-8")) 13 | lines = [Clean.clean_word(line, clean_level='title') for line in lines] 14 | return lines 15 | 16 | 17 | print(Clean.clean_word(u"\"你好 呀#\"$%^&*@!,。、;:‘’】季    候【")) 18 | 19 | 20 | def main(): 21 | with open("./410_baidu/410_disambi_infobox.csv",'r',encoding='UTF-8') as inf: 22 | lines = inf.readlines() 23 | f = open("./410_baidu/410_disambi_infobox_out.csv", 
"w",encoding='utf-8') 24 | list_attr = [] 25 | title_list = get_word_list("./410_baidu/410_title.csv") 26 | err_count = 0 27 | counts = {} 28 | for line in tqdm(lines): 29 | words = line.strip().split(",") 30 | disambi = Clean.clean_word(words[0], clean_level='disambi') 31 | infobox = ",".join(words[1:]) 32 | try: 33 | info_dict = json.loads(json.loads(infobox)) 34 | for attr in info_dict.keys(): 35 | clean_attr = Clean.clean_word(attr) 36 | info_dict[clean_attr] = info_dict.pop(attr) 37 | value = info_dict[clean_attr] 38 | clean_attr = clean_attr 39 | counts[clean_attr] = counts.setdefault(clean_attr, 0) + 1 40 | list_attr.append(clean_attr) 41 | value_split = re.split(u"[,。、,/]", value.strip()) 42 | for v in value_split: 43 | v = Clean.clean_word(v).strip(u"等").strip(u"收起") 44 | title_list.append(v) 45 | f.write("\"" + disambi + "\",\"" + clean_attr + "\",\"" + v + "\"" + "\r\n") 46 | except Exception as e: 47 | print(e) 48 | err_count += 1 49 | title_list = [t.strip(u"\\") for t in title_list] 50 | title_list = list(set(title_list)) 51 | list_attr = list(set(list_attr)) 52 | sort_counts = sorted(counts.items(),key = lambda x:x[1],reverse = True) 53 | with open("./sort_counts.txt", "w",encoding='utf-8') as ouf: 54 | for i in sort_counts: 55 | ouf.write(str(i) + "\n") 56 | with open("./all_attr.txt", "w",encoding='utf-8') as ouf: 57 | for word_counts in sort_counts: 58 | if word_counts[1] >= 10: 59 | ouf.write(str(word_counts[0]) + "\n") 60 | with open("./410_baidu/410_title_new.csv", "w",encoding='utf-8') as ouf: 61 | for i in title_list: 62 | ouf.write("\"" + i + "\"\r\n") 63 | with open("./410_baidu/all_attr.txt", "w",encoding='utf-8') as ouf: 64 | for i in list_attr: 65 | ouf.write(i + "\n") 66 | 67 | print("err_count: ", err_count) 68 | 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/get_subject.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from collections import defaultdict 5 | from clean import Clean 6 | from tqdm import tqdm 7 | 8 | with open("./410_baidu/410_disambi_subject.csv", "r",encoding='utf-8') as inf: 9 | lines = inf.readlines() 10 | # all_subject = defaultdict(list) 11 | total_subject = [] 12 | f = open("./410_baidu/disambi_subject.csv", "w",encoding='utf-8') 13 | for line in tqdm(lines): 14 | words = line.strip().split(",") 15 | disambi = Clean.clean_word(words[0], clean_level='disambi') 16 | subjects = words[1:] 17 | subjects = [Clean.clean_word(s, clean_level="subject") for s in subjects] 18 | # subjects = [s.replace("\"", "").strip("\\") for s in subjects] 19 | # subjects = [s.strip() for s in subjects] 20 | total_subject.extend(subjects) 21 | for subject in subjects: 22 | if subject == "": 23 | continue 24 | f.write("\"" + disambi + "\",\"" + subject + "\"\r\n") 25 | # all_subject[disambi].append(subjects) 26 | f.close() 27 | total_subject = list(set(total_subject)) 28 | print("Total subjects: ", len(total_subject)) 29 | with open("./410_baidu/all_subject.csv", "w",encoding='utf-8') as ouf: 30 | ouf.write("\"" + "\"\n\"".join(total_subject) + "\"") 31 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/header_file/disambi_headers.csv: -------------------------------------------------------------------------------- 1 | disambi:ID(Disambi),title,abstract,curLink,exterLink 
-------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/header_file/disambi_infobox_header.csv: -------------------------------------------------------------------------------- 1 | :START_ID(Disambi),role,:END_ID(Title) -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/header_file/disambi_redirect_header.csv: -------------------------------------------------------------------------------- 1 | :START_ID(Disambi),:END_ID(Redirect) -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/header_file/disambi_subject_header.csv: -------------------------------------------------------------------------------- 1 | :START_ID(Disambi),:END_ID(Subject) -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/header_file/redirect_header.csv: -------------------------------------------------------------------------------- 1 | redirect:ID(Redirect) -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/header_file/subject_header.csv: -------------------------------------------------------------------------------- 1 | subject:ID(Subject) -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/header_file/title_disambi_header.csv: -------------------------------------------------------------------------------- 1 | :START_ID(Disambi),:END_ID(Title) -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/header_file/title_header.csv: -------------------------------------------------------------------------------- 1 | title:ID(Title) -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/remove_disambi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import re 4 | from clean import Clean 5 | from tqdm import tqdm 6 | 7 | with open("./410_baidu/410_disambi.csv", "r",encoding='utf-8') as inf: 8 | title_dict = {} 9 | count = 0 10 | lines = inf.readlines() 11 | for line in tqdm(lines): 12 | words = line.strip().split("\",\"") 13 | if len(words) != 4: 14 | count += 1 15 | continue  # skip malformed rows instead of writing them, matching the other cleaners 16 | clean_disambi = Clean.clean_word(words[0], 'disambi') 17 | title_dict[clean_disambi] = words[1:] 18 | print("Error lines: ", count) 19 | with open("./410_baidu/410_disambi_new.csv", "w",encoding='utf-8') as ouf: 20 | for i in title_dict.keys(): 21 | ouf.write("\"" + i + "\",\"" + "\",\"".join(title_dict[i]) + "\r\n") 22 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/movie_actor/clean_actor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | """ 5 | Clean the actor_back table and re-insert the cleaned rows into the actor table.
6 | """ 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import sys 12 | reload(sys) 13 | sys.setdefaultencoding('utf-8') 14 | 15 | import pymysql 16 | from pymysql import connections 17 | import numpy as np 18 | import re 19 | 20 | class connec_mysql(object): 21 | def __init__(self): 22 | self.conn = pymysql.connect( 23 | host='localhost', 24 | user='root', 25 | passwd='nlp', 26 | db='hudong_baike', 27 | charset='utf8mb4', 28 | use_unicode=True 29 | ) 30 | self.cursor = self.conn.cursor() 31 | 32 | def process_actor_gen(self): 33 | actor_gen_id = 0 34 | self.cursor.execute("SELECT MAX(actor_id) FROM actor_back") 35 | max_actor_id = self.cursor.fetchall()[0][0] 36 | assert isinstance(max_actor_id, int) 37 | for actor_id in range(1, max_actor_id + 1): 38 | # for actor_id in range(1, 1 + 10): 39 | self.cursor.execute("SELECT * FROM actor_back WHERE actor_id = {};".format(actor_id)) 40 | result = self.cursor.fetchall() 41 | if np.shape(result) != (1, 11): 42 | continue 43 | new_actor_list = [ result[0][i].replace(u'title="" href=""', "") if not isinstance(result[0][i], int) else result[0][i] for i in range(0, 11) ] 44 | new_actor_list = [ new_actor_list[i].strip(u' 《》') if not isinstance(new_actor_list[i], int) else new_actor_list[i] for i in range(0, 11) ] 45 | new_actor_tuple = tuple(new_actor_list) 46 | sql = """ 47 | INSERT INTO actor( actor_id, actor_bio, actor_chName, actor_foreName, actor_nationality, actor_constellation, actor_birthPlace, actor_birthDay, actor_repWorks, actor_achiem, actor_brokerage ) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 48 | """ 49 | self.cursor.execute(sql, new_actor_tuple) 50 | self.conn.commit() 51 | 52 | if __name__ == '__main__': 53 | connec = connec_mysql() 54 | connec.process_actor_gen() 55 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/movie_actor/clean_mysql.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | """ 5 | Get the table of actor_to_movie and movie_to_genre. 
6 | """ 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import sys 12 | reload(sys) 13 | sys.setdefaultencoding('utf-8') 14 | 15 | import pymysql 16 | from pymysql import connections 17 | import numpy as np 18 | import re 19 | 20 | class connec_mysql(object): 21 | def __init__(self): 22 | self.conn = pymysql.connect( 23 | host='localhost', 24 | user='root', 25 | passwd='nlp', 26 | db='hudong_baike', 27 | charset='utf8mb4', 28 | use_unicode=True 29 | ) 30 | self.cursor = self.conn.cursor() 31 | 32 | def process_movie_gen(self): 33 | movie_gen_id = 0 34 | self.cursor.execute("SELECT MAX(movie_id) FROM movie_back") 35 | max_movie_id = self.cursor.fetchall()[0][0] 36 | assert isinstance(max_movie_id, int) 37 | for movie_id in range(1, max_movie_id + 1): 38 | # for movie_id in range(1, 1 + 1): 39 | self.cursor.execute("SELECT * FROM movie_back WHERE movie_id = {};".format(movie_id)) 40 | result = self.cursor.fetchall() 41 | print("np.shape(result): ", np.shape(result)) 42 | if np.shape(result) != (1, 14): 43 | continue 44 | new_movie_list = [ result[0][i].strip(u" 《》") if not isinstance(result[0][i], int) else result[0][i] for i in range(0, 14) ] 45 | # new_movie_list = [result[0][i] if i != 2 else movie_name for i in range(0, 14)] 46 | new_movie_tuple = tuple(new_movie_list) 47 | sql = """ 48 | INSERT INTO movie( movie_id, movie_bio, movie_chName, movie_foreName, movie_prodTime, movie_prodCompany, movie_director, movie_screenwriter, movie_genre, movie_star, movie_length, movie_rekeaseTime, movie_language, movie_achiem ) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 49 | """ 50 | self.cursor.execute(sql, new_movie_tuple) 51 | self.conn.commit() 52 | 53 | if __name__ == '__main__': 54 | connec = connec_mysql() 55 | connec.process_movie_gen() 56 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/movie_actor/complete_mysql.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | """ 5 | Get the table of actor_to_movie and movie_to_genre. 
6 | """ 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import sys 12 | reload(sys) 13 | sys.setdefaultencoding('utf-8') 14 | 15 | import pymysql 16 | from pymysql import connections 17 | import numpy as np 18 | import re 19 | 20 | class connec_mysql(object): 21 | def __init__(self): 22 | self.conn = pymysql.connect( 23 | host='localhost', 24 | user='root', 25 | passwd='nlp', 26 | db='hudong_baike', 27 | charset='utf8mb4', 28 | use_unicode=True 29 | ) 30 | self.cursor = self.conn.cursor() 31 | 32 | def process_act_movie(self): 33 | actor_movie_id = 0 34 | self.cursor.execute("SELECT MAX(actor_id) FROM actor") 35 | max_actor_id = self.cursor.fetchall()[0][0] 36 | assert isinstance(max_actor_id, int) 37 | for actor_id in range(1, max_actor_id + 1): 38 | self.cursor.execute("SELECT actor_repworks FROM actor WHERE actor_id = {};".format(actor_id)) 39 | result = self.cursor.fetchall() 40 | assert np.shape(result) == (1, 1) # if didn't exist, return (0, ) 41 | repworks = re.split(u"[,/、 ]", result[0][0] ) 42 | try: 43 | assert len(repworks) > 0 44 | for repwork in repworks: 45 | repwork = repwork.strip(u" 《》") 46 | self.cursor.execute("SELECT movie_id FROM movie WHERE movie_chName = %s", repwork) 47 | check_movie_id = self.cursor.fetchall() 48 | if len(check_movie_id) != 0: 49 | self.cursor.execute("INSERT INTO actor_to_movie (actor_movie_id, actor_id, movie_id) VALUES (%s, %s, %s)", (actor_movie_id, actor_id, check_movie_id[0][0]) ) 50 | self.conn.commit() 51 | actor_movie_id += 1 52 | except Exception as e: 53 | print("Get a error with ", e, "Maybe this actor has no represent works") 54 | continue 55 | 56 | def process_movie_gen(self): 57 | movie_gen_id = 0 58 | self.cursor.execute("SELECT MAX(movie_id) FROM movie") 59 | max_movie_id = self.cursor.fetchall()[0][0] 60 | assert isinstance(max_movie_id, int) 61 | for movie_id in range(1, max_movie_id + 1): 62 | # for movie_id in range(1, 1 + 10): 63 | self.cursor.execute("SELECT movie_genre FROM movie WHERE movie_id = {};".format(movie_id)) 64 | result = self.cursor.fetchall() 65 | if np.shape(result) != (1, 1): 66 | continue 67 | movie_genres = re.split(u"[,/、 ]", result[0][0] ) 68 | # print("movie_genres: ", movie_genres) 69 | try: 70 | assert len(movie_genres) > 0 71 | for movie_genre in movie_genres: 72 | self.cursor.execute("SELECT genre_id FROM genre WHERE genre_name = %s", movie_genre) 73 | check_genre_id = self.cursor.fetchall() 74 | if len(check_genre_id) != 0: 75 | self.cursor.execute("INSERT INTO movie_to_genre (movie_genre_id, movie_id, genre_id) VALUES (%s, %s, %s)", (movie_gen_id, movie_id, check_genre_id[0][0]) ) 76 | self.conn.commit() 77 | movie_gen_id += 1 78 | except Exception as e: 79 | print("Get a error with ", e) 80 | continue 81 | if __name__ == '__main__': 82 | connec = connec_mysql() 83 | # connec.process_act_movie() 84 | connec.process_movie_gen() 85 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/movie_actor/get_ttl.bat: -------------------------------------------------------------------------------- 1 | @echo off&setlocal enabledelayedexpansion 2 | 3 | set db=baidu_baike 4 | set file=kg_demo_mapping_%db%.ttl 5 | 6 | call generate-mapping -u root -p root -o %file% jdbc:mysql:///%db%?useSSL=false 7 | 8 | :: call findstr /i /v /C:"@prefix vocab" "%file%">>%file%.bk 9 | :: move /y %file%.bk %file% 10 | 11 | for /f "tokens=1,* delims=:" %%b in ('findstr /n ".*" "%file%"')do ( 12 | set 
"var=%%c" 13 | if "!var!" neq "@prefix vocab: ." ( 14 | if "!var!" equ "" ( 15 | >>%file%.bk echo,!var!) ^ 16 | else if "!var!" equ "@prefix jdbc: ." ( 17 | >>%file%.bk echo,!var! 18 | >>%file%.bk echo,@prefix : ^ .) ^ 19 | else ( 20 | echo;"!var!"|find "jdbcDSN"&&( 21 | >>%file%.bk echo, d2rq:jdbcDSN ^"jdbc:mysql:///%db%?useUnicode=true^&characterEncoding=utf8^&useSSL=false^";)||( 22 | set "var=!var:vocab= !" 23 | set "var=!var:actor_actor=actor!" 24 | set "var=!var:movie_movie=movie!" 25 | set "var=!var:genre_genre=genre!" 26 | set "var=!var:class :actor=class :Actor!" 27 | set "var=!var:class :movie=class :Movie!" 28 | set "var=!var:class :genre=class :Genre!" 29 | set "var=!var:property :actor_to_movie=property :hasActedIn!" 30 | set "var=!var:property :movie_to_genre=property :hasGenre!" 31 | >>%file%.bk echo,!var!)) 32 | ) 33 | ) 34 | move /y %file%.bk %file% -------------------------------------------------------------------------------- /ie/struct_to_rdf/movie_actor/get_ttl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Downloaing d2rq tools" 4 | wget https://github.com/downloads/d2rq/d2rq/d2rq-0.8.1.tar.gz; 5 | echo"Done" 6 | tar -xvzf d2rq-0.8.1.tar.gz; 7 | cd d2rq-0.8.1; 8 | for x in {hudong_baike,baidu_baike}; do 9 | echo "Generating ttl and nt files for $x" 10 | name_ttl=`echo "kg_demo_mapping_$x.ttl"` 11 | name_nt=`echo "$x.nt"` 12 | ./generate-mapping -u root -p nlp -o $name_ttl jdbc:mysql:///$x; 13 | sed -i '/\@prefix vocab.* \./d' $name_ttl # delete vocab prefix 14 | sed -i 's/vocab/ /g' $name_ttl 15 | sed -i 's/actor_actor/actor/g' $name_ttl 16 | sed -i 's/d2rq\:jdbcDSN "jdbc\:mysql.*;/d2rq\:jdbcDSN "jdbc\:mysql\:\/\/\/hudong_baike\?useUnicode=true\&characterEncoding=utf8";/g' $name_ttl 17 | sed -i '8a \@prefix : .' $name_ttl; 18 | ./dump-rdf -o $name_nt $name_ttl; # get NTriples 19 | done 20 | 21 | if [ $? -ne 0 ]; then 22 | echo "Generate mapping and nt files failed. Terminated." 
23 | exit 1 24 | fi -------------------------------------------------------------------------------- /img/actor_movie_genre.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/img/actor_movie_genre.png -------------------------------------------------------------------------------- /img/baike.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/img/baike.png -------------------------------------------------------------------------------- /img/example_REfO_KBQA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/img/example_REfO_KBQA.png -------------------------------------------------------------------------------- /img/example_d2rq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/img/example_d2rq.png -------------------------------------------------------------------------------- /img/example_elastic_ss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/img/example_elastic_ss.png -------------------------------------------------------------------------------- /knowledge_fusion/silk/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /knowledge_fusion/silk/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /knowledge_fusion/silk/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /knowledge_fusion/silk/.idea/silk.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /knowledge_fusion/silk/.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 27 | 28 | 29 | 30 | 31 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 1573547510765 64 | 69 | 70 | 71 | 72 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /knowledge_fusion/silk/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import requests 5 | import commands 6 | import math 7 | from tqdm import tqdm 8 | from batch_link import * 9 | import time 10 | import os 11 | import subprocess 12 | import argparse 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--fuseki', type=str, 16 | default='/home1/peng/project/apache-jena-fuseki-3.7.0/', help='Path to fuseki-server') 17 |
parser.add_argument('--baiduNt', type=str , 18 | default="/home1/peng/project/d2rq-0.8.1/full_nt/baidu_baike.nt" , help='Path to baidu N-triples ') 19 | parser.add_argument('--hudongNt', type=str , 20 | default="/home1/peng/project/d2rq-0.8.1/full_nt/hudong_baike.nt" , help='Path to hudong N-triples') 21 | parser.add_argument('--maxNtLength', type=float , 22 | default=5000000.0 , help='Max N-triples in each nt file') 23 | parser.add_argument('--ip', type=str , 24 | default='localhost' , help='Ip for Fuseki and Silk server') 25 | parser.add_argument('--projectName', type=str , 26 | default='baike' , help='Silk project name') 27 | args = parser.parse_args() 28 | 29 | 30 | if __name__ == "__main__": 31 | baidu_name = [] 32 | hudong_name = [] 33 | jm = JenaCmd() 34 | nts = [args.baiduNt, args.hudongNt] 35 | # nts = ["/home1/peng/project/d2rq-0.8.1/1_nt/baidu_1.nt", "/home1/peng/project/d2rq-0.8.1/1_nt/hudong_1.nt"] 36 | subprocess.Popen(['sh', os.path.join(args.fuseki, 'fuseki-server')], shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 37 | time.sleep(5) 38 | for idx, nt in enumerate(nts): 39 | print("nt", nt) 40 | out_file_list = seg_nt(nt, max_len=args.maxNtLength) 41 | for file in out_file_list: 42 | dbName = file.split("/")[-1].strip(".nt") 43 | if idx == 0: 44 | baidu_name.append(dbName) 45 | else: 46 | hudong_name.append(dbName) 47 | jm.delete_tdb(dbName=dbName) 48 | jm.add_tdb(dbName=dbName) 49 | JenaCmd.load_nt("./tdb_" + dbName, file) 50 | status, out = commands.getstatusoutput('cp {}/* {}'.format("./tdb_" + dbName, os.path.join(args.fuseki, "run/databases/") + dbName)) 51 | print(out) 52 | # restart fuseki server 53 | _, out = commands.getstatusoutput("netstat -tunlp|grep 3030") 54 | uid = out.split()[-1].strip("/java')") 55 | _, _ = commands.getstatusoutput("kill {}".format(uid)) 56 | subprocess.Popen(['sh', os.path.join(args.fuseki, 'fuseki-server')], shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 57 | time.sleep(10) 58 | 59 | sm = SilkCmd() 60 | # delete old project and build new project 61 | sm.control_project(project_name=args.projectName, action="DELETE") 62 | sm.control_project(project_name=args.projectName) 63 | # add prefixes to project 64 | prefixes = {"baidu": "http://www.kgbaidu.com#", 65 | "hudong": "http://www.kghudong.com#"} 66 | sm.add_prefix(prefixes) 67 | # add Sparql endpoint datasets 68 | for dname in hudong_name + baidu_name: 69 | print(sm.build_endPoint(dname, "http://{}:3030/{}/query".format(args.ip, dname), "500000")) 70 | 71 | # build linking task 72 | linking_rule = '' 73 | for hname in hudong_name: 74 | for bname in baidu_name: 75 | # odata_file means output rdf file 76 | # o_rdf is the name of output dataset name corresponding to odata_file 77 | odata_file = "o_" + hname + bname + ".nt" 78 | o_rdf = "d_" + hname + bname 79 | task_name = "t_" + hname + bname 80 | sm.build_output(odata_file) 81 | sm.build_rdf(o_rdf, odata_file) 82 | sm.build_task(project_name=args.projectName, task_name=task_name, source_data=bname, target_data=hname, output_data=o_rdf) 83 | 84 | for hname in hudong_name: 85 | for bname in baidu_name: 86 | odata_file = "o_" + hname + bname + ".nt" 87 | o_rdf = "d_" + hname + bname 88 | task_name = "t_" + hname + bname 89 | print(sm.add_rule(linking_rule, project_name=args.projectName, task_name=task_name)) 90 | print("task_name: ", type(task_name), task_name) 91 | sm = SilkCmd() 92 | print(sm.control_linking(project_name=args.projectName, task_name=task_name)) 93 | time.sleep(60*60*5) 94 | 
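Note: run.py imports both commands (Python 2 only) and subprocess, and calls commands.getstatusoutput for the copy, netstat and kill steps. Under Python 3 the equivalent calls are subprocess.getoutput / subprocess.getstatusoutput. A minimal sketch of the "restart fuseki server" step under that assumption (restart_fuseki is a hypothetical helper, not part of the repo):

import os
import subprocess
import time

def restart_fuseki(fuseki_home):
    # find the process listening on 3030 and kill it, as run.py does with netstat/kill
    _, out = subprocess.getstatusoutput("netstat -tunlp | grep 3030")
    if out:
        pid = out.split()[-1].split("/")[0]   # last netstat column looks like "1234/java"
        subprocess.getstatusoutput("kill {}".format(pid))
    # relaunch the server in the background and give it time to come up
    subprocess.Popen(["sh", os.path.join(fuseki_home, "fuseki-server")],
                     stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    time.sleep(10)

With the argparse defaults above, the script would be launched roughly as: python run.py --fuseki /home1/peng/project/apache-jena-fuseki-3.7.0/ --ip localhost --projectName baike (the paths are the author's; adjust them to your own layout).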
-------------------------------------------------------------------------------- /requirement.text: -------------------------------------------------------------------------------- 1 | BeautifulSoup4 2 | chardet 3 | pymysql 4 | sparqlwrapper 5 | jieba 6 | refo 7 | -------------------------------------------------------------------------------- /semantic_search/elasticsearch/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/semantic_search/elasticsearch/data/__init__.py -------------------------------------------------------------------------------- /semantic_search/elasticsearch/query.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | from utils import views 8 | 9 | if __name__ == '__main__': 10 | while True: 11 | question = input() 12 | answer = views.search(question.encode('utf-8')) 13 | print("Your question is : ", question, "\nAnswer: ", answer) 14 | -------------------------------------------------------------------------------- /semantic_search/elasticsearch/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/semantic_search/elasticsearch/utils/__init__.py -------------------------------------------------------------------------------- /semantic_search/elasticsearch/utils/build_dict.py: -------------------------------------------------------------------------------- 1 | import ahocorasick 2 | import pickle 3 | from collections import defaultdict 4 | 5 | entity_list_file = './data/all_entity.txt' 6 | entity_out_path = './data/ent_ac.pkl' 7 | attr_list_file = './data/attr_mapping.txt' 8 | attr_out_path = './data/attr_ac.pkl' 9 | val_list_file = './data/Person_val.txt' 10 | 11 | 12 | def dump_ac_entity_dict(list_file, out_path): 13 | A = ahocorasick.Automaton() 14 | f = open(list_file) 15 | i = 0 16 | for line in f: 17 | word = line.strip() 18 | A.add_word(word, (i, word)) 19 | i += 1 20 | A.make_automaton() 21 | pickle.dump(A, open(out_path, "wb")) 22 | 23 | 24 | def dump_ac_attr_dict(attr_mapping_file, out_path): 25 | A = ahocorasick.Automaton() 26 | f = open(attr_mapping_file) 27 | i = 0 28 | for line in f: 29 | parts = line.strip().split(" ") 30 | for p in parts: 31 | if p != "": 32 | A.add_word(p, (i, p)) 33 | i += 1 34 | A.make_automaton() 35 | pickle.dump(A, open(out_path, 'wb')) 36 | 37 | 38 | def load_ac_dict(out_path): 39 | A = pickle.load(open(out_path, "rb")) 40 | return A 41 | 42 | 43 | def load_attr_map(attr_mapping_file): 44 | f = open(attr_mapping_file) 45 | mapping = defaultdict(list) 46 | for line in f: 47 | parts = line.strip().split(" ") 48 | for p in parts: 49 | if p != '': 50 | mapping[p].append(parts[0]) 51 | return mapping 52 | 53 | 54 | def load_entity_dict(entity_file): 55 | f = open(entity_file) 56 | ents = {} 57 | for line in f: 58 | ents[line.strip()] = 1 59 | return ents 60 | 61 | 62 | def load_val_dict(val_file): 63 | f = open(val_file) 64 | val_attr_map = {} 65 | for line in f: 66 | parts = line.strip().split(" ") 67 | if line == "\n" or len(parts) < 2: 68 | continue 69 | new_str = u" ".join(parts[0:len(parts) - 1]).encode('utf-8') 70 | val_attr_map[u" 
".join(parts[0:len(parts) - 1]).encode('utf-8')] = parts[-1] 71 | return val_attr_map 72 | 73 | 74 | if __name__ == '__main__': 75 | dump_ac_attr_dict(attr_list_file, attr_out_path) 76 | # load_val_dict(val_list_file) 77 | -------------------------------------------------------------------------------- /semantic_search/elasticsearch/utils/get_ac_attr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import ahocorasick 4 | import pickle 5 | from collections import defaultdict 6 | 7 | 8 | def dump_ac_attr_dict(attr_mapping_file='../data/attr_mapping.txt', out_path='../data/attr_ac.pkl'): 9 | A = ahocorasick.Automaton() 10 | f = open(attr_mapping_file,'r',encoding='UTF-8') 11 | i = 0 12 | for line in f: 13 | parts = line.strip().split(" ") 14 | for p in parts: 15 | if p != "": 16 | A.add_word(p, (i, p)) 17 | i += 1 18 | A.make_automaton() 19 | pickle.dump(A, open(out_path, 'wb')) 20 | 21 | 22 | if __name__ == '__main__': 23 | dump_ac_attr_dict() 24 | -------------------------------------------------------------------------------- /semantic_search/elasticsearch/utils/get_json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | try: 8 | import simplejson as json 9 | except: 10 | import json 11 | 12 | import pymysql 13 | from pymysql import connections 14 | from collections import defaultdict 15 | 16 | 17 | class connec_mysql(object): 18 | def __init__(self): 19 | self.conn = pymysql.connect( 20 | host='localhost', 21 | user='root', 22 | passwd='root', 23 | db='baidu_baike', 24 | charset='utf8mb4', 25 | use_unicode=True 26 | ) 27 | self.cursor = self.conn.cursor() 28 | 29 | def select_from_db(self, target_item, target_table, target_condition, target_value): 30 | self.cursor.execute("SELECT %s FROM %s WHERE %s = %s", 31 | (target_item, target_table, target_condition, target_value)) 32 | result = self.cursor.fetchall() 33 | return result 34 | 35 | def get_json(self): 36 | for cate in ["actor", "movie"]: 37 | cate = cate.strip() 38 | self.cursor.execute("SELECT MAX({}_id) FROM {}".format(cate, cate)) 39 | result = self.cursor.fetchall() 40 | max_id = result[0][0] if result[0][0] != None else 0 41 | print("max_id: ", max_id) 42 | f = open("{}.json".format(cate), "w+") 43 | for id in range(1, max_id + 1): 44 | self.cursor.execute("SELECT * FROM {} WHERE {}_id = {}".format(cate, cate, id)) 45 | item_lists = self.cursor.fetchall() 46 | # self.cursor.execute("SELECT COLUMN FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME='{}'".format(cate)) 47 | actor_column_attr = ["actor_id", "actor_bio", "actor_chName", "actor_foreName", "actor_nationality", 48 | "actor_constellation", "actor_birthPlace", "actor_birthDay", "actor_repWorks", 49 | "actor_achiem", "actor_brokerage"] 50 | movie_column_attr = ["movie_id", "movie_bio", "movie_chName", "movie_foreName", "movie_prodTime", 51 | "movie_prodCompany", "movie_director", "movie_screenwriter", "movie_genre", 52 | "movie_star", "movie_length", "movie_rekeaseTime", "movie_language", 53 | "movie_achiem"] 54 | column_attr = actor_column_attr if cate == "actor" else movie_column_attr 55 | 56 | if item_lists == None and column_attr == None: 57 | continue 58 | try: 59 | assert len(item_lists[0]) == 14 or len(item_lists[0]) == 11 60 | item_dict = defaultdict(list) 61 | item_dict["subj"] 
= str(item_lists[0][2]) 62 | list_po = [] 63 | for i in range(1, len(item_lists[0])): 64 | if column_attr[i] == "{}_chName".format(cate): # skip actor_chName 65 | continue 66 | tmp_dict = {} 67 | tmp_dict["pred"] = column_attr[i] 68 | tmp_dict["obj"] = item_lists[0][i] 69 | list_po.append(tmp_dict) 70 | item_dict["po"] = list_po 71 | item_json = json.dumps(item_dict) 72 | f.write(item_json + "\n") 73 | 74 | except Exception as e: 75 | print(e) 76 | 77 | 78 | if __name__ == "__main__": 79 | connect_sql = connec_mysql() 80 | connect_sql.get_json() 81 | -------------------------------------------------------------------------------- /semantic_search/elasticsearch/utils/get_total_val.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | try: 8 | import simplejson as json 9 | except: 10 | import json 11 | 12 | import pymysql 13 | from pymysql import connections 14 | from collections import defaultdict 15 | 16 | 17 | class connec_mysql(object): 18 | def __init__(self): 19 | self.conn = pymysql.connect( 20 | host='localhost', 21 | user='root', 22 | passwd='root', 23 | db='baidu_baike', 24 | charset='utf8mb4', 25 | use_unicode=True 26 | ) 27 | self.cursor = self.conn.cursor() 28 | 29 | def get_json(self): 30 | for cate in ["actor", "movie"]: 31 | cate = cate.strip() 32 | self.cursor.execute("SELECT MAX({}_id) FROM {}".format(cate, cate)) 33 | result = self.cursor.fetchall() 34 | max_id = result[0][0] if result[0][0] != None else 0 35 | print("max_id: ", max_id) 36 | f = open("../data/{}.txt".format(cate), "w+",encoding='utf-8') 37 | for id in range(1, max_id + 1): 38 | self.cursor.execute("SELECT * FROM {} WHERE {}_id = {}".format(cate, cate, id)) 39 | item_lists = self.cursor.fetchall() 40 | # self.cursor.execute("SELECT COLUMN FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME='{}'".format(cate)) 41 | actor_column_attr = ["actor_id", "actor_bio", "actor_chName", "actor_foreName", "actor_nationality", 42 | "actor_constellation", "actor_birthPlace", "actor_birthDay", "actor_repWorks", 43 | "actor_achiem", "actor_brokerage"] 44 | movie_column_attr = ["movie_id", "movie_bio", "movie_chName", "movie_foreName", "movie_prodTime", 45 | "movie_prodCompany", "movie_director", "movie_screenwriter", "movie_genre", 46 | "movie_star", "movie_length", "movie_rekeaseTime", "movie_language", 47 | "movie_achiem"] 48 | column_attr = actor_column_attr if cate == "actor" else movie_column_attr 49 | 50 | if item_lists == None and column_attr == None: 51 | continue 52 | try: 53 | assert len(item_lists[0]) == 14 or len(item_lists[0]) == 11 54 | for i in range(1, len(item_lists[0])): 55 | if item_lists[0][i] == 'None': 56 | continue 57 | f.write(item_lists[0][i] + " " + column_attr[i] + "\n") 58 | 59 | except Exception as e: 60 | print(e) 61 | 62 | 63 | if __name__ == "__main__": 64 | connect_sql = connec_mysql() 65 | connect_sql.get_json() 66 | -------------------------------------------------------------------------------- /semantic_search/elasticsearch/utils/insert.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | ''' 3 | 将一个知识图谱中的数据导入elastic search,须提前新建index和type 4 | ''' 5 | try: 6 | import simplejson as json 7 | except: 8 | import json 9 | import sys 10 | import requests 11 | 12 | def bulk_insert(base_url, data): 13 | response = requests.post(base_url, 
headers={"Content-Type":"application/x-ndjson"}, data=data) 14 | 15 | def begin_insert_job(index_name, type_name, json_filepath, bulk_size=1000): 16 | base_url = "http://localhost:9200/" + index_name + "/" + type_name + "/_bulk" 17 | f = open(json_filepath) 18 | cnt, es_id = 0, 1 19 | data = "" 20 | for line in f: 21 | action_meta = '{"index": {"_id":"' + str(es_id) + '"}}' 22 | data = data + action_meta + "\n" + line 23 | 24 | es_id += 1 25 | cnt += 1 26 | if cnt >= bulk_size: 27 | bulk_insert(base_url, data) 28 | cnt, data = 0, "" 29 | if not (es_id % bulk_size): 30 | print(es_id) 31 | if cnt: 32 | bulk_insert(base_url, data) 33 | 34 | if __name__ == '__main__': 35 | begin_insert_job("demo", "_doc", "../data/baidu_baike.json") 36 | -------------------------------------------------------------------------------- /semantic_search/elasticsearch/utils/query_cmd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -XGET 'localhost:9200/demo/baidu_baike/_search?&pretty' -H 'Content-Type:application/json' -d' 4 | { 5 | "query":{ 6 | "bool":{ 7 | "filter":{ 8 | "term":{"subj":"朱一龙"} 9 | } 10 | } 11 | } 12 | } 13 | ' 14 | 15 | --------------------------------------------------------------------------------