├── .gitignore ├── KBQA └── patternREfO │ ├── data │ ├── actorName.txt │ ├── get_dict.txt │ └── movieName.txt │ ├── get_dict.sh │ ├── query.py │ └── utils │ ├── __init__.py │ ├── rules.py │ └── word_tagging.py ├── README.md ├── ie ├── craw │ ├── baidu_baike │ │ ├── baidu_baike │ │ │ ├── __init__.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ └── baidu_baike.py │ │ └── scrapy.cfg │ ├── craw_all_baidu │ │ ├── baidu_baike │ │ │ ├── __init__.py │ │ │ ├── commands │ │ │ │ ├── __init__.py │ │ │ │ └── crawlall.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ ├── setup.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── baidu_baike-10.py │ │ │ │ ├── baidu_baike-2.py │ │ │ │ ├── baidu_baike-3.py │ │ │ │ ├── baidu_baike-4.py │ │ │ │ ├── baidu_baike-5.py │ │ │ │ ├── baidu_baike-6.py │ │ │ │ ├── baidu_baike-7.py │ │ │ │ ├── baidu_baike-8.py │ │ │ │ ├── baidu_baike-9.py │ │ │ │ └── baidu_baike.py │ │ ├── creat_mysql.md │ │ └── scrapy.cfg │ ├── craw_all_hudong │ │ ├── craw_all_hudong │ │ │ ├── __init__.py │ │ │ ├── commands │ │ │ │ ├── __init__.py │ │ │ │ └── crawlall.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ ├── setup.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ └── hudong_baike.py │ │ ├── creat_mysql.md │ │ └── scrapy.cfg │ ├── craw_without_spider │ │ ├── mysql │ │ │ ├── creat_sql.txt │ │ │ └── help_mysql.txt │ │ └── utils │ │ │ ├── basic_info.py │ │ │ ├── craw.py │ │ │ └── kg_movie_movie.sql │ ├── hudong_baike │ │ ├── hudong_baike │ │ │ ├── __init__.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ └── hudong_baike.py │ │ └── scrapy.cfg │ ├── news_spider │ │ ├── news │ │ │ └── __init__ │ │ ├── news_spider │ │ │ ├── __init__.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ └── huxiu_spider.py │ │ ├── readme.md │ │ └── scrapy.cfg │ └── weixin_spider │ │ ├── scrapy.cfg │ │ └── weixin_spider │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ ├── __init__.py │ │ └── weixin_spiders.py ├── deepdive │ ├── app.ddlog │ ├── db.url │ ├── deepdive.conf │ ├── input │ │ └── __init__.py │ ├── start_posql.sh │ └── udf │ │ ├── __init__.py │ │ ├── baidu_baike │ │ ├── baidu_baike │ │ │ ├── __init__.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ └── baidu_baike.py │ │ └── scrapy.cfg │ │ ├── extract_play_features.py │ │ ├── get_actor_movie.py │ │ ├── map_actor_mention.py │ │ ├── map_movie_mention.py │ │ ├── map_play_candidate.py │ │ ├── nlp_markup.sh │ │ ├── supervise_play.py │ │ └── trans.py ├── re_cnn_att │ ├── clean.py │ ├── data │ │ └── __init__.py │ ├── gen_re_from_baidu.py │ └── word2vec.py └── struct_to_rdf │ ├── baidu2neo4j │ ├── __init__.py │ ├── clean.py │ ├── cleanFile.py │ ├── gen_disambi_infobox.py │ ├── get_subject.py │ ├── header_file │ │ ├── disambi_headers.csv │ │ ├── disambi_infobox_header.csv │ │ ├── disambi_redirect_header.csv │ │ ├── disambi_subject_header.csv │ │ ├── redirect_header.csv │ │ ├── subject_header.csv │ │ ├── title_disambi_header.csv │ │ └── title_header.csv │ └── remove_disambi.py │ └── movie_actor │ ├── clean_actor.py │ ├── 
clean_mysql.py │ ├── complete_mysql.py │ ├── get_ttl.bat │ ├── get_ttl.sh │ ├── kg_demo_mapping_baidu_baike.ttl │ └── kg_movie_tultle.owl ├── img ├── actor_movie_genre.png ├── baike.png ├── example_REfO_KBQA.png ├── example_d2rq.png └── example_elastic_ss.png ├── knowledge_fusion └── silk │ ├── .idea │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ ├── modules.xml │ ├── silk.iml │ └── workspace.xml │ ├── batch_link.py │ └── run.py ├── requirement.text └── semantic_search └── elasticsearch ├── data └── __init__.py ├── query.py └── utils ├── __init__.py ├── build_dict.py ├── get_ac_attr.py ├── get_json.py ├── get_total_val.py ├── insert.py ├── query_cmd.sh └── views.py /.gitignore: -------------------------------------------------------------------------------- 1 | KBQA/actorName.txt 2 | KBQA/movieName.txt 3 | *.json 4 | *.pyc 5 | semantic_search/elasticsearch/data/*.txt 6 | semantic_search/elasticsearch/data/attr_ac.pkl 7 | ie/deepdive/udf/bazaar/* 8 | ie/deepdive/*.txt 9 | ie/deepdive/run/* 10 | ie/deepdive/*.csv 11 | *.txt 12 | *.csv 13 | ie/craw/craw_all_baidu/craws/* 14 | ie/re_cnn_att/data/*.csv 15 | ie/re_cnn_att/data/*.txt 16 | ie/re_cnn_att/data/*.pkl 17 | ie/re_cnn_att/data/*.json 18 | ie/re_cnn_att/thirdpart/* 19 | -------------------------------------------------------------------------------- /KBQA/patternREfO/data/get_dict.txt: -------------------------------------------------------------------------------- 1 | use baidu_baike; 2 | select actor_chName from actor into outfile '/var/lib/mysql-files/baidu_actorName.txt'; 3 | select movie_chName from movie into outfile '/var/lib/mysql-files/baidu_movieName.txt'; 4 | use hudong_baike; 5 | select actor_chName from actor into outfile '/var/lib/mysql-files/hudong_actorName.txt'; 6 | select movie_chName from movie into outfile '/var/lib/mysql-files/hudong_movieName.txt'; 7 | -------------------------------------------------------------------------------- /KBQA/patternREfO/get_dict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Export dict for movie and actor in hudong and baidu DB; 4 | # You need change the user and pwd for your own DB; 5 | mysql -uroot -pnlp < ./data/get_dict.txt 6 | 7 | sudo cp /var/lib/mysql-files/*Name.txt . 
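# Note on the export above (hedged): MySQL's INTO OUTFILE can only write under the
# server's secure_file_priv directory, which is commonly /var/lib/mysql-files on
# Debian/Ubuntu installs -- that is why the dumps land there and need sudo to copy out.
# The steps below merge the baidu/hudong exports, de-duplicate them, and append a jieba
# POS tag so word_tagging.py can load the files via jieba.load_userdict(), giving
# dictionary entries such as "周星驰 nr".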
8 | 9 | cat baidu_actorName.txt hudong_actorName.txt | sort -u > actorTmp.txt 10 | cat baidu_movieName.txt hudong_movieName.txt | sort -u > movieTmp.txt 11 | # Append the "nr" (person name) and "nz" (proper noun) POS tags for jieba 12 | awk '{print $0 " nr"}' actorTmp.txt > actorName.txt 13 | awk '{print $0 " nz"}' movieTmp.txt > movieName.txt 14 | 15 | # Remove the intermediate files 16 | rm baidu_*Name.txt hudong_*Name.txt actorTmp.txt movieTmp.txt 17 | -------------------------------------------------------------------------------- /KBQA/patternREfO/query.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from SPARQLWrapper import SPARQLWrapper, JSON 5 | from utils.word_tagging import Tagger 6 | from utils.rules import customize_rules 7 | 8 | if __name__ == "__main__": 9 | print("init...........") 10 | sparql_base = SPARQLWrapper("http://localhost:3030/kg_movie/query") 11 | tagger = Tagger(['data/actorName.txt', 'data/movieName.txt']) 12 | rules = customize_rules() 13 | print("done \n") 14 | 15 | while True: 16 | print("Please input your question: ") 17 | default_question = input() 18 | seg_list = tagger.get_word_objects(default_question) 19 | 20 | for rule in rules: 21 | query = rule.apply(seg_list) 22 | if query: 23 | sparql_base.setQuery(query) 24 | sparql_base.setReturnFormat(JSON) 25 | results = sparql_base.query().convert() 26 | 27 | if not results["results"]["bindings"]: 28 | print("No answer found :(") 29 | continue 30 | for result in results["results"]["bindings"]: 31 | print("Result: ", result["x0"]["value"]) 32 | -------------------------------------------------------------------------------- /KBQA/patternREfO/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/KBQA/patternREfO/utils/__init__.py -------------------------------------------------------------------------------- /KBQA/patternREfO/utils/rules.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import re 5 | from refo import finditer, Predicate, Star, Any 6 | 7 | # SPARQL config; the base ":" IRI below is an assumption (the kgdemo namespace used in the original tutorial) and must match the namespace of the generated RDF data 8 | SPARQL_PREAMBLE = u""" 9 | PREFIX : <http://www.kgdemo.com#> 10 | PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 11 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 12 | """ 13 | 14 | SPARQL_TEM = u"{preamble}\n" + \ 15 | u"SELECT DISTINCT {select} WHERE {{\n" + \ 16 | u"{expression}\n" + \ 17 | u"}}\n" 18 | 19 | INDENT = " " 20 | 21 | class W(Predicate): 22 | """object-oriented regex for words""" 23 | def __init__(self, token=".*", pos=".*"): 24 | self.token = re.compile(token + "$") 25 | self.pos = re.compile(pos + "$") 26 | super(W, self).__init__(self.match) 27 | 28 | def match(self, word): 29 | m1 = self.token.match(word.token) 30 | m2 = self.pos.match(word.pos) 31 | return m1 and m2 32 | 33 | class Rule(object): 34 | def __init__(self, condition=None, action=None): 35 | assert condition and action 36 | self.condition = condition 37 | self.action = action 38 | 39 | def apply(self, sentence): 40 | matches = [] 41 | for m in finditer(self.condition, sentence): 42 | i, j = m.span() 43 | matches.extend(sentence[i:j]) 44 | if __name__ == '__main__': 45 | pass 46 | return self.action(matches) 47 | 48 | def who_is_question(x): 49 | select = u"?x0" 50 | 51 | sparql = None 52 | for w in x: 53 | if w.pos == "nr" or w.pos == "x": 54 | e = u" ?a :actor_chName '{person}'. 
\n \ 55 | ?a :actor_bio ?x0".format(person=w.token) 56 | 57 | sparql = SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, 58 | select=select, 59 | expression=INDENT + e) 60 | break 61 | return sparql 62 | 63 | def where_is_from_question(x): 64 | select = u"?x0" 65 | 66 | sparql = None 67 | for w in x: 68 | if w.pos == "nr" or w.pos == "x" or w.pos == "nrt": 69 | e = u" ?a :actor_chName '{person}'.\n \ 70 | ?a :actor_birthPlace ?x0".format(person=w.token) 71 | 72 | sparql = SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, 73 | select=select, 74 | expression=INDENT + e) 75 | break 76 | return sparql 77 | 78 | 79 | def movie_intro_question(x): 80 | select = u"?x0" 81 | 82 | sparql = None 83 | for w in x: 84 | if w.pos == "nz": 85 | e = u" ?a :movie_chName '{person}'. \n \ 86 | ?a :movie_bio ?x0".format(person=w.token) 87 | 88 | sparql = SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, 89 | select=select, 90 | expression=INDENT + e) 91 | break 92 | return sparql 93 | 94 | def customize_rules(): 95 | # some rules for matching 96 | # TODO: customize your own rules here 97 | person = (W(pos="nr") | W(pos="x") | W(pos="nrt") | W(pos="nz")) 98 | movie = (W(pos="nz")) 99 | place = (W("出生地") | W("出生")) 100 | intro = (W("简介") | W(pos="介绍")) 101 | 102 | rules = [ 103 | 104 | Rule(condition=W(pos="r") + W("是") + person | \ 105 | person + W("是") + W(pos="r"), 106 | action=who_is_question), 107 | 108 | Rule(condition=person + Star(Any(), greedy=False) + place + Star(Any(), greedy=False), 109 | action=where_is_from_question), 110 | 111 | Rule(condition=movie + Star(Any(), greedy=False) + intro + Star(Any(), greedy=False) , 112 | action=movie_intro_question) 113 | 114 | ] 115 | return rules 116 | -------------------------------------------------------------------------------- /KBQA/patternREfO/utils/word_tagging.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | """ 4 | 5 | @author: SimmerChan 6 | 7 | @contact: hsl7698590@gmail.com 8 | 9 | @file: word_tagging.py 10 | 11 | @time: 2017/12/20 15:31 12 | 13 | @desc: 定义Word类的结构;定义Tagger类,实现自然语言转为Word对象的方法。 14 | 15 | """ 16 | import jieba 17 | import jieba.posseg as pseg 18 | 19 | 20 | class Word(object): 21 | def __init__(self, token, pos): 22 | self.token = token 23 | self.pos = pos 24 | 25 | 26 | class Tagger: 27 | def __init__(self, dict_paths): 28 | # TODO 加载外部词典 29 | for p in dict_paths: 30 | jieba.load_userdict(p) 31 | 32 | def get_word_objects(self, sentence): 33 | """ 34 | Get :class:WOrd(token, pos) 35 | """ 36 | return [Word(bytes.decode(word.encode('utf-8')), tag) for word, tag in pseg.cut(sentence)] 37 | 38 | if __name__ == '__main__': 39 | tagger = Tagger(['../data/actorName.txt', '../data/movieName.txt']) 40 | while True: 41 | s = input() 42 | print("tagger.get_word_objects(s): ", tagger.get_word_objects(s)) 43 | for i in tagger.get_word_objects(s): 44 | print(i.token, i.pos) 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | **knowledge graph,从零开始构建知识图谱,涵盖基础知识、构建理论、构建实战,从理论到实现。** 3 | 4 | ## 一、基础知识 5 | 1. [知识图谱基础 之 一.知识图谱基本概念](https://www.ljjyy.com/archives/2019/11/100629.html) 6 | 2. [知识图谱基础 之 二.知识表示与知识建模](https://www.ljjyy.com/archives/2019/11/100605.html) 7 | 3. [知识图谱基础 之 三.知识抽取](https://www.ljjyy.com/archives/2019/11/100606.html) 8 | 4. [知识图谱基础 之 四.知识挖掘](https://www.ljjyy.com/archives/2019/11/100607.html) 9 | 5. 
[知识图谱基础 之 五.知识存储](https://www.ljjyy.com/archives/2019/11/100608.html) 10 | 6. [知识图谱基础 之 六.知识融合](https://www.ljjyy.com/archives/2019/11/100609.html) 11 | 7. [知识图谱基础 之 七.知识推理](https://www.ljjyy.com/archives/2019/11/100610.html) 12 | 8. [知识图谱基础 之 八.语义搜索](https://www.ljjyy.com/archives/2019/11/100611.html) 13 | 9. [知识图谱基础 之 九.知识问答](https://www.ljjyy.com/archives/2019/11/100612.html) 14 | 15 | ## 二、论文方面(构建理论) 16 | 17 | 论文主要推荐两篇文章 18 | 19 | 1. 清华大学杨玉基的“[一种准确而高效的领域知识图谱构建方法](http://www.doc88.com/p-9979131856838.html)”。讲述了怎么通过4步进行半自动话的构建领域知识图谱,参考价值极大,步骤清晰。 20 | 21 | 2. 华东理工大学胡芳槐的博士论文“[基于多种数据源的中文知识图谱构建方法研究](http://www.doc88.com/p-0784652186719.html)”,这篇文章讲了怎么通过多数据源去构建通用知识图谱和行业知识图谱,比较详细的介绍了一些构建技术,具备一定参考价值。 22 | 23 | ## 三、博客方面(构建实战) 24 | 25 | 《从零开始学习知识图谱》系列文章,通过实战码代码,一步一步教你怎么构建一个电影领域知识图谱及百科知识图谱。 26 | 1. [从零开始学习知识图谱(一):电影知识图谱构建 1.半结构化数据的获取](https://www.ljjyy.com/archives/2019/10/100591.html) 27 | 2. [从零开始学习知识图谱(二):电影知识图谱构建 2.结构化数据到RDF以及基于Apache jena交互](https://www.ljjyy.com/archives/2019/10/100592.html) 28 | 3. [从零开始学习知识图谱(三):电影知识图谱构建 3.基于REfO的简单知识问答](https://www.ljjyy.com/archives/2019/10/100593.html) 29 | 4. [从零开始学习知识图谱(四):电影知识图谱构建 4.基于ElasticSearch的简单语义搜索](https://www.ljjyy.com/archives/2019/10/100594.html) 30 | 5. [从零开始学习知识图谱(五):电影知识图谱构建 5.基于Deepdive非结构化文本关系抽取](https://www.ljjyy.com/archives/2019/10/100595.html) 31 | 6. [从零开始学习知识图谱(六):电影知识图谱构建 6.将关系型数据存入图数据库Neo4j](https://www.ljjyy.com/archives/2019/10/100596.html) 32 | 7. [从零开始学习知识图谱(七):百科知识图谱构建 1.百科类知识抽取](https://www.ljjyy.com/archives/2019/10/100597.html) 33 | 8. [从零开始学习知识图谱(八):百科知识图谱构建 2.数据清洗及存入图数据库Neo4j](https://www.ljjyy.com/archives/2019/10/100598.html) 34 | 9. [从零开始学习知识图谱(九):百科知识图谱构建 3.基于TensorFlow神经网络关系抽取的数据集构建(使用OpenNRE)](https://www.ljjyy.com/archives/2019/10/100599.html) 35 | 10. [从零开始学习知识图谱(十):百科知识图谱构建 4.结构化数据到RDF](https://www.ljjyy.com/archives/2019/10/100600.html) 36 | 11. [从零开始学习知识图谱(十一):百科知识图谱构建 5.Jena使用及SPARQL查询](https://www.ljjyy.com/archives/2019/10/100601.html) 37 | 12. [从零开始学习知识图谱(十二):百科知识图谱构建 6.基于Silk知识融合](https://www.ljjyy.com/archives/2019/10/100602.html) 38 | 13. 
[从零开始学习知识图谱(十三):百科知识图谱构建 7.基于Silk批量知识融合](https://www.ljjyy.com/archives/2019/10/100603.html) 39 | 40 | -------------------------------------------------------------------------------- /ie/craw/baidu_baike/baidu_baike/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/baidu_baike/baidu_baike/__init__.py -------------------------------------------------------------------------------- /ie/craw/baidu_baike/baidu_baike/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BaiduBaikeItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # Actor 15 | # 包含演员相关属性 16 | actor_id = scrapy.Field() 17 | actor_bio = scrapy.Field() 18 | actor_chName = scrapy.Field() 19 | actor_foreName = scrapy.Field() 20 | actor_nationality = scrapy.Field() 21 | actor_constellation = scrapy.Field() 22 | actor_birthPlace = scrapy.Field() 23 | actor_birthDay = scrapy.Field() 24 | actor_repWorks = scrapy.Field() 25 | actor_achiem = scrapy.Field() 26 | actor_brokerage = scrapy.Field() 27 | 28 | # movie 29 | # 电影相关属性 30 | movie_id = scrapy.Field() 31 | movie_bio = scrapy.Field() 32 | movie_chName = scrapy.Field() 33 | movie_foreName = scrapy.Field() 34 | movie_prodTime = scrapy.Field() 35 | movie_prodCompany = scrapy.Field() 36 | movie_director = scrapy.Field() 37 | movie_screenwriter = scrapy.Field() 38 | movie_genre = scrapy.Field() 39 | movie_star = scrapy.Field() 40 | movie_length = scrapy.Field() 41 | movie_rekeaseTime = scrapy.Field() 42 | movie_language = scrapy.Field() 43 | movie_achiem = scrapy.Field() 44 | -------------------------------------------------------------------------------- /ie/craw/baidu_baike/baidu_baike/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import random 10 | 11 | 12 | class WeixinSpiderSpiderMiddleware(object): 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, dict or Item objects. 
36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Response, dict 44 | # or Item objects. 45 | pass 46 | 47 | def process_start_requests(self, start_requests, spider): 48 | # Called with the start requests of the spider, and works 49 | # similarly to the process_spider_output() method, except 50 | # that it doesn’t have a response associated. 51 | 52 | # Must return only requests (not items). 53 | for r in start_requests: 54 | yield r 55 | 56 | def spider_opened(self, spider): 57 | spider.logger.info('Spider opened: %s' % spider.name) 58 | 59 | 60 | class RandomUserAgent: 61 | def __init__(self, agents): 62 | self.agents = [ 63 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 64 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0. 30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 65 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 66 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 67 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 68 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 69 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 70 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5" 71 | ] 72 | 73 | @classmethod 74 | def from_crawler(cls, crawler): 75 | # 获取settings的USER_AGENT列表并返回 76 | return cls(crawler.settings.getlist('USER_AGENTS')) 77 | 78 | def process_request(self, request, spider): 79 | # 随机设置Request报头header的User-Agent 80 | request.headers.setdefault('User-Agent', random.choice(self.agents)) 81 | 82 | 83 | # 添加代理 84 | 85 | class ProxyMiddleWare(object): 86 | proxy_list = [ 87 | "http://58.87.89.234:3128", 88 | "http://139.201.202.140:53281", 89 | "http://27.37.123.30:9000", 90 | "http://218.67.82.146:36709", 91 | "http://222.222.169.60:53281", 92 | "http://120.33.247.233:46884", 93 | "http://114.215.18.7:3128", 94 | "http://112.74.94.142:3128", 95 | "http://122.72.18.34:80", 96 | "http://36.33.25.123:808", 97 | "http://123.138.89.133:9999", 98 | "http://111.231.192.61:8080", 99 | "http://59.41.202.228:53281", 100 | "http://222.241.14.187:8888", 101 | "http://61.155.164.106:3128", 102 | "http://27.40.156.43:61234", 103 | "http://14.29.84.50:8080", 104 | "http://116.25.100.62:9797", 105 | "http://58.21.183.144:80", 106 | "http://14.221.166.205:9000", 107 | "http://115.231.50.10:53281", 108 | "http://120.34.205.40:808", 109 | "http://123.139.56.238:9999", 110 | "http://113.116.170.232:9000", 111 | "http://116.17.236.36:808", 112 | "http://114.232.163.73:34837", 113 | "http://171.35.103.37:808", 114 | "http://27.46.51.232:9797", 115 | "http://223.247.255.207:24714", 116 | "http://223.241.117.179:8010", 117 | "http://222.186.12.102:57624"] 118 | 119 | 120 | def process_request(self, request, spider): 121 | # if not request.meta['proxies']: 122 | ip = random.choice(self.proxy_list) 123 | request.meta['proxy'] = ip 
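    # The free proxies hard-coded above go stale quickly; if most requests start
    # failing, refresh the list or disable this middleware in DOWNLOADER_MIDDLEWARES.
    # One possible extension (a sketch, not part of the original class) is to swap in
    # a different proxy and retry when a download fails, using Scrapy's standard
    # downloader-middleware hook:
    #
    #     def process_exception(self, request, exception, spider):
    #         # returning the request re-schedules it with the newly chosen proxy
    #         request.meta['proxy'] = random.choice(self.proxy_list)
    #         return request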
124 | 125 | -------------------------------------------------------------------------------- /ie/craw/baidu_baike/baidu_baike/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | #import sys 12 | #from importlib import reload 13 | 14 | #reload(sys) 15 | #sys.setdefaultencoding('utf-8') 16 | 17 | import pymysql 18 | from pymysql import connections 19 | from baidu_baike import settings 20 | 21 | class BaiduBaikePipeline(object): 22 | def __init__(self): 23 | # 初始化并连接到mysql数据库 24 | self.conn = pymysql.connect( 25 | host=settings.HOST_IP, 26 | port=settings.PORT, 27 | user=settings.USER, 28 | passwd=settings.PASSWD, 29 | db=settings.DB_NAME, 30 | charset='utf8mb4', 31 | use_unicode=True 32 | ) 33 | self.cursor = self.conn.cursor() 34 | 35 | def process_item(self, item, spider): 36 | # process info for actor 37 | actor_chName = str(item['actor_chName']).encode('utf-8') 38 | actor_foreName = str(item['actor_foreName']).encode('utf-8') 39 | movie_chName = str(item['movie_chName']).encode('utf-8') 40 | movie_foreName = str(item['movie_foreName']).encode('utf-8') 41 | 42 | if (item['actor_chName'] != None or item['actor_foreName'] != None) and item['movie_chName'] == None: 43 | actor_bio = str(item['actor_bio']).encode('utf-8') 44 | actor_nationality = str(item['actor_nationality']).encode('utf-8') 45 | actor_constellation = str(item['actor_constellation']).encode('utf-8') 46 | actor_birthPlace = str(item['actor_birthPlace']).encode('utf-8') 47 | actor_birthDay = str(item['actor_birthDay']).encode('utf-8') 48 | actor_repWorks = str(item['actor_repWorks']).encode('utf-8') 49 | actor_achiem = str(item['actor_achiem']).encode('utf-8') 50 | actor_brokerage = str(item['actor_brokerage']).encode('utf-8') 51 | 52 | self.cursor.execute("SELECT actor_chName FROM actor;") 53 | actorList = self.cursor.fetchall() 54 | if (actor_chName,) not in actorList : 55 | # get the nums of actor_id in table actor 56 | self.cursor.execute("SELECT MAX(actor_id) FROM actor") 57 | result = self.cursor.fetchall()[0] 58 | if None in result: 59 | actor_id = 1 60 | else: 61 | actor_id = result[0] + 1 62 | sql = """ 63 | INSERT INTO actor(actor_id, actor_bio, actor_chName, actor_foreName, actor_nationality, actor_constellation, actor_birthPlace, actor_birthDay, actor_repWorks, actor_achiem, actor_brokerage ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 64 | """ 65 | self.cursor.execute(sql, (actor_id, actor_bio, actor_chName, actor_foreName, actor_nationality, actor_constellation, actor_birthPlace, actor_birthDay, actor_repWorks, actor_achiem, actor_brokerage )) 66 | self.conn.commit() 67 | else: 68 | print("#" * 20, "Got a duplict actor!!", actor_chName) 69 | elif (item['movie_chName'] != None or item['movie_foreName'] != None) and item['actor_chName'] == None: 70 | movie_bio = str(item['movie_bio']).encode('utf-8') 71 | movie_prodTime = str(item['movie_prodTime']).encode('utf-8') 72 | movie_prodCompany = str(item['movie_prodCompany']).encode('utf-8') 73 | movie_director = str(item['movie_director']).encode('utf-8') 74 | movie_screenwriter = str(item['movie_screenwriter']).encode('utf-8') 75 | movie_genre = 
str(item['movie_genre']).encode('utf-8') 76 | movie_star = str(item['movie_star']).encode('utf-8') 77 | movie_length = str(item['movie_length']).encode('utf-8') 78 | movie_rekeaseTime = str(item['movie_rekeaseTime']).encode('utf-8') 79 | movie_language = str(item['movie_language']).encode('utf-8') 80 | movie_achiem = str(item['movie_achiem']).encode('utf-8') 81 | 82 | self.cursor.execute("SELECT movie_chName FROM movie;") 83 | movieList = self.cursor.fetchall() 84 | if (movie_chName,) not in movieList : 85 | self.cursor.execute("SELECT MAX(movie_id) FROM movie") 86 | result = self.cursor.fetchall()[0] 87 | if None in result: 88 | movie_id = 1 89 | else: 90 | movie_id = result[0] + 1 91 | sql = """ 92 | INSERT INTO movie( movie_id, movie_bio, movie_chName, movie_foreName, movie_prodTime, movie_prodCompany, movie_director, movie_screenwriter, movie_genre, movie_star, movie_length, movie_rekeaseTime, movie_language, movie_achiem ) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 93 | """ 94 | self.cursor.execute(sql, ( movie_id, movie_bio, movie_chName, movie_foreName, movie_prodTime, movie_prodCompany, movie_director, movie_screenwriter, movie_genre, movie_star, movie_length, movie_rekeaseTime, movie_language, movie_achiem )) 95 | self.conn.commit() 96 | else: 97 | print("Got a duplict movie!!", movie_chName) 98 | else: 99 | print("Skip this page because wrong category!! ") 100 | return item 101 | def close_spider(self, spider): 102 | self.conn.close() 103 | -------------------------------------------------------------------------------- /ie/craw/baidu_baike/baidu_baike/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for baidu_baike project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'baidu_baike' 13 | 14 | SPIDER_MODULES = ['baidu_baike.spiders'] 15 | NEWSPIDER_MODULE = 'baidu_baike.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'baidu_baike (+http://www.yourdomain.com)' 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | # DOWNLOAD_DELAY = 3 30 | # import random 31 | # DOWNLOAD_DELAY = random.randint(0, 1) 32 | # The download delay setting will honor only one of: 33 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | # CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | # COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | # TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | # DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | # } 47 | 48 | # Enable or disable spider middlewares 49 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 50 | # SPIDER_MIDDLEWARES = { 51 | # 'baidu_baike.middlewares.BaiduBaikeSpiderMiddleware': 543, 52 | # } 53 | 54 | # Enable or disable downloader middlewares 55 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 56 | # DOWNLOADER_MIDDLEWARES = { 57 | # 'baidu_baike.middlewares.BaiduBaikeDownloaderMiddleware': 543, 58 | # } 59 | DOWNLOADER_MIDDLEWARES = { 60 | 'baidu_baike.middlewares.RandomUserAgent': 10, 61 | 'baidu_baike.middlewares.ProxyMiddleWare': 100, 62 | } 63 | # Enable or disable extensions 64 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 65 | # EXTENSIONS = { 66 | # 'scrapy.extensions.telnet.TelnetConsole': None, 67 | # } 68 | 69 | # Configure item pipelines 70 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 71 | # ITEM_PIPELINES = { 72 | # 'baidu_baike.pipelines.BaiduBaikePipeline': 300, 73 | # } 74 | ITEM_PIPELINES = { 75 | 'baidu_baike.pipelines.BaiduBaikePipeline': 300, 76 | } 77 | 78 | # HOST_IP = 'localhost' 79 | # PORT = 3306 80 | # USER = 'root' 81 | # PASSWD = 'root' 82 | # DB_NAME = 'kg_movie' 83 | HOST_IP = 'localhost' 84 | PORT = 3306 85 | USER = 'root' 86 | PASSWD = 'root' 87 | DB_NAME = 'baidu_baike' 88 | # Enable and configure the AutoThrottle extension (disabled by default) 89 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 90 | # AUTOTHROTTLE_ENABLED = True 91 | # The initial download delay 92 | # AUTOTHROTTLE_START_DELAY = 5 93 | # The maximum download delay to be set in case of high latencies 94 | # AUTOTHROTTLE_MAX_DELAY = 60 95 | # The average number of requests Scrapy should be sending in parallel to 96 | # each remote server 97 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 98 | # Enable showing throttling 
stats for every response received: 99 | # AUTOTHROTTLE_DEBUG = False 100 | 101 | # Enable and configure HTTP caching (disabled by default) 102 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 103 | # HTTPCACHE_ENABLED = True 104 | # HTTPCACHE_EXPIRATION_SECS = 0 105 | # HTTPCACHE_DIR = 'httpcache' 106 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 107 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 108 | -------------------------------------------------------------------------------- /ie/craw/baidu_baike/baidu_baike/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ie/craw/baidu_baike/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = baidu_baike.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = baidu_baike 12 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/craw_all_baidu/baidu_baike/__init__.py -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/craw_all_baidu/baidu_baike/commands/__init__.py -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/commands/crawlall.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from scrapy.commands import ScrapyCommand 5 | from scrapy.crawler import CrawlerRunner 6 | from scrapy.exceptions import UsageError 7 | from scrapy.utils.project import get_project_settings 8 | from scrapy.crawler import Crawler 9 | from scrapy.utils.conf import arglist_to_dict 10 | 11 | class Command(ScrapyCommand): 12 | 13 | requires_project = True 14 | 15 | def syntax(self): 16 | return '[options]' 17 | 18 | def short_desc(self): 19 | return 'Runs all of the spiders' 20 | 21 | def add_options(self, parser): 22 | ScrapyCommand.add_options(self, parser) 23 | parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", 24 | help="set spider argument (may be repeated)") 25 | parser.add_option("-o", "--output", metavar="FILE", 26 | help="dump scraped items into FILE (use - for stdout)") 27 | parser.add_option("-t", "--output-format", metavar="FORMAT", 28 | help="format to use for dumping items with -o") 29 | 30 | def process_options(self, args, opts): 31 | ScrapyCommand.process_options(self, args, opts) 32 | try: 33 | opts.spargs = arglist_to_dict(opts.spargs) 34 | except ValueError: 35 | raise UsageError("Invalid -a 
value, use -a NAME=VALUE", print_help=False) 36 | 37 | def run(self, args, opts): 38 | #settings = get_project_settings() 39 | 40 | spider_loader = self.crawler_process.spider_loader 41 | for spidername in args or spider_loader.list(): 42 | print("*********cralall spidername************" + spidername) 43 | self.crawler_process.crawl(spidername, **opts.spargs) 44 | 45 | self.crawler_process.start() 46 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BaiduBaikeItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | title_id = scrapy.Field() 16 | abstract = scrapy.Field() 17 | infobox = scrapy.Field() 18 | subject = scrapy.Field() 19 | disambi = scrapy.Field() 20 | redirect = scrapy.Field() 21 | curLink = scrapy.Field() 22 | interPic = scrapy.Field() 23 | interLink = scrapy.Field() 24 | exterLink = scrapy.Field() 25 | relateLemma = scrapy.Field() 26 | all_text = scrapy.Field() 27 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import random 10 | 11 | class WeixinSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | class RandomUserAgent: 59 | def __init__(self, agents): 60 | self.agents =[ 61 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 62 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0. 30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 63 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 64 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 65 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 66 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 67 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 68 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5" 69 | ] 70 | 71 | @classmethod 72 | def from_crawler(cls,crawler): 73 | # 获取settings的USER_AGENT列表并返回 74 | return cls(crawler.settings.getlist('USER_AGENTS')) 75 | def process_request(self, request, spider): 76 | # 随机设置Request报头header的User-Agent 77 | request.headers.setdefault('User-Agent', random.choice(self.agents)) 78 | 79 | # 添加代理 80 | 81 | class ProxyMiddleWare(object): 82 | proxy_list=[ 83 | "http://58.87.89.234:3128", 84 | "http://139.201.202.140:53281", 85 | "http://27.37.123.30:9000", 86 | "http://218.67.82.146:36709", 87 | "http://222.222.169.60:53281", 88 | "http://120.33.247.233:46884", 89 | "http://114.215.18.7:3128", 90 | "http://112.74.94.142:3128", 91 | "http://122.72.18.34:80", 92 | "http://36.33.25.123:808", 93 | "http://123.138.89.133:9999", 94 | "http://111.231.192.61:8080", 95 | "http://59.41.202.228:53281", 96 | "http://222.241.14.187:8888", 97 | "http://61.155.164.106:3128", 98 | "http://27.40.156.43:61234", 99 | "http://14.29.84.50:8080", 100 | "http://116.25.100.62:9797", 101 | "http://58.21.183.144:80", 102 | "http://14.221.166.205:9000", 103 | "http://115.231.50.10:53281", 104 | "http://120.34.205.40:808", 105 | "http://123.139.56.238:9999", 106 | "http://113.116.170.232:9000", 107 | "http://116.17.236.36:808", 108 | "http://114.232.163.73:34837", 109 | "http://171.35.103.37:808", 110 | "http://27.46.51.232:9797", 111 | "http://223.247.255.207:24714", 112 | "http://223.241.117.179:8010", 113 | "http://222.186.12.102:57624"] 114 | 115 | def process_request(self,request,spider): 116 | # if not request.meta['proxies']: 117 | ip = random.choice(self.proxy_list) 118 | request.meta['proxy'] = ip 119 | 120 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | 12 | import pymysql 13 | from pymysql import connections 14 | from baidu_baike import 
settings 15 | 16 | class BaiduBaikePipeline(object): 17 | def __init__(self): 18 | self.conn = pymysql.connect( 19 | host=settings.HOST_IP, 20 | port=settings.PORT, 21 | user=settings.USER, 22 | passwd=settings.PASSWD, 23 | db=settings.DB_NAME, 24 | charset='utf8mb4', 25 | use_unicode=True 26 | ) 27 | self.cursor = self.conn.cursor() 28 | 29 | def process_item(self, item, spider): 30 | # process info for actor 31 | title = str(item['title']).encode('utf-8') 32 | title_id = str(item['title_id']).encode('utf-8') 33 | abstract = str(item['abstract']).encode('utf-8') 34 | infobox = str(item['infobox']).encode('utf-8') 35 | subject = str(item['subject']).encode('utf-8') 36 | disambi = str(item['disambi']).encode('utf-8') 37 | redirect = str(item['redirect']).encode('utf-8') 38 | curLink = str(item['curLink']).encode('utf-8') 39 | interPic = str(item['interPic']).encode('utf-8') 40 | interLink = str(item['interLink']).encode('utf-8') 41 | exterLink = str(item['exterLink']).encode('utf-8') 42 | relateLemma = str(item['relateLemma']).encode('utf-8') 43 | all_text = str(item['all_text']).encode('utf-8') 44 | 45 | # self.cursor.execute("SELECT disambi FROM lemmas;") 46 | # disambi_list = self.cursor.fetchall() 47 | # if (disambi,) not in disambi_list : 48 | self.cursor.execute("SELECT MAX(title_id) FROM lemmas") 49 | result = self.cursor.fetchall()[0] 50 | if None in result: 51 | title_id = 1 52 | else: 53 | title_id = result[0] + 1 54 | sql = """ 55 | INSERT INTO lemmas(title, title_id, abstract, infobox, subject, disambi, redirect, curLink, interPic, interLink, exterLink, relateLemma, all_text ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 56 | """ 57 | try: 58 | # disambi_list = self.cursor.fetchall() 59 | # if (disambi, ) in disambi_list: 60 | # print ("result: ", disambi) 61 | self.cursor.execute(sql, (title, title_id, abstract, infobox, subject, disambi, redirect, curLink, interPic, interLink, exterLink, relateLemma, all_text )) 62 | self.conn.commit() 63 | # self.cursor.execute("SELECT disambi FROM lemmas" ) 64 | except Exception as e: 65 | print("#"*20, "\nAn error when insert into mysql!!\n") 66 | print("curLink: ", curLink, "\n") 67 | print(e, "\n", "#"*20) 68 | try: 69 | all_text = str('None').encode('utf-8').encode('utf-8') 70 | self.cursor.execute(sql, (title, title_id, abstract, infobox, subject, disambi, redirect, curLink, interPic, interLink, exterLink, relateLemma, all_text )) 71 | self.conn.commit() 72 | except Exception as f: 73 | print("Error without all_text!!!") 74 | return item 75 | 76 | def close_spider(self, spider): 77 | self.conn.close() 78 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for baidu_baike project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'baidu_baike' 13 | 14 | SPIDER_MODULES = ['baidu_baike.spiders'] 15 | NEWSPIDER_MODULE = 'baidu_baike.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'baidu_baike (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | CONCURRENT_REQUESTS = 300 26 | DOWNLOAD_TIMEOUT=30 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | #import random 32 | #DOWNLOAD_DELAY = random.randint(0, 1) 33 | # The download delay setting will honor only one of: 34 | CONCURRENT_REQUESTS_PER_DOMAIN = 100 35 | CONCURRENT_REQUESTS_PER_IP = 100 36 | 37 | # Disable cookies (enabled by default) 38 | COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'baidu_baike.middlewares.BaiduBaikeSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'baidu_baike.middlewares.BaiduBaikeDownloaderMiddleware': 543, 59 | #} 60 | DOWNLOADER_MIDDLEWARES = { 61 | 'baidu_baike.middlewares.RandomUserAgent': 10, 62 | 'baidu_baike.middlewares.ProxyMiddleWare': 100, 63 | } 64 | # Enable or disable extensions 65 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 72 | #ITEM_PIPELINES = { 73 | # 'baidu_baike.pipelines.BaiduBaikePipeline': 300, 74 | #} 75 | ITEM_PIPELINES = { 76 | 'baidu_baike.pipelines.BaiduBaikePipeline': 300, 77 | } 78 | 79 | HOST_IP = 'localhost' 80 | PORT = 3306 81 | USER = 'root' 82 | PASSWD = 'root' 83 | DB_NAME = 'baidu_duplicate' 84 | 85 | # Enable and configure the AutoThrottle extension (disabled by default) 86 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 87 | #AUTOTHROTTLE_ENABLED = True 88 | # The initial download delay 89 | #AUTOTHROTTLE_START_DELAY = 5 90 | # The maximum download delay to be set in case of high latencies 91 | #AUTOTHROTTLE_MAX_DELAY = 60 92 | # The average number of requests Scrapy should be sending in parallel to 93 | # each remote server 94 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 95 | # Enable showing throttling stats for every response received: 96 | #AUTOTHROTTLE_DEBUG = False 97 | 98 | # Enable and configure HTTP caching 
(disabled by default) 99 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 100 | #HTTPCACHE_ENABLED = True 101 | #HTTPCACHE_EXPIRATION_SECS = 0 102 | #HTTPCACHE_DIR = 'httpcache' 103 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 104 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 105 | 106 | COMMANDS_MODULE = 'baidu_baike.commands' 107 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from setuptools import setup, find_packages 5 | 6 | setup(name='scrapy-mymodule', 7 | entry_points={ 8 | 'scrapy.commands': [ 9 | 'crawlall=baidu_baike.commands:crawlall', 10 | ], 11 | }, 12 | ) 13 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/spiders/baidu_baike-2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # coding=utf-8 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | 9 | from baidu_baike.items import BaiduBaikeItem 10 | from scrapy.utils.log import configure_logging 11 | import scrapy 12 | from scrapy.crawler import CrawlerRunner 13 | from twisted.internet import reactor 14 | from scrapy.http import Request 15 | from bs4 import BeautifulSoup 16 | import re 17 | import urllib 18 | import json 19 | 20 | class BaiduBaikeSpider(scrapy.Spider, object): 21 | name = 'baidu2' 22 | allowed_domains = ["baike.baidu.com"] 23 | start_urls = ['https://baike.baidu.com/item/%E5%91%A8%E6%98%9F%E9%A9%B0/169917?fr=aladdin'] 24 | 25 | def _get_from_findall(self, tag_list): 26 | result = [] 27 | for slist in tag_list: 28 | tmp = slist.get_text() 29 | result.append(tmp) 30 | return result 31 | 32 | def parse(self, response): 33 | # tooooo ugly,,,, but can not use defaultdict 34 | item = BaiduBaikeItem() 35 | for sub_item in [ 'title', 'title_id', 'abstract', 'infobox', 'subject', 'disambi', 'redirect', 'curLink', 'interPic', 'interLink', 'exterLink', 'relateLemma']: 36 | item[sub_item] = None 37 | 38 | mainTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()").extract() 39 | subTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()").extract() 40 | redirect_name = response.xpath("//span[@class='viewTip-fromTitle']/text()").extract() 41 | try: 42 | item['title'] = ' '.join(mainTitle) 43 | except: 44 | item['title'] = None 45 | try: 46 | item['disambi'] = ' '.join(mainTitle + subTitle) 47 | except: 48 | item['disambi'] = None 49 | try: 50 | item['redirect'] = ' '.join(redirect_name) 51 | except: 52 | item['redirect'] = None 53 | try: 54 | item['curLink'] = str(response.url) 55 | except: 56 | item['curLink'] = None 57 | 58 | soup = BeautifulSoup(response.text, 'lxml') 59 | summary_node = soup.find("div", class_ = "lemma-summary") 60 | try: 61 | item['abstract'] = 
summary_node.get_text().replace("\n"," ") 62 | except: 63 | item['abstract'] = None 64 | 65 | page_category = response.xpath("//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract() 66 | page_category = [l.strip() for l in page_category] 67 | try: 68 | item['subject'] = ','.join(page_category) 69 | except: 70 | item['subject'] = None 71 | 72 | # Get infobox 73 | all_basicInfo_Item = soup.find_all("dt", class_="basicInfo-item name") 74 | basic_item = self._get_from_findall(all_basicInfo_Item) 75 | basic_item = [s.strip().replace('\n', ' ') for s in basic_item] 76 | all_basicInfo_value = soup.find_all("dd", class_ = "basicInfo-item value" ) 77 | basic_value = self._get_from_findall(all_basicInfo_value) 78 | basic_value = [s.strip().replace(u'收起', '') for s in basic_value] 79 | info_dict = {} 80 | for i, info in enumerate(basic_item): 81 | info_dict[info] = basic_value[i] 82 | try: 83 | item['infobox'] = json.dumps(info_dict) 84 | except: 85 | item['infobox'] = None 86 | 87 | # Get inter picture 88 | selector = scrapy.Selector(response) 89 | img_path = selector.xpath("//img[@class='picture']/@src").extract() 90 | try: 91 | item['interPic'] = ','.join(img_path) 92 | except: 93 | item['interPic'] = None 94 | 95 | inter_links_dict = {} 96 | soup = BeautifulSoup(response.text, 'lxml') 97 | inter_links = soup.find_all('a', href=re.compile(r"/item/")) 98 | for link in inter_links: 99 | new_url = link["href"] 100 | url_name = link.get_text() 101 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 102 | inter_links_dict[url_name] = new_full_url 103 | try: 104 | item['interLink'] = json.dumps(inter_links_dict) 105 | except: 106 | item['interLink'] = None 107 | 108 | exter_links_dict = {} 109 | soup = BeautifulSoup(response.text, 'lxml') 110 | exterLink_links = soup.find_all('a', href=re.compile(r"/redirect/")) 111 | for link in exterLink_links: 112 | new_url = link["href"] 113 | url_name = link.get_text() 114 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 115 | exter_links_dict[url_name] = new_full_url 116 | try: 117 | item['exterLink'] = json.dumps(exter_links_dict) 118 | except: 119 | item['exterLink'] = None 120 | 121 | all_para = soup.find_all('div',class_="para") 122 | all_text = [para.get_text() for para in all_para] 123 | try: 124 | item['all_text'] = ' '.join(all_text) 125 | except: 126 | item['all_text'] = None 127 | 128 | yield item 129 | 130 | soup = BeautifulSoup(response.text, 'lxml') 131 | links = soup.find_all('a', href=re.compile(r"/item/")) 132 | for link in links: 133 | new_url = link["href"] 134 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 135 | yield scrapy.Request(new_full_url, callback=self.parse) 136 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/spiders/baidu_baike-3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # coding=utf-8 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | 9 | from baidu_baike.items import BaiduBaikeItem 10 | from scrapy.utils.log import configure_logging 11 | import scrapy 12 | from scrapy.crawler import CrawlerRunner 13 | from twisted.internet import reactor 14 | from scrapy.http import Request 15 | from bs4 import BeautifulSoup 16 | import re 17 | import urllib 18 | import json 19 | 20 | class BaiduBaikeSpider(scrapy.Spider, object): 21 | 
name = 'baidu3' 22 | allowed_domains = ["baike.baidu.com"] 23 | start_urls = ['https://baike.baidu.com/item/%E4%B8%83%E5%B0%8F%E7%A6%8F'] 24 | 25 | def _get_from_findall(self, tag_list): 26 | result = [] 27 | for slist in tag_list: 28 | tmp = slist.get_text() 29 | result.append(tmp) 30 | return result 31 | 32 | def parse(self, response): 33 | # tooooo ugly,,,, but can not use defaultdict 34 | item = BaiduBaikeItem() 35 | for sub_item in [ 'title', 'title_id', 'abstract', 'infobox', 'subject', 'disambi', 'redirect', 'curLink', 'interPic', 'interLink', 'exterLink', 'relateLemma']: 36 | item[sub_item] = None 37 | 38 | mainTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()").extract() 39 | subTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()").extract() 40 | redirect_name = response.xpath("//span[@class='viewTip-fromTitle']/text()").extract() 41 | try: 42 | item['title'] = ' '.join(mainTitle) 43 | except: 44 | item['title'] = None 45 | try: 46 | item['disambi'] = ' '.join(mainTitle + subTitle) 47 | except: 48 | item['disambi'] = None 49 | try: 50 | item['redirect'] = ' '.join(redirect_name) 51 | except: 52 | item['redirect'] = None 53 | try: 54 | item['curLink'] = str(response.url) 55 | except: 56 | item['curLink'] = None 57 | 58 | soup = BeautifulSoup(response.text, 'lxml') 59 | summary_node = soup.find("div", class_ = "lemma-summary") 60 | try: 61 | item['abstract'] = summary_node.get_text().replace("\n"," ") 62 | except: 63 | item['abstract'] = None 64 | 65 | page_category = response.xpath("//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract() 66 | page_category = [l.strip() for l in page_category] 67 | try: 68 | item['subject'] = ','.join(page_category) 69 | except: 70 | item['subject'] = None 71 | 72 | # Get infobox 73 | all_basicInfo_Item = soup.find_all("dt", class_="basicInfo-item name") 74 | basic_item = self._get_from_findall(all_basicInfo_Item) 75 | basic_item = [s.strip().replace('\n', ' ') for s in basic_item] 76 | all_basicInfo_value = soup.find_all("dd", class_ = "basicInfo-item value" ) 77 | basic_value = self._get_from_findall(all_basicInfo_value) 78 | basic_value = [s.strip().replace(u'收起', '') for s in basic_value] 79 | info_dict = {} 80 | for i, info in enumerate(basic_item): 81 | info_dict[info] = basic_value[i] 82 | try: 83 | item['infobox'] = json.dumps(info_dict) 84 | except: 85 | item['infobox'] = None 86 | 87 | # Get inter picture 88 | selector = scrapy.Selector(response) 89 | img_path = selector.xpath("//img[@class='picture']/@src").extract() 90 | try: 91 | item['interPic'] = ','.join(img_path) 92 | except: 93 | item['interPic'] = None 94 | 95 | inter_links_dict = {} 96 | soup = BeautifulSoup(response.text, 'lxml') 97 | inter_links = soup.find_all('a', href=re.compile(r"/item/")) 98 | for link in inter_links: 99 | new_url = link["href"] 100 | url_name = link.get_text() 101 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 102 | inter_links_dict[url_name] = new_full_url 103 | try: 104 | item['interLink'] = json.dumps(inter_links_dict) 105 | except: 106 | item['interLink'] = None 107 | 108 | exter_links_dict = {} 109 | soup = BeautifulSoup(response.text, 'lxml') 110 | exterLink_links = soup.find_all('a', href=re.compile(r"/redirect/")) 111 | for link in exterLink_links: 112 | new_url = link["href"] 113 | url_name = link.get_text() 114 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 115 | exter_links_dict[url_name] = new_full_url 116 | try: 117 | 
item['exterLink'] = json.dumps(exter_links_dict) 118 | except: 119 | item['exterLink'] = None 120 | 121 | all_para = soup.find_all('div',class_="para") 122 | all_text = [para.get_text() for para in all_para] 123 | try: 124 | item['all_text'] = ' '.join(all_text) 125 | except: 126 | item['all_text'] = None 127 | 128 | yield item 129 | 130 | soup = BeautifulSoup(response.text, 'lxml') 131 | links = soup.find_all('a', href=re.compile(r"/item/")) 132 | for link in links: 133 | new_url = link["href"] 134 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 135 | yield scrapy.Request(new_full_url, callback=self.parse) 136 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/spiders/baidu_baike-4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # coding=utf-8 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | 9 | from baidu_baike.items import BaiduBaikeItem 10 | from scrapy.utils.log import configure_logging 11 | import scrapy 12 | from scrapy.crawler import CrawlerRunner 13 | from twisted.internet import reactor 14 | from scrapy.http import Request 15 | from bs4 import BeautifulSoup 16 | import re 17 | import urllib 18 | import json 19 | 20 | class BaiduBaikeSpider(scrapy.Spider, object): 21 | name = 'baidu4' 22 | allowed_domains = ["baike.baidu.com"] 23 | start_urls = ['https://baike.baidu.com/item/%E9%AB%98%E6%A3%98%E9%BE%99'] 24 | 25 | def _get_from_findall(self, tag_list): 26 | result = [] 27 | for slist in tag_list: 28 | tmp = slist.get_text() 29 | result.append(tmp) 30 | return result 31 | 32 | def parse(self, response): 33 | # tooooo ugly,,,, but can not use defaultdict 34 | item = BaiduBaikeItem() 35 | for sub_item in [ 'title', 'title_id', 'abstract', 'infobox', 'subject', 'disambi', 'redirect', 'curLink', 'interPic', 'interLink', 'exterLink', 'relateLemma']: 36 | item[sub_item] = None 37 | 38 | mainTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()").extract() 39 | subTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()").extract() 40 | redirect_name = response.xpath("//span[@class='viewTip-fromTitle']/text()").extract() 41 | try: 42 | item['title'] = ' '.join(mainTitle) 43 | except: 44 | item['title'] = None 45 | try: 46 | item['disambi'] = ' '.join(mainTitle + subTitle) 47 | except: 48 | item['disambi'] = None 49 | try: 50 | item['redirect'] = ' '.join(redirect_name) 51 | except: 52 | item['redirect'] = None 53 | try: 54 | item['curLink'] = str(response.url) 55 | except: 56 | item['curLink'] = None 57 | 58 | soup = BeautifulSoup(response.text, 'lxml') 59 | summary_node = soup.find("div", class_ = "lemma-summary") 60 | try: 61 | item['abstract'] = summary_node.get_text().replace("\n"," ") 62 | except: 63 | item['abstract'] = None 64 | 65 | page_category = response.xpath("//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract() 66 | page_category = [l.strip() for l in page_category] 67 | try: 68 | item['subject'] = ','.join(page_category) 69 | except: 70 | item['subject'] = None 71 | 72 | # Get infobox 73 | all_basicInfo_Item = soup.find_all("dt", class_="basicInfo-item name") 74 | basic_item = self._get_from_findall(all_basicInfo_Item) 75 | basic_item = [s.strip().replace('\n', ' ') for s in basic_item] 76 | all_basicInfo_value = soup.find_all("dd", class_ = "basicInfo-item value" ) 77 | 
basic_value = self._get_from_findall(all_basicInfo_value) 78 | basic_value = [s.strip().replace(u'收起', '') for s in basic_value] 79 | info_dict = {} 80 | for i, info in enumerate(basic_item): 81 | info_dict[info] = basic_value[i] 82 | try: 83 | item['infobox'] = json.dumps(info_dict) 84 | except: 85 | item['infobox'] = None 86 | 87 | # Get inter picture 88 | selector = scrapy.Selector(response) 89 | img_path = selector.xpath("//img[@class='picture']/@src").extract() 90 | try: 91 | item['interPic'] = ','.join(img_path) 92 | except: 93 | item['interPic'] = None 94 | 95 | inter_links_dict = {} 96 | soup = BeautifulSoup(response.text, 'lxml') 97 | inter_links = soup.find_all('a', href=re.compile(r"/item/")) 98 | for link in inter_links: 99 | new_url = link["href"] 100 | url_name = link.get_text() 101 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 102 | inter_links_dict[url_name] = new_full_url 103 | try: 104 | item['interLink'] = json.dumps(inter_links_dict) 105 | except: 106 | item['interLink'] = None 107 | 108 | exter_links_dict = {} 109 | soup = BeautifulSoup(response.text, 'lxml') 110 | exterLink_links = soup.find_all('a', href=re.compile(r"/redirect/")) 111 | for link in exterLink_links: 112 | new_url = link["href"] 113 | url_name = link.get_text() 114 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 115 | exter_links_dict[url_name] = new_full_url 116 | try: 117 | item['exterLink'] = json.dumps(exter_links_dict) 118 | except: 119 | item['exterLink'] = None 120 | 121 | all_para = soup.find_all('div',class_="para") 122 | all_text = [para.get_text() for para in all_para] 123 | try: 124 | item['all_text'] = ' '.join(all_text) 125 | except: 126 | item['all_text'] = None 127 | 128 | yield item 129 | 130 | soup = BeautifulSoup(response.text, 'lxml') 131 | links = soup.find_all('a', href=re.compile(r"/item/")) 132 | for link in links: 133 | new_url = link["href"] 134 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 135 | yield scrapy.Request(new_full_url, callback=self.parse) 136 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/spiders/baidu_baike-5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # coding=utf-8 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | 9 | from baidu_baike.items import BaiduBaikeItem 10 | from scrapy.utils.log import configure_logging 11 | import scrapy 12 | from scrapy.crawler import CrawlerRunner 13 | from twisted.internet import reactor 14 | from scrapy.http import Request 15 | from bs4 import BeautifulSoup 16 | import re 17 | import urllib 18 | import json 19 | 20 | class BaiduBaikeSpider(scrapy.Spider, object): 21 | name = 'baidu5' 22 | allowed_domains = ["baike.baidu.com"] 23 | start_urls = ['https://baike.baidu.com/item/剑龙'] 24 | 25 | def _get_from_findall(self, tag_list): 26 | result = [] 27 | for slist in tag_list: 28 | tmp = slist.get_text() 29 | result.append(tmp) 30 | return result 31 | 32 | def parse(self, response): 33 | # tooooo ugly,,,, but can not use defaultdict 34 | item = BaiduBaikeItem() 35 | for sub_item in [ 'title', 'title_id', 'abstract', 'infobox', 'subject', 'disambi', 'redirect', 'curLink', 'interPic', 'interLink', 'exterLink', 'relateLemma']: 36 | item[sub_item] = None 37 | 38 | mainTitle = 
response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()").extract() 39 | subTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()").extract() 40 | redirect_name = response.xpath("//span[@class='viewTip-fromTitle']/text()").extract() 41 | try: 42 | item['title'] = ' '.join(mainTitle) 43 | except: 44 | item['title'] = None 45 | try: 46 | item['disambi'] = ' '.join(mainTitle + subTitle) 47 | except: 48 | item['disambi'] = None 49 | try: 50 | item['redirect'] = ' '.join(redirect_name) 51 | except: 52 | item['redirect'] = None 53 | try: 54 | item['curLink'] = str(response.url) 55 | except: 56 | item['curLink'] = None 57 | 58 | soup = BeautifulSoup(response.text, 'lxml') 59 | summary_node = soup.find("div", class_ = "lemma-summary") 60 | try: 61 | item['abstract'] = summary_node.get_text().replace("\n"," ") 62 | except: 63 | item['abstract'] = None 64 | 65 | page_category = response.xpath("//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract() 66 | page_category = [l.strip() for l in page_category] 67 | try: 68 | item['subject'] = ','.join(page_category) 69 | except: 70 | item['subject'] = None 71 | 72 | # Get infobox 73 | all_basicInfo_Item = soup.find_all("dt", class_="basicInfo-item name") 74 | basic_item = self._get_from_findall(all_basicInfo_Item) 75 | basic_item = [s.strip().replace('\n', ' ') for s in basic_item] 76 | all_basicInfo_value = soup.find_all("dd", class_ = "basicInfo-item value" ) 77 | basic_value = self._get_from_findall(all_basicInfo_value) 78 | basic_value = [s.strip().replace(u'收起', '') for s in basic_value] 79 | info_dict = {} 80 | for i, info in enumerate(basic_item): 81 | info_dict[info] = basic_value[i] 82 | try: 83 | item['infobox'] = json.dumps(info_dict) 84 | except: 85 | item['infobox'] = None 86 | 87 | # Get inter picture 88 | selector = scrapy.Selector(response) 89 | img_path = selector.xpath("//img[@class='picture']/@src").extract() 90 | try: 91 | item['interPic'] = ','.join(img_path) 92 | except: 93 | item['interPic'] = None 94 | 95 | inter_links_dict = {} 96 | soup = BeautifulSoup(response.text, 'lxml') 97 | inter_links = soup.find_all('a', href=re.compile(r"/item/")) 98 | for link in inter_links: 99 | new_url = link["href"] 100 | url_name = link.get_text() 101 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 102 | inter_links_dict[url_name] = new_full_url 103 | try: 104 | item['interLink'] = json.dumps(inter_links_dict) 105 | except: 106 | item['interLink'] = None 107 | 108 | exter_links_dict = {} 109 | soup = BeautifulSoup(response.text, 'lxml') 110 | exterLink_links = soup.find_all('a', href=re.compile(r"/redirect/")) 111 | for link in exterLink_links: 112 | new_url = link["href"] 113 | url_name = link.get_text() 114 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 115 | exter_links_dict[url_name] = new_full_url 116 | try: 117 | item['exterLink'] = json.dumps(exter_links_dict) 118 | except: 119 | item['exterLink'] = None 120 | 121 | all_para = soup.find_all('div',class_="para") 122 | all_text = [para.get_text() for para in all_para] 123 | try: 124 | item['all_text'] = ' '.join(all_text) 125 | except: 126 | item['all_text'] = None 127 | 128 | yield item 129 | 130 | soup = BeautifulSoup(response.text, 'lxml') 131 | links = soup.find_all('a', href=re.compile(r"/item/")) 132 | for link in links: 133 | new_url = link["href"] 134 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 135 | yield scrapy.Request(new_full_url, callback=self.parse) 136 
| -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/spiders/baidu_baike-6.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # coding=utf-8 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | 9 | from baidu_baike.items import BaiduBaikeItem 10 | from scrapy.utils.log import configure_logging 11 | import scrapy 12 | from scrapy.crawler import CrawlerRunner 13 | from twisted.internet import reactor 14 | from scrapy.http import Request 15 | from bs4 import BeautifulSoup 16 | import re 17 | import urllib 18 | import json 19 | 20 | class BaiduBaikeSpider(scrapy.Spider, object): 21 | name = 'baidu6' 22 | allowed_domains = ["baike.baidu.com"] 23 | start_urls = ['https://baike.baidu.com/item/%E5%89%91%E9%BE%99/6817480#viewPageContent'] 24 | 25 | def _get_from_findall(self, tag_list): 26 | result = [] 27 | for slist in tag_list: 28 | tmp = slist.get_text() 29 | result.append(tmp) 30 | return result 31 | 32 | def parse(self, response): 33 | # tooooo ugly,,,, but can not use defaultdict 34 | item = BaiduBaikeItem() 35 | for sub_item in [ 'title', 'title_id', 'abstract', 'infobox', 'subject', 'disambi', 'redirect', 'curLink', 'interPic', 'interLink', 'exterLink', 'relateLemma']: 36 | item[sub_item] = None 37 | 38 | mainTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()").extract() 39 | subTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()").extract() 40 | redirect_name = response.xpath("//span[@class='viewTip-fromTitle']/text()").extract() 41 | try: 42 | item['title'] = ' '.join(mainTitle) 43 | except: 44 | item['title'] = None 45 | try: 46 | item['disambi'] = ' '.join(mainTitle + subTitle) 47 | except: 48 | item['disambi'] = None 49 | try: 50 | item['redirect'] = ' '.join(redirect_name) 51 | except: 52 | item['redirect'] = None 53 | try: 54 | item['curLink'] = str(response.url) 55 | except: 56 | item['curLink'] = None 57 | 58 | soup = BeautifulSoup(response.text, 'lxml') 59 | summary_node = soup.find("div", class_ = "lemma-summary") 60 | try: 61 | item['abstract'] = summary_node.get_text().replace("\n"," ") 62 | except: 63 | item['abstract'] = None 64 | 65 | page_category = response.xpath("//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract() 66 | page_category = [l.strip() for l in page_category] 67 | try: 68 | item['subject'] = ','.join(page_category) 69 | except: 70 | item['subject'] = None 71 | 72 | # Get infobox 73 | all_basicInfo_Item = soup.find_all("dt", class_="basicInfo-item name") 74 | basic_item = self._get_from_findall(all_basicInfo_Item) 75 | basic_item = [s.strip().replace('\n', ' ') for s in basic_item] 76 | all_basicInfo_value = soup.find_all("dd", class_ = "basicInfo-item value" ) 77 | basic_value = self._get_from_findall(all_basicInfo_value) 78 | basic_value = [s.strip().replace(u'收起', '') for s in basic_value] 79 | info_dict = {} 80 | for i, info in enumerate(basic_item): 81 | info_dict[info] = basic_value[i] 82 | try: 83 | item['infobox'] = json.dumps(info_dict) 84 | except: 85 | item['infobox'] = None 86 | 87 | # Get inter picture 88 | selector = scrapy.Selector(response) 89 | img_path = selector.xpath("//img[@class='picture']/@src").extract() 90 | try: 91 | item['interPic'] = ','.join(img_path) 92 | except: 93 | item['interPic'] = None 94 | 95 | inter_links_dict = {} 96 | soup = 
BeautifulSoup(response.text, 'lxml') 97 | inter_links = soup.find_all('a', href=re.compile(r"/item/")) 98 | for link in inter_links: 99 | new_url = link["href"] 100 | url_name = link.get_text() 101 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 102 | inter_links_dict[url_name] = new_full_url 103 | try: 104 | item['interLink'] = json.dumps(inter_links_dict) 105 | except: 106 | item['interLink'] = None 107 | 108 | exter_links_dict = {} 109 | soup = BeautifulSoup(response.text, 'lxml') 110 | exterLink_links = soup.find_all('a', href=re.compile(r"/redirect/")) 111 | for link in exterLink_links: 112 | new_url = link["href"] 113 | url_name = link.get_text() 114 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 115 | exter_links_dict[url_name] = new_full_url 116 | try: 117 | item['exterLink'] = json.dumps(exter_links_dict) 118 | except: 119 | item['exterLink'] = None 120 | 121 | all_para = soup.find_all('div',class_="para") 122 | all_text = [para.get_text() for para in all_para] 123 | try: 124 | item['all_text'] = ' '.join(all_text) 125 | except: 126 | item['all_text'] = None 127 | 128 | yield item 129 | 130 | soup = BeautifulSoup(response.text, 'lxml') 131 | links = soup.find_all('a', href=re.compile(r"/item/")) 132 | for link in links: 133 | new_url = link["href"] 134 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 135 | yield scrapy.Request(new_full_url, callback=self.parse) 136 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/baidu_baike/spiders/baidu_baike-8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # coding=utf-8 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | 9 | from baidu_baike.items import BaiduBaikeItem 10 | from scrapy.utils.log import configure_logging 11 | import scrapy 12 | from scrapy.crawler import CrawlerRunner 13 | from twisted.internet import reactor 14 | from scrapy.http import Request 15 | from bs4 import BeautifulSoup 16 | import re 17 | import urllib 18 | import json 19 | 20 | class BaiduBaikeSpider(scrapy.Spider, object): 21 | name = 'baidu8' 22 | allowed_domains = ["baike.baidu.com"] 23 | start_urls = ['https://baike.baidu.com/item/%E8%9E%BA%E6%97%8B%E8%8A%A6%E8%8D%9F'] 24 | 25 | def _get_from_findall(self, tag_list): 26 | result = [] 27 | for slist in tag_list: 28 | tmp = slist.get_text() 29 | result.append(tmp) 30 | return result 31 | 32 | def parse(self, response): 33 | # tooooo ugly,,,, but can not use defaultdict 34 | item = BaiduBaikeItem() 35 | for sub_item in [ 'title', 'title_id', 'abstract', 'infobox', 'subject', 'disambi', 'redirect', 'curLink', 'interPic', 'interLink', 'exterLink', 'relateLemma']: 36 | item[sub_item] = None 37 | 38 | mainTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()").extract() 39 | subTitle = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()").extract() 40 | redirect_name = response.xpath("//span[@class='viewTip-fromTitle']/text()").extract() 41 | try: 42 | item['title'] = ' '.join(mainTitle) 43 | except: 44 | item['title'] = None 45 | try: 46 | item['disambi'] = ' '.join(mainTitle + subTitle) 47 | except: 48 | item['disambi'] = None 49 | try: 50 | item['redirect'] = ' '.join(redirect_name) 51 | except: 52 | item['redirect'] = None 53 | try: 54 | item['curLink'] = str(response.url) 55 | except: 56 | 
item['curLink'] = None 57 | 58 | soup = BeautifulSoup(response.text, 'lxml') 59 | summary_node = soup.find("div", class_ = "lemma-summary") 60 | try: 61 | item['abstract'] = summary_node.get_text().replace("\n"," ") 62 | except: 63 | item['abstract'] = None 64 | 65 | page_category = response.xpath("//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract() 66 | page_category = [l.strip() for l in page_category] 67 | try: 68 | item['subject'] = ','.join(page_category) 69 | except: 70 | item['subject'] = None 71 | 72 | # Get infobox 73 | all_basicInfo_Item = soup.find_all("dt", class_="basicInfo-item name") 74 | basic_item = self._get_from_findall(all_basicInfo_Item) 75 | basic_item = [s.strip().replace('\n', ' ') for s in basic_item] 76 | all_basicInfo_value = soup.find_all("dd", class_ = "basicInfo-item value" ) 77 | basic_value = self._get_from_findall(all_basicInfo_value) 78 | basic_value = [s.strip().replace(u'收起', '') for s in basic_value] 79 | info_dict = {} 80 | for i, info in enumerate(basic_item): 81 | info_dict[info] = basic_value[i] 82 | try: 83 | item['infobox'] = json.dumps(info_dict) 84 | except: 85 | item['infobox'] = None 86 | 87 | # Get inter picture 88 | selector = scrapy.Selector(response) 89 | img_path = selector.xpath("//img[@class='picture']/@src").extract() 90 | try: 91 | item['interPic'] = ','.join(img_path) 92 | except: 93 | item['interPic'] = None 94 | 95 | inter_links_dict = {} 96 | soup = BeautifulSoup(response.text, 'lxml') 97 | inter_links = soup.find_all('a', href=re.compile(r"/item/")) 98 | for link in inter_links: 99 | new_url = link["href"] 100 | url_name = link.get_text() 101 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 102 | inter_links_dict[url_name] = new_full_url 103 | try: 104 | item['interLink'] = json.dumps(inter_links_dict) 105 | except: 106 | item['interLink'] = None 107 | 108 | exter_links_dict = {} 109 | soup = BeautifulSoup(response.text, 'lxml') 110 | exterLink_links = soup.find_all('a', href=re.compile(r"/redirect/")) 111 | for link in exterLink_links: 112 | new_url = link["href"] 113 | url_name = link.get_text() 114 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 115 | exter_links_dict[url_name] = new_full_url 116 | try: 117 | item['exterLink'] = json.dumps(exter_links_dict) 118 | except: 119 | item['exterLink'] = None 120 | 121 | all_para = soup.find_all('div',class_="para") 122 | all_text = [para.get_text() for para in all_para] 123 | try: 124 | item['all_text'] = ' '.join(all_text) 125 | except: 126 | item['all_text'] = None 127 | 128 | yield item 129 | 130 | soup = BeautifulSoup(response.text, 'lxml') 131 | links = soup.find_all('a', href=re.compile(r"/item/")) 132 | for link in links: 133 | new_url = link["href"] 134 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 135 | yield scrapy.Request(new_full_url, callback=self.parse) 136 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/creat_mysql.md: -------------------------------------------------------------------------------- 1 | DROP DATABASE baidu_duplicate; 2 | 3 | CREATE DATABASE baidu_duplicate; 4 | 5 | USE baidu_duplicate; 6 | 7 | CREATE TABLE lemmas( title VARCHAR(100), title_id INT NOT NULL, abstract TEXT, infobox TEXT, subject VARCHAR(100), disambi VARCHAR(100), redirect VARCHAR(100), curLink TEXT, interPic TEXT, interLink TEXT, exterLink TEXT, relateLemma TEXT, all_text TEXT, PRIMARY KEY(title_id)); 8 | 9 | ALTER TABLE lemmas 
CONVERT TO CHARACTER SET utf8 COLLATE utf8_general_ci; 10 | 11 | ALTER table lemmas ADD INDEX title_index(title); 12 | 13 | ALTER table lemmas ADD INDEX subject_index(subject); 14 | 15 | ALTER table lemmas ADD INDEX disambi_index(disambi); 16 | -------------------------------------------------------------------------------- /ie/craw/craw_all_baidu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = baidu_baike.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = baidu_baike 12 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/craw_all_hudong/craw_all_hudong/__init__.py -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/craw_all_hudong/craw_all_hudong/commands/__init__.py -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/commands/crawlall.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from scrapy.commands import ScrapyCommand 5 | from scrapy.crawler import CrawlerRunner 6 | from scrapy.exceptions import UsageError 7 | from scrapy.utils.project import get_project_settings 8 | from scrapy.crawler import Crawler 9 | from scrapy.utils.conf import arglist_to_dict 10 | 11 | class Command(ScrapyCommand): 12 | 13 | requires_project = True 14 | 15 | def syntax(self): 16 | return '[options]' 17 | 18 | def short_desc(self): 19 | return 'Runs all of the spiders' 20 | 21 | def add_options(self, parser): 22 | ScrapyCommand.add_options(self, parser) 23 | parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", 24 | help="set spider argument (may be repeated)") 25 | parser.add_option("-o", "--output", metavar="FILE", 26 | help="dump scraped items into FILE (use - for stdout)") 27 | parser.add_option("-t", "--output-format", metavar="FORMAT", 28 | help="format to use for dumping items with -o") 29 | 30 | def process_options(self, args, opts): 31 | ScrapyCommand.process_options(self, args, opts) 32 | try: 33 | opts.spargs = arglist_to_dict(opts.spargs) 34 | except ValueError: 35 | raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False) 36 | 37 | def run(self, args, opts): 38 | #settings = get_project_settings() 39 | 40 | spider_loader = self.crawler_process.spider_loader 41 | for spidername in args or spider_loader.list(): 42 | print("*********cralall spidername************" + spidername) 43 | self.crawler_process.crawl(spidername, **opts.spargs) 44 | 45 | self.crawler_process.start() 46 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/items.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class CrawAllHudongItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | title_id = scrapy.Field() 16 | abstract = scrapy.Field() 17 | infobox = scrapy.Field() 18 | subject = scrapy.Field() 19 | disambi = scrapy.Field() 20 | redirect = scrapy.Field() 21 | curLink = scrapy.Field() 22 | interPic = scrapy.Field() 23 | interLink = scrapy.Field() 24 | exterLink = scrapy.Field() 25 | relateLemma = scrapy.Field() 26 | all_text = scrapy.Field() 27 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import random 10 | 11 | class HuDongSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | class RandomUserAgent: 59 | def __init__(self, agents): 60 | self.agents =[ 61 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 62 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0. 
30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 63 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 64 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 65 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 66 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 67 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 68 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5" 69 | ] 70 | 71 | @classmethod 72 | def from_crawler(cls,crawler): 73 | # 获取settings的USER_AGENT列表并返回 74 | return cls(crawler.settings.getlist('USER_AGENTS')) 75 | def process_request(self, request, spider): 76 | # 随机设置Request报头header的User-Agent 77 | request.headers.setdefault('User-Agent', random.choice(self.agents)) 78 | 79 | # 添加代理 80 | 81 | class ProxyMiddleWare(object): 82 | proxy_list=[ 83 | "http://58.87.89.234:3128", 84 | "http://139.201.202.140:53281", 85 | "http://27.37.123.30:9000", 86 | "http://218.67.82.146:36709", 87 | "http://222.222.169.60:53281", 88 | "http://120.33.247.233:46884", 89 | "http://114.215.18.7:3128", 90 | "http://112.74.94.142:3128", 91 | "http://122.72.18.34:80", 92 | "http://36.33.25.123:808", 93 | "http://123.138.89.133:9999", 94 | "http://111.231.192.61:8080", 95 | "http://59.41.202.228:53281", 96 | "http://222.241.14.187:8888", 97 | "http://61.155.164.106:3128", 98 | "http://27.40.156.43:61234", 99 | "http://14.29.84.50:8080", 100 | "http://116.25.100.62:9797", 101 | "http://58.21.183.144:80", 102 | "http://14.221.166.205:9000", 103 | "http://115.231.50.10:53281", 104 | "http://120.34.205.40:808", 105 | "http://123.139.56.238:9999", 106 | "http://113.116.170.232:9000", 107 | "http://116.17.236.36:808", 108 | "http://114.232.163.73:34837", 109 | "http://171.35.103.37:808", 110 | "http://27.46.51.232:9797", 111 | "http://223.247.255.207:24714", 112 | "http://223.241.117.179:8010", 113 | "http://222.186.12.102:57624"] 114 | 115 | def process_request(self,request,spider): 116 | # if not request.meta['proxies']: 117 | ip = random.choice(self.proxy_list) 118 | request.meta['proxy'] = ip 119 | 120 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | 12 | import pymysql 13 | from pymysql import connections 14 | from craw_all_hudong import settings 15 | 16 | class CrawAllHudongPipeline(object): 17 | def __init__(self): 18 | self.conn = pymysql.connect( 19 | host=settings.HOST_IP, 20 | port=settings.PORT, 21 | user=settings.USER, 22 | passwd=settings.PASSWD, 23 | db=settings.DB_NAME, 24 | charset='utf8mb4', 25 | use_unicode=True 26 | ) 27 | self.cursor = self.conn.cursor() 28 | 29 | def process_item(self, item, spider): 30 | # process info for actor 31 | title = str(item['title']).encode('utf-8') 32 | title_id = str(item['title_id']).encode('utf-8') 33 | abstract = 
str(item['abstract']).encode('utf-8') 34 | infobox = str(item['infobox']).encode('utf-8') 35 | subject = str(item['subject']).encode('utf-8') 36 | disambi = str(item['disambi']).encode('utf-8') 37 | redirect = str(item['redirect']).encode('utf-8') 38 | curLink = str(item['curLink']).encode('utf-8') 39 | interPic = str(item['interPic']).encode('utf-8') 40 | interLink = str(item['interLink']).encode('utf-8') 41 | exterLink = str(item['exterLink']).encode('utf-8') 42 | relateLemma = str(item['relateLemma']).encode('utf-8') 43 | all_text = str(item['all_text']).encode('utf-8') 44 | 45 | # self.cursor.execute("SELECT disambi FROM lemmas;") 46 | # disambi_list = self.cursor.fetchall() 47 | # if (disambi,) not in disambi_list : 48 | self.cursor.execute("SELECT MAX(title_id) FROM lemmas") 49 | result = self.cursor.fetchall()[0] 50 | if None in result: 51 | title_id = 1 52 | else: 53 | title_id = result[0] + 1 54 | sql = """ 55 | INSERT INTO lemmas(title, title_id, abstract, infobox, subject, disambi, redirect, curLink, interPic, interLink, exterLink, relateLemma, all_text ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 56 | """ 57 | try: 58 | # disambi_list = self.cursor.fetchall() 59 | # if (disambi, ) in disambi_list: 60 | # print ("result: ", disambi) 61 | self.cursor.execute(sql, (title, title_id, abstract, infobox, subject, disambi, redirect, curLink, interPic, interLink, exterLink, relateLemma, all_text )) 62 | self.conn.commit() 63 | # self.cursor.execute("SELECT disambi FROM lemmas" ) 64 | except Exception as e: 65 | print("#"*20, "\nAn error occurred while inserting into MySQL!\n") 66 | print("curLink: ", curLink, "\n") 67 | print(e, "\n", "#"*20) 68 | try: 69 | all_text = str('None').encode('utf-8') 70 | self.cursor.execute(sql, (title, title_id, abstract, infobox, subject, disambi, redirect, curLink, interPic, interLink, exterLink, relateLemma, all_text )) 71 | self.conn.commit() 72 | except Exception as f: 73 | print("Insert failed even after clearing all_text:", f) 74 | return item 75 | 76 | def close_spider(self, spider): 77 | self.conn.close() 78 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for craw_all_hudong project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'craw_all_hudong' 13 | 14 | SPIDER_MODULES = ['craw_all_hudong.spiders'] 15 | NEWSPIDER_MODULE = 'craw_all_hudong.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'craw_all_hudong (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | CONCURRENT_REQUESTS = 64 26 | DOWNLOAD_TIMEOUT=30 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | #import random 32 | #DOWNLOAD_DELAY = random.randint(0, 1) 33 | # The download delay setting will honor only one of: 34 | CONCURRENT_REQUESTS_PER_DOMAIN = 30 35 | CONCURRENT_REQUESTS_PER_IP = 30 36 | 37 | # Disable cookies (enabled by default) 38 | COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'craw_all_hudong.middlewares.hudongBaikeSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'craw_all_hudong.middlewares.hudongBaikeDownloaderMiddleware': 543, 59 | #} 60 | DOWNLOADER_MIDDLEWARES = { 61 | 'craw_all_hudong.middlewares.RandomUserAgent': 10, 62 | 'craw_all_hudong.middlewares.ProxyMiddleWare': 100, 63 | } 64 | # Enable or disable extensions 65 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 72 | #ITEM_PIPELINES = { 73 | # 'craw_all_hudong.pipelines.hudongBaikePipeline': 300, 74 | #} 75 | ITEM_PIPELINES = { 76 | 'craw_all_hudong.pipelines.CrawAllHudongPipeline': 300, 77 | } 78 | 79 | HOST_IP = 'localhost' 80 | PORT = 3306 81 | USER = 'root' 82 | PASSWD = 'root' 83 | DB_NAME = 'hudong_fenlei' 84 | 85 | # Enable and configure the AutoThrottle extension (disabled by default) 86 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 87 | #AUTOTHROTTLE_ENABLED = True 88 | # The initial download delay 89 | #AUTOTHROTTLE_START_DELAY = 5 90 | # The maximum download delay to be set in case of high latencies 91 | #AUTOTHROTTLE_MAX_DELAY = 60 92 | # The average number of requests Scrapy should be sending in parallel to 93 | # each remote server 94 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 95 | # Enable showing throttling stats for every response received: 96 | #AUTOTHROTTLE_DEBUG = False 97 | 
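# NOTE: HOST_IP, PORT, USER, PASSWD and DB_NAME above are imported directly by pipelines.py
# ("from craw_all_hudong import settings"), so they only need to be edited here; they are
# expected to match the `hudong_fenlei` database created in creat_mysql.md. A minimal,
# optional connection self-test (assumes pymysql is installed and MySQL runs locally):
#
#   python -c "import pymysql; pymysql.connect(host='localhost', port=3306, user='root', passwd='root', db='hudong_fenlei', charset='utf8mb4').close()"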
98 | # Enable and configure HTTP caching (disabled by default) 99 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 100 | #HTTPCACHE_ENABLED = True 101 | #HTTPCACHE_EXPIRATION_SECS = 0 102 | #HTTPCACHE_DIR = 'httpcache' 103 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 104 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 105 | 106 | COMMANDS_MODULE = 'craw_all_hudong.commands' 107 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from setuptools import setup, find_packages 5 | 6 | setup(name='scrapy-mymodule', 7 | entry_points={ 8 | 'scrapy.commands': [ 9 | 'crawlall=craw_all_hudong.commands:crawlall', 10 | ], 11 | }, 12 | ) 13 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/craw_all_hudong/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/creat_mysql.md: -------------------------------------------------------------------------------- 1 | #DROP DATABASE hudong_duplicate; 2 | 3 | CREATE DATABASE hudong_fenlei; 4 | 5 | USE hudong_fenlei; 6 | 7 | CREATE TABLE lemmas( title VARCHAR(100), title_id INT NOT NULL, abstract TEXT, infobox TEXT, subject VARCHAR(500), disambi VARCHAR(100), redirect VARCHAR(100), curLink TEXT, interPic TEXT, interLink TEXT, exterLink TEXT, relateLemma TEXT, all_text TEXT, PRIMARY KEY(title_id)); 8 | 9 | ALTER TABLE lemmas CONVERT TO CHARACTER SET utf8 COLLATE utf8_general_ci; 10 | 11 | ALTER table lemmas ADD INDEX title_index(title); 12 | 13 | #ALTER table lemmas ADD INDEX subject_index(subject); 14 | 15 | ALTER table lemmas ADD INDEX disambi_index(disambi); 16 | -------------------------------------------------------------------------------- /ie/craw/craw_all_hudong/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = craw_all_hudong.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = craw_all_hudong 12 | -------------------------------------------------------------------------------- /ie/craw/craw_without_spider/mysql/creat_sql.txt: -------------------------------------------------------------------------------- 1 | # Commands to creat mysql database and tables. This database includes actors and films. 
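# Example (a sketch, kept commented out): once the tables below have been populated, an
# actor's films can be listed by joining through the actor_to_movie relation; '<actor name>'
# is a placeholder value:
#
#   SELECT m.movie_chName
#   FROM actor a
#   JOIN actor_to_movie am ON am.actor_id = a.actor_id
#   JOIN movie m ON m.movie_id = am.movie_id
#   WHERE a.actor_chName = '<actor name>';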
2 | 3 | # 演员 : ID, 简介, 中文名,外文名,国籍,星座,出生地,出生日期,代表作品,主要成就,经纪公司; 4 | # actor: actor_id, actor_bio, actor_chName, actor_foreName, actor_nationality, actor_constellation, actor_birthPlace, actor_birthDay, actor_repWorks, actor_achiem, actor_brokerage; 5 | 6 | # 电影作品:ID,简介,中文名,外文名,出品时间,出品公司,导演,编剧,类型,主演,片长,上映时间,对白语言,主要成就; 7 | # movie: movie_id, movie_bio, movie_chName, movie_foreName, movie_prodTime, movie_prodCompany, movie_director, movie_screenwriter, movie_genre, movie_star, movie_length, movie_rekeaseTime, movie_language, movie_achiem; 8 | 9 | # 电影类型:爱情,喜剧,动作,剧情,科幻,恐怖,动画,惊悚,犯罪,冒险,其他; 10 | # genre: genre_id, genre_name 11 | 12 | # 演员->电影: 演员ID, 电影ID; 13 | # actor_to_movie: actor_id, movie_id; 14 | 15 | # 电影-> 类型: 电影ID, 类型ID 16 | # movie_to_genre: movie_id, genre_id 17 | 18 | CREATE DATABASE kg_movie; 19 | USE kg_movie; 20 | 21 | CREATE TABLE actor( actor_id INT NOT NULL, actor_bio TEXT, actor_chName VARCHAR(100), actor_foreName VARCHAR(100), actor_nationality VARCHAR(100), actor_constellation VARCHAR(100), actor_birthPlace VARCHAR(100), actor_birthDay VARCHAR(100), actor_repWorks VARCHAR(100), actor_achiem TEXT, actor_brokerage VARCHAR(100), PRIMARY KEY(actor_id) ); 22 | 23 | CREATE TABLE movie( movie_id INT NOT NULL, movie_bio TEXT, movie_chName VARCHAR(100), movie_foreName VARCHAR(100), movie_prodTime VARCHAR(100), movie_prodCompany VARCHAR(100), movie_director VARCHAR(100), movie_screenwriter VARCHAR(100), movie_genre VARCHAR(100), movie_star VARCHAR(100), movie_length VARCHAR(100), movie_rekeaseTime VARCHAR(100), movie_language VARCHAR(100), movie_achiem TEXT, PRIMARY KEY(movie_id) ); 24 | 25 | CREATE TABLE actor_to_movie( actor_movie_id INT NOT NULL, actor_id INT NOT NULL, movie_id INT NOT NULL, PRIMARY KEY(actor_movie_id) ); 26 | 27 | CREATE TABLE genre ( genre_id INT NOT NULL, genre_name VARCHAR(100), PRIMARY KEY(genre_id) ); 28 | # Set char Set 29 | ALTER TABLE actor CONVERT TO CHARACTER SET utf8 COLLATE utf8_general_ci; 30 | ALTER TABLE movie CONVERT TO CHARACTER SET utf8 COLLATE utf8_general_ci; 31 | ALTER TABLE genre CONVERT TO CHARACTER SET utf8 COLLATE utf8_general_ci; 32 | 33 | INSERT INTO genre (genre_id, genre_name) VALUES (0, '爱情'), (1, '喜剧'), (2, '动作'), (3, '剧情'), (4, '科幻'), (5, '恐怖'), (6, '动画'), (7, '惊悚'), (8, '犯罪'), (9, '冒险'), (10, '其他'); 34 | 35 | CREATE TABLE movie_to_genre( movie_genre_id INT NOT NULL, movie_id INT NOT NULL, genre_id INT NOT NULL, PRIMARY KEY(movie_genre_id) ); 36 | -------------------------------------------------------------------------------- /ie/craw/craw_without_spider/mysql/help_mysql.txt: -------------------------------------------------------------------------------- 1 | # 修改mysql 中默认字符集到utf8 2 | 3 | ALTER TABLE table_name CONVERT TO CHARACTER SET utf8 COLLATE utf8_general_ci; 4 | 5 | 查看自己的字符集 6 | 7 | SHOW FULL COLUMNS FROM table_name; 8 | 9 | 更改表中某一属性的类型:把actor_achiem 变为TEXT 10 | 11 | ALTER TABLE actor CHANGE actor_achiem actor_achiem TEXT; 12 | -------------------------------------------------------------------------------- /ie/craw/craw_without_spider/utils/basic_info.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | ''' 5 | 包含各个表的属性定义等和程序逻辑无关的部分 6 | ''' 7 | 8 | insert_actor_command = 'INSERT INTO actor (actor_id, actor_bio, actor_chName, actor_foreName, actor_nationality, actor_constellation, actor_birthPlace, actor_birthDay, actor_repWorks, actor_achiem, actor_brokerage ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ' 9 | insert_movie_command = 
'INSERT INTO movie (movie_id, movie_bio, movie_chName, movie_foreName, movie_prodTime, movie_prodCompany, movie_director, movie_screenwriter, movie_genre, movie_star, movie_length, movie_rekeaseTime, movie_language, movie_achiem ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s ) ' 10 | insert_actor_movie_command = 'INSERT INTO actor_to_movie (actor_movie_id, actor_id, movie_id ) VALUES (%s, %s, %s ) ' 11 | insert_movie_genre_command = 'INSERT INTO movie_to_genre (movie_genre_id, movie_id, genre_id ) VALUES (%s, %s, %s ) ' # id 是整数,pymysql不支持%i %d这种,都用%s 12 | 13 | search_actor_id = 'SELECT actor_id FROM actor WHERE actor_chName= "%s" ' 14 | search_movie_id = 'SELECT movie_id FROM movie WHERE movie_chName= "%s" ' 15 | 16 | get_largest_amid = 'SELECT max(actor_movie_id) FROM actor_to_movie ' 17 | get_largest_mgid = 'SELECT max(movie_genre_id) FROM movie_to_genre ' 18 | 19 | actor_attr = { 20 | u'id' : int, 21 | u'简介': None, 22 | u'中文名': None, 23 | u'外文名': None, 24 | u'国籍': None, 25 | u'星座': None, 26 | u'出生地': None, 27 | u'出生日期': None, 28 | u'代表作品': None, 29 | u'主要成就' : None, 30 | u'经纪公司': None 31 | } 32 | actor_info = [u'id', u'简介', u'中文名', u'外文名', u'国籍', u'星座', u'出生地', u'出生日期', u'代表作品', u'主要成就', u'经纪公司'] 33 | 34 | 35 | movie_attr = { 36 | u'id' : int, 37 | u'简介': None, 38 | u'中文名': None, 39 | u'外文名': None, 40 | u'出品时间': None, 41 | u'出品公司': None, 42 | u'导演': None, 43 | u'编剧': None, 44 | u'类型': None, 45 | u'主演' : None, 46 | u'片长': None, 47 | u'上映时间': None, 48 | u'对白语言': None, 49 | u'主要成就': None 50 | } 51 | movie_info = [u'id', u'简介', u'中文名', u'外文名', u'出品时间', u'出品公司', u'导演', u'编剧', u'类型', u'主演', u'片长', u'上映时间', u'对白语言', u'主要成就' ] 52 | 53 | movie_genre = { 54 | u'爱情': 0, 55 | u'喜剧': 1, 56 | u'动作': 2, 57 | u'剧情': 3, 58 | u'科幻': 4, 59 | u'恐怖': 5, 60 | u'动画': 6, 61 | u'惊悚': 7, 62 | u'犯罪': 8, 63 | u'冒险': 9, 64 | u'其他': 10 65 | } 66 | 67 | 68 | -------------------------------------------------------------------------------- /ie/craw/hudong_baike/hudong_baike/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/hudong_baike/hudong_baike/__init__.py -------------------------------------------------------------------------------- /ie/craw/hudong_baike/hudong_baike/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class HudongBaikeItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # Actor 15 | actor_id = scrapy.Field() 16 | actor_bio = scrapy.Field() 17 | actor_chName = scrapy.Field() 18 | actor_foreName = scrapy.Field() 19 | actor_nationality = scrapy.Field() 20 | actor_constellation = scrapy.Field() 21 | actor_birthPlace = scrapy.Field() 22 | actor_birthDay = scrapy.Field() 23 | actor_repWorks = scrapy.Field() 24 | actor_achiem = scrapy.Field() 25 | actor_brokerage = scrapy.Field() 26 | 27 | # movie 28 | 29 | movie_id = scrapy.Field() 30 | movie_bio = scrapy.Field() 31 | movie_chName = scrapy.Field() 32 | movie_foreName = scrapy.Field() 33 | movie_prodTime = scrapy.Field() 34 | movie_prodCompany = scrapy.Field() 35 | movie_director = scrapy.Field() 36 | movie_screenwriter = scrapy.Field() 37 | movie_genre = scrapy.Field() 38 | movie_star = 
scrapy.Field() 39 | movie_length = scrapy.Field() 40 | movie_rekeaseTime = scrapy.Field() 41 | movie_language = scrapy.Field() 42 | movie_achiem = scrapy.Field() 43 | -------------------------------------------------------------------------------- /ie/craw/hudong_baike/hudong_baike/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import random 10 | 11 | class WeixinSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | class RandomUserAgent: 59 | def __init__(self, agents): 60 | self.agents =[ 61 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 62 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0. 
30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 63 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 64 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 65 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 66 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 67 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 68 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5" 69 | ] 70 | 71 | @classmethod 72 | def from_crawler(cls,crawler): 73 | # 获取settings的USER_AGENT列表并返回 74 | return cls(crawler.settings.getlist('USER_AGENTS')) 75 | def process_request(self, request, spider): 76 | # 随机设置Request报头header的User-Agent 77 | request.headers.setdefault('User-Agent', random.choice(self.agents)) 78 | 79 | # 添加代理 80 | 81 | class ProxyMiddleWare(object): 82 | proxy_list=[ 83 | "http://58.87.89.234:3128", 84 | "http://139.201.202.140:53281", 85 | "http://27.37.123.30:9000", 86 | "http://218.67.82.146:36709", 87 | "http://222.222.169.60:53281", 88 | "http://120.33.247.233:46884", 89 | "http://114.215.18.7:3128", 90 | "http://112.74.94.142:3128", 91 | "http://122.72.18.34:80", 92 | "http://36.33.25.123:808", 93 | "http://123.138.89.133:9999", 94 | "http://111.231.192.61:8080", 95 | "http://59.41.202.228:53281", 96 | "http://222.241.14.187:8888", 97 | "http://61.155.164.106:3128", 98 | "http://27.40.156.43:61234", 99 | "http://14.29.84.50:8080", 100 | "http://116.25.100.62:9797", 101 | "http://58.21.183.144:80", 102 | "http://14.221.166.205:9000", 103 | "http://115.231.50.10:53281", 104 | "http://120.34.205.40:808", 105 | "http://123.139.56.238:9999", 106 | "http://113.116.170.232:9000", 107 | "http://116.17.236.36:808", 108 | "http://114.232.163.73:34837", 109 | "http://171.35.103.37:808", 110 | "http://27.46.51.232:9797", 111 | "http://223.247.255.207:24714", 112 | "http://223.241.117.179:8010", 113 | "http://222.186.12.102:57624"] 114 | 115 | def process_request(self,request,spider): 116 | # if not request.meta['proxies']: 117 | ip = random.choice(self.proxy_list) 118 | request.meta['proxy'] = ip 119 | 120 | -------------------------------------------------------------------------------- /ie/craw/hudong_baike/hudong_baike/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import pymysql 12 | from hudong_baike import settings 13 | 14 | 15 | class HudongBaikePipeline(object): 16 | def __init__(self): 17 | self.conn = pymysql.connect( 18 | host=settings.HOST_IP, 19 | # port=settings.PORT, 20 | user=settings.USER, 21 | passwd=settings.PASSWD, 22 | db=settings.DB_NAME, 23 | charset='utf8mb4', 24 | use_unicode=True 25 | ) 26 | self.cursor = self.conn.cursor() 27 | 28 | def process_item(self, item, spider): 29 | # process info for actor 30 | actor_chName = str(item['actor_chName']).encode('utf-8') 31 | actor_foreName = str(item['actor_foreName']).encode('utf-8') 32 | movie_chName = 
str(item['movie_chName']).encode('utf-8') 33 | movie_foreName = str(item['movie_foreName']).encode('utf-8') 34 | 35 | if (item['actor_chName'] != None or item['actor_foreName'] != None) and item['movie_chName'] == None: 36 | actor_bio = str(item['actor_bio']).encode('utf-8') 37 | actor_nationality = str(item['actor_nationality']).encode('utf-8') 38 | actor_constellation = str(item['actor_constellation']).encode('utf-8') 39 | actor_birthPlace = str(item['actor_birthPlace']).encode('utf-8') 40 | actor_birthDay = str(item['actor_birthDay']).encode('utf-8') 41 | actor_repWorks = str(item['actor_repWorks']).encode('utf-8') 42 | actor_achiem = str(item['actor_achiem']).encode('utf-8') 43 | actor_brokerage = str(item['actor_brokerage']).encode('utf-8') 44 | 45 | self.cursor.execute("SELECT actor_chName FROM actor;") 46 | actorList = self.cursor.fetchall() 47 | if (actor_chName,) not in actorList: 48 | # get the nums of actor_id in table actor 49 | self.cursor.execute("SELECT MAX(actor_id) FROM actor") 50 | result = self.cursor.fetchall()[0] 51 | if None in result: 52 | actor_id = 1 53 | else: 54 | actor_id = result[0] + 1 55 | sql = """ 56 | INSERT INTO actor(actor_id, actor_bio, actor_chName, actor_foreName, actor_nationality, actor_constellation, actor_birthPlace, actor_birthDay, actor_repWorks, actor_achiem, actor_brokerage ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 57 | """ 58 | self.cursor.execute(sql, ( 59 | actor_id, actor_bio, actor_chName, actor_foreName, actor_nationality, actor_constellation, 60 | actor_birthPlace, actor_birthDay, actor_repWorks, actor_achiem, actor_brokerage)) 61 | self.conn.commit() 62 | else: 63 | print("#" * 20, "Got a duplict actor!!", actor_chName) 64 | elif (item['movie_chName'] != None or item['movie_foreName'] != None) and item['actor_chName'] == None: 65 | movie_bio = str(item['movie_bio']).encode('utf-8') 66 | movie_prodTime = str(item['movie_prodTime']).encode('utf-8') 67 | movie_prodCompany = str(item['movie_prodCompany']).encode('utf-8') 68 | movie_director = str(item['movie_director']).encode('utf-8') 69 | movie_screenwriter = str(item['movie_screenwriter']).encode('utf-8') 70 | movie_genre = str(item['movie_genre']).encode('utf-8') 71 | movie_star = str(item['movie_star']).encode('utf-8') 72 | movie_length = str(item['movie_length']).encode('utf-8') 73 | movie_rekeaseTime = str(item['movie_rekeaseTime']).encode('utf-8') 74 | movie_language = str(item['movie_language']).encode('utf-8') 75 | movie_achiem = str(item['movie_achiem']).encode('utf-8') 76 | 77 | self.cursor.execute("SELECT movie_chName FROM movie;") 78 | movieList = self.cursor.fetchall() 79 | if (movie_chName,) not in movieList: 80 | self.cursor.execute("SELECT MAX(movie_id) FROM movie") 81 | result = self.cursor.fetchall()[0] 82 | if None in result: 83 | movie_id = 1 84 | else: 85 | movie_id = result[0] + 1 86 | sql = """ 87 | INSERT INTO movie( movie_id, movie_bio, movie_chName, movie_foreName, movie_prodTime, movie_prodCompany, movie_director, movie_screenwriter, movie_genre, movie_star, movie_length, movie_rekeaseTime, movie_language, movie_achiem ) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 88 | """ 89 | self.cursor.execute(sql, ( 90 | movie_id, movie_bio, movie_chName, movie_foreName, movie_prodTime, movie_prodCompany, movie_director, 91 | movie_screenwriter, movie_genre, movie_star, movie_length, movie_rekeaseTime, movie_language, 92 | movie_achiem)) 93 | self.conn.commit() 94 | else: 95 | print("Got a duplict movie!!", movie_chName) 96 | else: 97 | 
print("Skip this page because wrong category!! ") 98 | return item 99 | 100 | def close_spider(self, spider): 101 | self.conn.close() 102 | -------------------------------------------------------------------------------- /ie/craw/hudong_baike/hudong_baike/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for hudong_baike project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'hudong_baike' 13 | 14 | SPIDER_MODULES = ['hudong_baike.spiders'] 15 | NEWSPIDER_MODULE = 'hudong_baike.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'hudong_baike (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | import random 32 | DOWNLOAD_DELAY = random.randint(0, 1) 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | #CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | #COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'hudong_baike.middlewares.HudongBaikeSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'hudong_baike.middlewares.HudongBaikeDownloaderMiddleware': 543, 59 | #} 60 | DOWNLOADER_MIDDLEWARES = { 61 | 'hudong_baike.middlewares.RandomUserAgent': 10, 62 | 'hudong_baike.middlewares.ProxyMiddleWare': 100, 63 | } 64 | # Enable or disable extensions 65 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 72 | #ITEM_PIPELINES = { 73 | # 'hudong_baike.pipelines.HudongBaikePipeline': 300, 74 | #} 75 | ITEM_PIPELINES = { 76 | 'hudong_baike.pipelines.HudongBaikePipeline': 300, 77 | } 78 | 79 | HOST_IP = 'localhost' 80 | PORT = '3306' 81 | USER = 'root' 82 | PASSWD = 'root' 83 | DB_NAME = 'kg_movie' 84 | 85 | # Enable and configure the AutoThrottle extension (disabled by default) 86 | # See 
https://doc.scrapy.org/en/latest/topics/autothrottle.html 87 | #AUTOTHROTTLE_ENABLED = True 88 | # The initial download delay 89 | #AUTOTHROTTLE_START_DELAY = 5 90 | # The maximum download delay to be set in case of high latencies 91 | #AUTOTHROTTLE_MAX_DELAY = 60 92 | # The average number of requests Scrapy should be sending in parallel to 93 | # each remote server 94 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 95 | # Enable showing throttling stats for every response received: 96 | #AUTOTHROTTLE_DEBUG = False 97 | 98 | # Enable and configure HTTP caching (disabled by default) 99 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 100 | #HTTPCACHE_ENABLED = True 101 | #HTTPCACHE_EXPIRATION_SECS = 0 102 | #HTTPCACHE_DIR = 'httpcache' 103 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 104 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 105 | -------------------------------------------------------------------------------- /ie/craw/hudong_baike/hudong_baike/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ie/craw/hudong_baike/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = hudong_baike.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = hudong_baike 12 | -------------------------------------------------------------------------------- /ie/craw/news_spider/news/__init__: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/news_spider/news/__init__ -------------------------------------------------------------------------------- /ie/craw/news_spider/news_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/news_spider/news_spider/__init__.py -------------------------------------------------------------------------------- /ie/craw/news_spider/news_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class NewsSpiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() # 标题 15 | descr = scrapy.Field() # 简述 16 | auth = scrapy.Field() # 作者 17 | post_time = scrapy.Field() # 发布时间 18 | main_news = scrapy.Field() # 新闻内容 19 | -------------------------------------------------------------------------------- /ie/craw/news_spider/news_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 
| # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class NewsSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /ie/craw/news_spider/news_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class NewsSpiderPipeline(object): 10 | 11 | def process_item(self, item, spider): 12 | self.news = open("./news/" + item["title"].strip()+ item["post_time"] + ".txt", "w") 13 | self.news.write(item["title"].encode("utf-8") + "\n") 14 | self.news.write(item["auth"].encode("utf-8") + "\n") 15 | self.news.write(item["post_time"].encode("utf-8") + "\n") 16 | self.news.write(item["descr"].encode("utf-8") + "\n") 17 | self.news.write(item["main_news"].encode("utf-8") + "\n") 18 | 19 | return item 20 | 21 | 22 | def spider_closed(self): 23 | self.news.close() 24 | -------------------------------------------------------------------------------- /ie/craw/news_spider/news_spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for news_spider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'news_spider' 13 | 14 | SPIDER_MODULES = ['news_spider.spiders'] 15 | NEWSPIDER_MODULE = 'news_spider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'news_spider (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = True 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'news_spider.middlewares.NewsSpiderSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'news_spider.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'news_spider.pipelines.NewsSpiderPipeline': 300, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 
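Note on the NewsSpiderPipeline above: it reassigns self.news on every item without closing the previous handle, writes .encode("utf-8") byte strings to a text-mode file (which only works under Python 2), and names its cleanup hook spider_closed rather than the close_spider method Scrapy actually calls on item pipelines. Below is a minimal Python 3 sketch of the same per-item file dump, not the original code; the field names come from items.py, while the filename sanitising is an added assumption.

# Sketch only: a Python 3 variant of NewsSpiderPipeline (assumptions noted inline).
import os
import re


class NewsSpiderPipeline(object):
    def open_spider(self, spider):
        # Make sure the output directory exists before the first item arrives.
        os.makedirs("./news", exist_ok=True)

    def process_item(self, item, spider):
        # Strip characters that are not valid in file names (added assumption,
        # the original concatenates title + post_time verbatim).
        safe_name = re.sub(r'[\\/:*?"<>|\s]+', "_",
                           item["title"].strip() + item["post_time"])
        path = os.path.join("./news", safe_name + ".txt")
        # Text mode with an explicit encoding; "with" closes the handle per item.
        with open(path, "w", encoding="utf-8") as news:
            for field in ("title", "auth", "post_time", "descr", "main_news"):
                news.write(item.get(field, "") + "\n")
        return item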
-------------------------------------------------------------------------------- /ie/craw/news_spider/news_spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ie/craw/news_spider/news_spider/spiders/huxiu_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import scrapy 5 | from news_spider.items import NewsSpiderItem 6 | 7 | class HuxiuSpider(scrapy.Spider): 8 | name = "huxiu" 9 | allowed_domains = ["huxiu.com"] 10 | start_urls = ["http://www.huxiu.com"] 11 | 12 | 13 | def parse(self, response): 14 | 15 | print "Start............................" 16 | self.desc = '' 17 | for sel in response.xpath('//div[@class="mod-b mod-art clearfix "]'): 18 | item = NewsSpiderItem() 19 | item['title'] = sel.xpath('./div/h2/a[@class="transition msubstr-row2"]/text()')[0].extract() 20 | self.desc = sel.xpath('./div[@class="mob-ctt index-article-list-yh"]/div[@class="mob-sub"]/text()')[0].extract() 21 | link = sel.xpath('./div/h2/a/@href')[0].extract() 22 | url = response.urljoin(link) 23 | 24 | yield scrapy.Request(url, callback=self.parse_article ) 25 | 26 | def parse_article(self, response): 27 | detail = response.xpath('//div[@class="article-wrap"]') 28 | item = NewsSpiderItem() 29 | item['title'] = detail.xpath('./h1[@class="t-h1"]/text()')[0].extract() 30 | item['auth'] = u"作者:" + detail.xpath('./div/span[@class="author-name"]/a/text()')[0].extract() 31 | item['post_time'] = u"发表时间:" + detail.xpath('./div/div[@class="column-link-box"]/span[@class="article-time pull-left"]/text()')[0].extract() 32 | item['descr'] = u"简述:" + self.desc + "\n" # 简述存在错误 33 | all_pars = detail.xpath('//div[@class="article-content-wrap"]//p/text()').extract() 34 | 35 | content = '' 36 | for par in all_pars: 37 | content = content + par + "\n" 38 | 39 | desc = item.get('main_news') 40 | if desc == None: 41 | item['main_news'] = content 42 | else: 43 | item['main_news'] = desc + content 44 | 45 | yield item 46 | 47 | 48 | -------------------------------------------------------------------------------- /ie/craw/news_spider/readme.md: -------------------------------------------------------------------------------- 1 | 虎嗅网爬虫 2 | 3 | 对于加载更多的内容部分爬取还没有完成 4 | -------------------------------------------------------------------------------- /ie/craw/news_spider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = news_spider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = news_spider 12 | -------------------------------------------------------------------------------- /ie/craw/weixin_spider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = weixin_spider.settings 8 | 9 | [deploy] 10 | #url = 
http://localhost:6800/ 11 | project = weixin_spider 12 | -------------------------------------------------------------------------------- /ie/craw/weixin_spider/weixin_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/craw/weixin_spider/weixin_spider/__init__.py -------------------------------------------------------------------------------- /ie/craw/weixin_spider/weixin_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class WeixinSpiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() # 文章标题 15 | publishTime = scrapy.Field() # 发布时间 16 | publicName = scrapy.Field() # 公众号名字 17 | article = scrapy.Field() # 文章内容 18 | cite = scrapy.Field() # 文章引用来源 19 | -------------------------------------------------------------------------------- /ie/craw/weixin_spider/weixin_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import random 10 | 11 | class WeixinSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | class RandomUserAgent: 59 | def __init__(self, agents): 60 | self.agents =[ 61 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 62 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0. 30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 63 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 64 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 65 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 66 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 67 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 68 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5" 69 | ] 70 | 71 | @classmethod 72 | def from_crawler(cls,crawler): 73 | # 获取settings的USER_AGENT列表并返回 74 | return cls(crawler.settings.getlist('USER_AGENTS')) 75 | def process_request(self, request, spider): 76 | # 随机设置Request报头header的User-Agent 77 | request.headers.setdefault('User-Agent', random.choice(self.agents)) 78 | 79 | # 添加代理 80 | 81 | class ProxyMiddleWare(object): 82 | proxy_list=[ 83 | "http://58.87.89.234:3128", 84 | "http://139.201.202.140:53281", 85 | "http://27.37.123.30:9000", 86 | "http://218.67.82.146:36709", 87 | "http://222.222.169.60:53281", 88 | "http://120.33.247.233:46884", 89 | "http://114.215.18.7:3128", 90 | "http://112.74.94.142:3128", 91 | "http://122.72.18.34:80", 92 | "http://36.33.25.123:808", 93 | "http://123.138.89.133:9999", 94 | "http://111.231.192.61:8080", 95 | "http://59.41.202.228:53281", 96 | "http://222.241.14.187:8888", 97 | "http://61.155.164.106:3128", 98 | "http://27.40.156.43:61234", 99 | "http://14.29.84.50:8080", 100 | "http://116.25.100.62:9797", 101 | "http://58.21.183.144:80", 102 | "http://14.221.166.205:9000", 103 | "http://115.231.50.10:53281", 104 | "http://120.34.205.40:808", 105 | "http://123.139.56.238:9999", 106 | "http://113.116.170.232:9000", 107 | "http://116.17.236.36:808", 108 | "http://114.232.163.73:34837", 109 | "http://171.35.103.37:808", 110 | "http://27.46.51.232:9797", 111 | "http://223.247.255.207:24714", 112 | "http://223.241.117.179:8010", 113 | "http://222.186.12.102:57624"] 114 | 115 | def process_request(self,request,spider): 116 | # if not request.meta['proxies']: 117 | ip = random.choice(self.proxy_list) 118 | request.meta['proxy'] = ip 119 | 120 | -------------------------------------------------------------------------------- /ie/craw/weixin_spider/weixin_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymysql 9 | from pymysql import connections 10 | from weixin_spider import settings 11 | import sys 12 | reload(sys) 13 | sys.setdefaultencoding('utf-8') 14 | 15 | class WeixinSpiderPipeline(object): 16 | def 
__init__(self): 17 | self.conn = pymysql.connect( 18 | host=settings.HOST_IP, 19 | # port=settings.PORT, 20 | user=settings.USER, 21 | passwd=settings.PASSWD, 22 | db=settings.DB_NAME, 23 | charset='utf8mb4', 24 | use_unicode=True 25 | ) 26 | self.cursor = self.conn.cursor() 27 | 28 | def process_item(self, item, spider): 29 | title = str(item['title']).decode('utf-8') 30 | publishTime = str(item['publishTime']).decode('utf-8') 31 | article = str(item['article']).decode('utf-8') 32 | publicName = str(item['publicName']).decode('utf-8') 33 | cite = str(item['cite']).decode('utf-8') 34 | 35 | # 查询数据库,获取当前存在的文章标题,防止重复存入,但查表浪费时间 36 | self.cursor.execute("SELECT title FROM weixin_xiaoshuo;") 37 | titleList = self.cursor.fetchall() 38 | titleStr = ''.join(map(str, titleList)) 39 | 40 | self.cursor.execute("SELECT publicName FROM weixin_xiaoshuo;") 41 | nameList = self.cursor.fetchall() 42 | nameStr = ''.join(map(str, nameList)) 43 | 44 | if titleStr.find(title) == -1 and nameStr.find(publicName) == -1: 45 | # 执行SQL插入语句 46 | sql = """ 47 | INSERT INTO weixin_xiaoshuo( title, publishTime, article, publicName, cite) VALUES (%s, %s, %s, %s, %s) 48 | """ 49 | self.cursor.execute(sql, (title, publishTime, article, publicName, cite)) 50 | self.conn.commit() 51 | else: 52 | print "该文章已经存在于数据库中:", title.encode('utf-8') 53 | return item 54 | 55 | def close_spider(self, spider): 56 | self.conn.close() 57 | -------------------------------------------------------------------------------- /ie/craw/weixin_spider/weixin_spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for weixin_spider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'weixin_spider' 13 | 14 | SPIDER_MODULES = ['weixin_spider.spiders'] 15 | NEWSPIDER_MODULE = 'weixin_spider.spiders' 16 | 17 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 18 | 19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 20 | #USER_AGENT = 'weixin_spider (+http://www.yourdomain.com)' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | import random 33 | DOWNLOAD_DELAY = random.randint(2, 3) 34 | # The download delay setting will honor only one of: 35 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 36 | #CONCURRENT_REQUESTS_PER_IP = 16 37 | 38 | # Disable cookies (enabled by default) 39 | #COOKIES_ENABLED = False 40 | 41 | # Disable Telnet Console (enabled by default) 42 | #TELNETCONSOLE_ENABLED = False 43 | 44 | # Override the default request headers: 45 | #DEFAULT_REQUEST_HEADERS = { 46 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 47 | # 'Accept-Language': 'en', 48 | #} 49 | 50 | # Enable or disable spider middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 52 | #SPIDER_MIDDLEWARES = { 53 | # 'weixin_spider.middlewares.WeixinSpiderSpiderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable downloader middlewares 57 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 58 | DOWNLOADER_MIDDLEWARES = { 59 | 'weixin_spider.middlewares.RandomUserAgent': 10, 60 | 'weixin_spider.middlewares.ProxyMiddleWare': 100, 61 | } 62 | 63 | # Enable or disable extensions 64 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 65 | #EXTENSIONS = { 66 | # 'scrapy.extensions.telnet.TelnetConsole': None, 67 | #} 68 | 69 | # Configure item pipelines 70 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 71 | ITEM_PIPELINES = { 72 | 'weixin_spider.pipelines.WeixinSpiderPipeline': 300, 73 | } 74 | 75 | HOST_IP = 'localhost' 76 | PORT = '3306' 77 | USER = 'root' 78 | PASSWD = 'nlp' 79 | DB_NAME = 'weixin_xiaoshuo' 80 | 81 | # Enable and configure the AutoThrottle extension (disabled by default) 82 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 83 | #AUTOTHROTTLE_ENABLED = True 84 | # The initial download delay 85 | #AUTOTHROTTLE_START_DELAY = 5 86 | # The maximum download delay to be set in case of high latencies 87 | #AUTOTHROTTLE_MAX_DELAY = 60 88 | # The average number of requests Scrapy should be sending in parallel to 89 | # each remote server 90 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 91 | # Enable showing throttling stats for every response received: 92 | #AUTOTHROTTLE_DEBUG = False 93 | 94 | # Enable and configure HTTP caching (disabled by default) 95 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 96 | 
#HTTPCACHE_ENABLED = True 97 | #HTTPCACHE_EXPIRATION_SECS = 0 98 | #HTTPCACHE_DIR = 'httpcache' 99 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 100 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 101 | -------------------------------------------------------------------------------- /ie/craw/weixin_spider/weixin_spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ie/deepdive/db.url: -------------------------------------------------------------------------------- 1 | postgresql://localhost:5432/movie 2 | -------------------------------------------------------------------------------- /ie/deepdive/deepdive.conf: -------------------------------------------------------------------------------- 1 | deepdive.calibration.holdout_fraction:0.25 2 | deepdive.sampler.sampler_args: "-l 1000 -s 1 -i 1000 --alpha 0.01 --sample_evidence" 3 | -------------------------------------------------------------------------------- /ie/deepdive/input/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/deepdive/input/__init__.py -------------------------------------------------------------------------------- /ie/deepdive/start_posql.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | /etc/init.d/postgresql start 3 | -------------------------------------------------------------------------------- /ie/deepdive/udf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/deepdive/udf/__init__.py -------------------------------------------------------------------------------- /ie/deepdive/udf/baidu_baike/baidu_baike/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/deepdive/udf/baidu_baike/baidu_baike/__init__.py -------------------------------------------------------------------------------- /ie/deepdive/udf/baidu_baike/baidu_baike/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BaiduBaikeItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | article_id = scrapy.Field() 15 | articles = scrapy.Field() 16 | -------------------------------------------------------------------------------- /ie/deepdive/udf/baidu_baike/baidu_baike/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import random 10 | 11 | class WeixinSpiderSpiderMiddleware(object): 12 | # 
Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | class RandomUserAgent: 59 | def __init__(self, agents): 60 | self.agents =[ 61 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 62 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0. 
30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 63 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 64 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 65 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 66 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 67 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 68 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5" 69 | ] 70 | 71 | @classmethod 72 | def from_crawler(cls,crawler): 73 | # 获取settings的USER_AGENT列表并返回 74 | return cls(crawler.settings.getlist('USER_AGENTS')) 75 | def process_request(self, request, spider): 76 | # 随机设置Request报头header的User-Agent 77 | request.headers.setdefault('User-Agent', random.choice(self.agents)) 78 | 79 | # 添加代理 80 | 81 | class ProxyMiddleWare(object): 82 | proxy_list=[ 83 | "http://58.87.89.234:3128", 84 | "http://139.201.202.140:53281", 85 | "http://27.37.123.30:9000", 86 | "http://218.67.82.146:36709", 87 | "http://222.222.169.60:53281", 88 | "http://120.33.247.233:46884", 89 | "http://114.215.18.7:3128", 90 | "http://112.74.94.142:3128", 91 | "http://122.72.18.34:80", 92 | "http://36.33.25.123:808", 93 | "http://123.138.89.133:9999", 94 | "http://111.231.192.61:8080", 95 | "http://59.41.202.228:53281", 96 | "http://222.241.14.187:8888", 97 | "http://61.155.164.106:3128", 98 | "http://27.40.156.43:61234", 99 | "http://14.29.84.50:8080", 100 | "http://116.25.100.62:9797", 101 | "http://58.21.183.144:80", 102 | "http://14.221.166.205:9000", 103 | "http://115.231.50.10:53281", 104 | "http://120.34.205.40:808", 105 | "http://123.139.56.238:9999", 106 | "http://113.116.170.232:9000", 107 | "http://116.17.236.36:808", 108 | "http://114.232.163.73:34837", 109 | "http://171.35.103.37:808", 110 | "http://27.46.51.232:9797", 111 | "http://223.247.255.207:24714", 112 | "http://223.241.117.179:8010", 113 | "http://222.186.12.102:57624"] 114 | 115 | def process_request(self,request,spider): 116 | # if not request.meta['proxies']: 117 | ip = random.choice(self.proxy_list) 118 | request.meta['proxy'] = ip 119 | 120 | -------------------------------------------------------------------------------- /ie/deepdive/udf/baidu_baike/baidu_baike/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import pymysql 12 | from pymysql import connections 13 | from baidu_baike import settings 14 | 15 | 16 | class BaiduBaikePipeline(object): 17 | def __init__(self): 18 | self.article_file = open("articles.txt", "a+",encoding='utf-8') 19 | 20 | def process_item(self, item, spider): 21 | # process info for actor 22 | articles = bytes.decode(str(item['articles']).encode('utf-8')).replace("\n", " ") 23 | article_id = bytes.decode(str(item['article_id']).encode('utf-8')) 24 | 25 | self.article_file.write(article_id + "," + articles + "\n") 26 | 27 | def close_spider(self, spider): 28 | self.article_file.close() 29 | 
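In BaiduBaikePipeline above, bytes.decode(str(...).encode('utf-8')) is a round trip that returns the same string under Python 3, and any commas inside the article body are written with the same delimiter that separates article_id (udf/trans.py, later in this repo, compensates by rejoining everything after the first comma). The following is only a simplified sketch of an equivalent process_item, not the original method; self.article_file is assumed to be the handle opened in __init__ as in the original class.

# Sketch only: a simplified BaiduBaikePipeline.process_item.
def process_item(self, item, spider):
    # str() already yields text in Python 3; no encode/decode round trip needed.
    articles = str(item['articles']).replace("\n", " ")
    article_id = str(item['article_id'])
    # article_id sits before the first comma; any further commas belong to the
    # article body and are handled downstream (see udf/trans.py).
    self.article_file.write(article_id + "," + articles + "\n")
    return item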
-------------------------------------------------------------------------------- /ie/deepdive/udf/baidu_baike/baidu_baike/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for baidu_baike project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'baidu_baike' 13 | 14 | SPIDER_MODULES = ['baidu_baike.spiders'] 15 | NEWSPIDER_MODULE = 'baidu_baike.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'baidu_baike (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | #import random 32 | #DOWNLOAD_DELAY = random.randint(0, 1) 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | #CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | #COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'baidu_baike.middlewares.BaiduBaikeSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'baidu_baike.middlewares.BaiduBaikeDownloaderMiddleware': 543, 59 | #} 60 | DOWNLOADER_MIDDLEWARES = { 61 | 'baidu_baike.middlewares.RandomUserAgent': 10, 62 | 'baidu_baike.middlewares.ProxyMiddleWare': 100, 63 | } 64 | # Enable or disable extensions 65 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 72 | #ITEM_PIPELINES = { 73 | # 'baidu_baike.pipelines.BaiduBaikePipeline': 300, 74 | #} 75 | ITEM_PIPELINES = { 76 | 'baidu_baike.pipelines.BaiduBaikePipeline': 300, 77 | } 78 | 79 | # Enable and configure the AutoThrottle extension (disabled by default) 80 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 81 | #AUTOTHROTTLE_ENABLED = True 82 | # The initial download delay 83 | #AUTOTHROTTLE_START_DELAY = 5 84 | # The maximum download delay to be set in case of high latencies 85 | #AUTOTHROTTLE_MAX_DELAY = 60 86 | # The average number of 
requests Scrapy should be sending in parallel to 87 | # each remote server 88 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 89 | # Enable showing throttling stats for every response received: 90 | #AUTOTHROTTLE_DEBUG = False 91 | 92 | # Enable and configure HTTP caching (disabled by default) 93 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 94 | #HTTPCACHE_ENABLED = True 95 | #HTTPCACHE_EXPIRATION_SECS = 0 96 | #HTTPCACHE_DIR = 'httpcache' 97 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 98 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 99 | -------------------------------------------------------------------------------- /ie/deepdive/udf/baidu_baike/baidu_baike/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ie/deepdive/udf/baidu_baike/baidu_baike/spiders/baidu_baike.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | 9 | from baidu_baike.items import BaiduBaikeItem 10 | import scrapy 11 | from scrapy.http import Request 12 | from bs4 import BeautifulSoup 13 | import re 14 | import urllib 15 | 16 | class BaiduBaikeSpider(scrapy.Spider, object): 17 | name = 'baidu' 18 | allowed_domains = ["baike.baidu.com"] 19 | start_urls = ['https://baike.baidu.com/item/%E5%91%A8%E6%98%9F%E9%A9%B0/169917?fr=aladdin'] 20 | global article_id 21 | article_id = 0 22 | # start_urls = ['https://baike.baidu.com/item/%E4%B8%83%E5%B0%8F%E7%A6%8F'] 23 | 24 | def _get_from_findall(self, tag_list): 25 | result = [] 26 | 27 | for slist in tag_list: 28 | tmp = slist.get_text() 29 | result.append(tmp) 30 | return result 31 | 32 | def parse(self, response): 33 | global article_id 34 | page_category = response.xpath("//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract() 35 | page_category = [l.strip() for l in page_category] 36 | item = BaiduBaikeItem() 37 | 38 | item['article_id'] = article_id 39 | item['articles'] = '' 40 | 41 | if u'演员' in page_category or u'电影' in page_category: 42 | print("Get a actor/movie page") 43 | soup = BeautifulSoup(response.text, 'lxml') 44 | root_node = soup.find("div", class_ = "main_tab main_tab-defaultTab curTab") 45 | 46 | para_nodes = soup.find_all("div", class_="para") 47 | basic_item = self._get_from_findall(para_nodes) 48 | article_content = ' '.join(basic_item) 49 | article_content = article_content.replace("\n", " ") 50 | item['articles'] = str(article_content) 51 | article_id += 1 52 | yield item 53 | if article_id % 50 == 0: 54 | print("The nums of total articles up to: {}".format(article_id)) 55 | 56 | 57 | soup = BeautifulSoup(response.text, 'lxml') 58 | links = soup.find_all('a', href=re.compile(r"/item/")) 59 | for link in links: 60 | new_url = link["href"] 61 | new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url) 62 | yield scrapy.Request(new_full_url, callback=self.parse) 63 | -------------------------------------------------------------------------------- /ie/deepdive/udf/baidu_baike/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = baidu_baike.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = baidu_baike 12 | -------------------------------------------------------------------------------- /ie/deepdive/udf/extract_play_features.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from deepdive import * 5 | import ddlib 6 | 7 | @tsv_extractor 8 | @returns(lambda 9 | p1_id = "text", 10 | p2_id = "text", 11 | feature = "text", 12 | :[]) 13 | def extract( 14 | p1_id = "text", 15 | p2_id = "text", 16 | p1_begin_index = "int", 17 | p1_end_index = "int", 18 | p2_begin_index = "int", 19 | p2_end_index = "int", 20 | doc_id = "text", 21 | sent_index = "int", 22 | tokens = "text[]", 23 | lemmas = "text[]", 24 | pos_tags = "text[]", 25 | ner_tags = "text[]", 26 | dep_types = "text[]", 27 | dep_parents = "int[]", 28 | ): 29 | """ 30 | Uses DDLIB to generate features for the spouse relation. 31 | """ 32 | # Create a DDLIB sentence object, which is just a list of DDLIB Word objects 33 | sent = [] 34 | for i,t in enumerate(tokens): 35 | sent.append(ddlib.Word( 36 | begin_char_offset=None, 37 | end_char_offset=None, 38 | word=t, 39 | lemma=lemmas[i], 40 | pos=pos_tags[i], 41 | ner=ner_tags[i], 42 | dep_par=dep_parents[i] - 1, # Note that as stored from CoreNLP 0 is ROOT, but for DDLIB -1 is ROOT 43 | dep_label=dep_types[i])) 44 | 45 | # Create DDLIB Spans for the two person mentions 46 | p1_span = ddlib.Span(begin_word_id=p1_begin_index, length=(p1_end_index-p1_begin_index+1)) 47 | p2_span = ddlib.Span(begin_word_id=p2_begin_index, length=(p2_end_index-p2_begin_index+1)) 48 | 49 | # Generate the generic features using DDLIB 50 | for feature in ddlib.get_generic_features_relation(sent, p1_span, p2_span): 51 | yield [p1_id, p2_id, feature] 52 | -------------------------------------------------------------------------------- /ie/deepdive/udf/get_actor_movie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | try: 8 | import simplejson as json 9 | except: 10 | import json 11 | 12 | import pymysql 13 | from pymysql import connections 14 | from collections import defaultdict 15 | 16 | 17 | class connec_mysql(object): 18 | def __init__(self): 19 | self.conn = pymysql.connect( 20 | host='localhost', 21 | user='root', 22 | passwd='root', 23 | db='baidu_baike', 24 | charset='utf8mb4', 25 | use_unicode=True 26 | ) 27 | self.cursor = self.conn.cursor() 28 | 29 | def get_actor_movie(self, filename, out_name): 30 | outfile = open(out_name,'w',encoding='UTF-8') 31 | with open(filename) as f: 32 | lines = f.readlines() 33 | for line in lines: 34 | words = line.strip().split() 35 | if len(words) != 2: 36 | print("Got line with wrong fromat~") 37 | continue 38 | actor_id = words[0] 39 | movie_id = words[1] 40 | self.cursor.execute( 41 | "SELECT actor_chName, actor_foreName FROM actor WHERE actor_id = {}".format(actor_id)) 42 | actor_list = self.cursor.fetchall() 43 | actor_chName, actor_foreName = actor_list[0] 44 | self.cursor.execute( 45 | "SELECT 
movie_chName, movie_foreName FROM movie WHERE movie_id = {}".format(movie_id)) 46 | movie_list = self.cursor.fetchall() 47 | movie_chName, movie_foreName = movie_list[0] 48 | for item_actor in [actor_chName, actor_foreName]: 49 | for item_movie in [movie_chName, movie_foreName]: 50 | if item_actor not in ["None", ""] and item_movie not in ["None", ""]: 51 | outfile.write(item_actor + "," + item_movie + "\n") 52 | 53 | outfile.close() 54 | 55 | 56 | if __name__ == "__main__": 57 | connect_sql = connec_mysql() 58 | connect_sql.get_actor_movie("../input/actor_movie.txt", "../input/actor_movie_dbdata.csv") 59 | -------------------------------------------------------------------------------- /ie/deepdive/udf/map_actor_mention.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from deepdive import * 5 | import re 6 | 7 | @tsv_extractor 8 | @returns(lambda 9 | mention_id = "text", 10 | mention_text = "text", 11 | doc_id = "text", 12 | sentence_index = "int", 13 | begin_index = "int", 14 | end_index = "int", 15 | :[]) 16 | def extract( 17 | doc_id = "text", 18 | sentence_index = "int", 19 | tokens = "text[]", 20 | pos_tags = "text[]", 21 | ner_tags = "text[]", 22 | ): 23 | """ 24 | Finds phrases thar are continuous words with POS tags == MISC and NER tags == NN. 25 | We make this decision due to stanford parser got bad performance when recognizing actor. 26 | """ 27 | num_tokens = len(ner_tags) 28 | first_index = ( i for i in xrange(num_tokens) if ner_tags[i] == "PERSON" and pos_tags[i] == "NR" and (i == 0 or (ner_tags[i-1] != "PERSON" and pos_tags[i-1] != "NR" )) and re.match(u'^[\u4e00-\u9fa5\u3040-\u309f\u30a0-\u30ffa-zA-Z]+$', unicode(tokens[i], "utf-8")) != None) 29 | for begin_index in first_index: 30 | end_index = begin_index + 1 31 | while end_index < num_tokens and ner_tags[end_index] == "PERSON" and pos_tags[end_index] == "NR" and re.match(u'^[\u4e00-\u9fa5\u3040-\u309f\u30a0-\u30ffa-zA-Z]+$', unicode(tokens[end_index], "utf-8")) != None: 32 | end_index += 1 33 | end_index -= 1 34 | mention_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index) 35 | mention_text = "".join(map(lambda i: tokens[i], xrange(begin_index, end_index + 1))) 36 | 37 | yield [ 38 | mention_id, 39 | mention_text, 40 | doc_id, 41 | sentence_index, 42 | begin_index, 43 | end_index, 44 | ] 45 | 46 | -------------------------------------------------------------------------------- /ie/deepdive/udf/map_movie_mention.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from deepdive import * 5 | import re 6 | 7 | @tsv_extractor 8 | @returns(lambda 9 | mention_id = "text", 10 | mention_text = "text", 11 | doc_id = "text", 12 | sentence_index = "int", 13 | begin_index = "int", 14 | end_index = "int", 15 | :[]) 16 | def extract( 17 | doc_id = "text", 18 | sentence_index = "int", 19 | tokens = "text[]", 20 | pos_tags = "text[]", 21 | ner_tags = "text[]", 22 | ): 23 | """ 24 | Finds phrases thar are continuous words with POS tags == MISC and NER tags == NN. 25 | We make this decision due to stanford parser got bad performance when recognizing movie. 
26 | """ 27 | num_tokens = len(ner_tags) 28 | first_index = ( i for i in xrange(num_tokens) if ner_tags[i] == "MISC" and pos_tags[i] == "NN" and (i == 0 or (ner_tags[i-1] != "MISC" and pos_tags[i-1] != "NN" )) and re.match(u'^[\u4e00-\u9fa5\u3040-\u309f\u30a0-\u30ffa-zA-Z]+$', unicode(tokens[i], "utf-8")) != None) 29 | for begin_index in first_index: 30 | end_index = begin_index + 1 31 | while end_index < num_tokens and ner_tags[end_index] == "MISC" and pos_tags[end_index] == "NN" and re.match(u'^[\u4e00-\u9fa5\u3040-\u309f\u30a0-\u30ffa-zA-Z]+$', unicode(tokens[end_index], "utf-8")) != None: 32 | end_index += 1 33 | end_index -= 1 34 | mention_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index) 35 | mention_text = "".join(map(lambda i: tokens[i], xrange(begin_index, end_index + 1))) 36 | 37 | yield [ 38 | mention_id, 39 | mention_text, 40 | doc_id, 41 | sentence_index, 42 | begin_index, 43 | end_index, 44 | ] 45 | 46 | -------------------------------------------------------------------------------- /ie/deepdive/udf/map_play_candidate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from deepdive import * 4 | import re 5 | 6 | @tsv_extractor 7 | @returns(lambda 8 | p1_id = "text", 9 | p1_name = "text", 10 | p2_id = "text", 11 | p2_name = "text", 12 | :[]) 13 | def extract( 14 | p1_id = "text", 15 | p1_name = "text", 16 | p2_id = "text", 17 | p2_name = "text", 18 | ): 19 | if not(set(p1_name) <= set(p2_name) or set(p2_name) <= set(p1_name)): 20 | yield [ 21 | p1_id, 22 | p1_name, 23 | p2_id, 24 | p2_name, 25 | ] 26 | -------------------------------------------------------------------------------- /ie/deepdive/udf/nlp_markup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # A shell script that runs Bazaar/Parser over documents passed as input TSV lines 3 | # 4 | # $ deepdive env udf/nlp_markup.sh doc_id _ _ content _ 5 | ## 6 | set -euo pipefail 7 | cd "$(dirname "$0")" 8 | 9 | : ${BAZAAR_HOME:=$PWD/bazaar} 10 | [[ -x "$BAZAAR_HOME"/parser/target/start ]] || { 11 | echo "No Bazaar/Parser set up at: $BAZAAR_HOME/parser" 12 | exit 2 13 | } >&2 14 | 15 | [[ $# -gt 0 ]] || 16 | # default column order of input TSV 17 | set -- doc_id content 18 | 19 | # convert input tsv lines into JSON lines for Bazaar/Parser 20 | 21 | 22 | # start Bazaar/Parser to emit sentences TSV 23 | tsv2json "$@" | 24 | "$BAZAAR_HOME"/parser/run.sh -i json -k doc_id -v content 25 | -------------------------------------------------------------------------------- /ie/deepdive/udf/supervise_play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from deepdive import * 5 | import random 6 | from collections import namedtuple 7 | 8 | PlayLabel = namedtuple('PlayLabel', 'p1_id, p2_id, label, type') 9 | 10 | @tsv_extractor 11 | @returns(lambda 12 | p1_id = "text", 13 | p2_id = "text", 14 | label = "int", 15 | rule_id = "text", 16 | :[]) 17 | # heuristic rules for finding positive/negative examples of play relationship mentions 18 | def supervise( 19 | p1_id="text", p1_begin="int", p1_end="int", 20 | p2_id="text", p2_begin="int", p2_end="int", 21 | doc_id="text", sentence_index="int", sentence_text="text", 22 | tokens="text[]", lemmas="text[]", pos_tags="text[]", ner_tags="text[]", 23 | dep_types="text[]", dep_token_indexes="int[]", 24 | ): 25 | PLAY = frozenset(["出演", "主演", "参演", 
"友情出演", "饰演", "特别出演"]) 26 | 27 | COMMAS = frozenset([":", ":","1","2","3","4","5","6","7","8","9","0","、", ";", ";"]) 28 | MAX_DIST = 40 29 | 30 | # Common data objects 31 | intermediate_lemmas = lemmas[p1_end+1:p2_begin] 32 | intermediate_ner_tags = ner_tags[p1_end+1:p2_begin] 33 | tail_lemmas = lemmas[p2_end+1:] 34 | play = PlayLabel(p1_id=p1_id, p2_id=p2_id, label=None, type=None) 35 | 36 | if len(intermediate_lemmas) > MAX_DIST: 37 | yield play._replace(label=-1, type='neg:far_apart') 38 | 39 | if 'PERSON' in intermediate_ner_tags: 40 | yield play._replace(label=-1, type='neg:third_person_between') 41 | 42 | if len(COMMAS.intersection(intermediate_lemmas)) > 0: 43 | yield play._replace(label=-1, type='neg:中间有特殊符号') 44 | 45 | if len(PLAY.intersection(intermediate_lemmas)) > 0: 46 | yield play._replace(label=1, type='pos:A出演B') 47 | -------------------------------------------------------------------------------- /ie/deepdive/udf/trans.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | 5 | with open("baidu_baike/articles.txt",encoding='utf-8') as f: 6 | with open("./articles.csv", "w+",encoding='utf-8') as o: 7 | lines = f.readlines() 8 | for line in lines: 9 | words = line.strip().split(",") 10 | id = words[0] 11 | content = words[1:] 12 | text = ','.join(content) 13 | text = text.replace(",", ",") 14 | o.write(id + "," + text + "\n") 15 | 16 | -------------------------------------------------------------------------------- /ie/re_cnn_att/clean.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import re 4 | import commands 5 | 6 | 7 | class Clean(object): 8 | @staticmethod 9 | def clean_word(word, clean_level='others'): 10 | """ 11 | Remove symbols in words 12 | :word word with unicode 13 | :clean_level keep different symbols for disambi/title 14 | :return clean word 15 | """ 16 | word = word.strip() 17 | 18 | if clean_level == "title": 19 | word = word.strip().strip("\"").replace("\n", " ").replace("\"", "").strip(u"\\") 20 | elif clean_level == "subject": 21 | word = word.replace("\"", "").strip("\\").strip() 22 | elif clean_level == "redirect": 23 | word = word.strip("\"") 24 | elif clean_level == "disambi": 25 | word = re.sub( 26 | u"[,。、&∈*.↑【2—‘:“#> BFR·Z<bf≈j×~①Ⅲ⑤⑨÷〔!%》-』1→5=AE∧I/″▲;]ξaeφi}④⑧…─☆《『0В<D∪L±γ′TXλ:dh|③⑦~、℃'〉+」/】3〕Δ’;”?■CGΨ[=μ_cgβ㈧o{②⑥'⑩。\~\!\@\#\$\%\^\&\*\(\)\_\-\+\=\{\}\[\]\\\|\:\;\'\"\.\>\?\/\, \xa0\u00a0\u3000]", 27 | "", word) 28 | elif clean_level == 'others': 29 | word = re.sub( 30 | u"[,。、&∈*.↑【2—‘:“#> BFR·Z<bf≈j×~①Ⅲ⑤⑨÷〔!%)》-』1→5=AE∧I/″▲;]ξaeφi}④⑧…─☆(《『0В<D∪L±γ′TXλ:dh|③⑦~、℃'〉+」/】3〕Δ’;”?■CGΨ[=μ_cgβ㈧o{②⑥'⑩。\~\!\@\#\$\%\^\&\*\(\)\_\-\+\=\{\}\[\]\\\|\:\;\'\"\.\>\?\/\,\xa0\u00a0\u3000\r\n]", 31 | "", word) 32 | return word 33 | 34 | 35 | class ProcessFile(object): 36 | @staticmethod 37 | def get_line(self, filename): 38 | total_lines = commands.getoutput("sed -n '$=' {}".format(filename)) 39 | with open(filename, "r", encoding='utf-8') as inf: 40 | for line_num in range(total_lines): 41 | line = inf.readline().strip() 42 | yield line 43 | -------------------------------------------------------------------------------- /ie/re_cnn_att/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/ie/re_cnn_att/data/__init__.py 
-------------------------------------------------------------------------------- /ie/re_cnn_att/word2vec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from gensim.models import word2vec 4 | from tqdm import tqdm 5 | import json 6 | import os 7 | import jieba 8 | from gen_re_from_baidu import LoadFile 9 | 10 | 11 | def cut_words(line): 12 | seg_line = " ".join(jieba.cut(line)) 13 | return seg_line 14 | 15 | 16 | def seg_file(infile="", outfile=""): 17 | with open(outfile, "w", encoding='utf-8') as ouf: 18 | for line in tqdm(LoadFile.readline(infile)): 19 | seg_line = cut_words(line) 20 | ouf.write(seg_line) 21 | 22 | 23 | def transfer_json(in_file_path, out_file_path): 24 | with open(in_file_path, "r", encoding='utf-8') as inf: 25 | ouf = open(out_file_path, "w", encoding='utf-8') 26 | word_embed_list = [] 27 | word_num, dim = inf.readline().strip().split() 28 | print("Total word_num: ", word_num, "\nWord dim: ", dim) 29 | for line_num in tqdm(range(int(word_num))): 30 | word_dict = {} 31 | words = inf.readline().strip().split() 32 | word_dict["word"] = words[0] 33 | word_dict["vec"] = eval("[" + ",".join(words[1:]) + "]") 34 | word_embed_list.append(word_dict) 35 | json.dump(word_embed_list, ouf) 36 | 37 | 38 | if __name__ == "__main__": 39 | seg_file(infile="data/6w_clean_disambi_text.csv", outfile="seg_6w_disambi_text.txt") 40 | os.system("C:\my\word2vec\word2vec -train seg_6w_disambi_text.txt -output word_vec.txt -size 50 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 0 -iter 3 -min-count 1 -hs 1") 41 | transfer_json("word_vec.txt", "word_vec.json") 42 | 43 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from clean import Clean 4 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/clean.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import re 4 | import commands 5 | 6 | class Clean(object): 7 | @staticmethod 8 | def clean_word(word, clean_level='others'): 9 | """ 10 | Remove symbols in words 11 | :word word with unicode 12 | :clean_level keep different symbols for disambi/title 13 | :return clean word 14 | """ 15 | word = word.strip() 16 | 17 | if clean_level == "title": 18 | word = word.strip().strip("\"").replace("\n", " ").replace("\"","").strip(u"\\") 19 | elif clean_level == "subject": 20 | word = word.replace("\"", "").strip("\\").strip() 21 | elif clean_level == "redirect": 22 | word = word.strip("\"") 23 | elif clean_level == "disambi": 24 | word = re.sub(u"[,。、&∈*.↑【2—‘:“#> BFR·Z<bf≈j×~①Ⅲ⑤⑨÷〔!%》-』1→5=AE∧I/″▲;]ξaeφi}④⑧…─☆《『0В<D∪L±γ′TXλ:dh|③⑦~、℃'〉+」/】3〕Δ’;”?■CGΨ[=μ_cgβ㈧o{②⑥'⑩。\~\!\@\#\$\%\^\&\*\(\)\_\-\+\=\{\}\[\]\\\|\:\;\'\"\.\>\?\/\, \xa0\u00a0\u3000\r\n]", "", word) 25 | elif clean_level == 'others': 26 | word = re.sub(u"[,。、&∈*.↑【2—‘:“#> BFR·Z<bf≈j×~①Ⅲ⑤⑨÷〔!%)》-』1→5=AE∧I/″▲;]ξaeφi}④⑧…─☆(《『0В<D∪L±γ′TXλ:dh|③⑦~、℃'〉+」/】3〕Δ’;”?■CGΨ[=μ_cgβ㈧o{②⑥'⑩。\~\!\@\#\$\%\^\&\*\(\)\_\-\+\=\{\}\[\]\\\|\:\;\'\"\.\>\?\/\,\xa0\u00a0\u3000\r\n]", "", word) 27 | return word 28 | 29 | class ProcessFile(object): 30 | @staticmethod 31 | def get_line(self, filename): 32 | total_lines = commands.getoutput("sed -n '$=' {}".format(filename)) 33 | with 
open(filename) as inf: 34 | for line_num in range(total_lines): 35 | line = inf.readline().strip() 36 | yield line 37 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/cleanFile.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from tqdm import tqdm 4 | from clean import Clean 5 | def clean_title_disambi(infile="title_disambi.csv", outfile="title_disambi_out.csv"): 6 | with open(infile, "r",encoding='utf-8') as inf: 7 | lines = inf.readlines() 8 | err_counts = 0 9 | with open(outfile, "w",encoding='utf-8') as ouf: 10 | for line in tqdm(lines): 11 | words = line.strip().split("\",\"") 12 | if len(words) != 2: 13 | err_counts += 1 14 | continue 15 | title = Clean.clean_word(words[0], clean_level='title') 16 | disambi = Clean.clean_word(words[1], clean_level='disambi') 17 | ouf.write("\"" + title + "\",\"" + disambi + "\"\r\n") 18 | print("err_counts for disambi_redirect: ", err_counts) 19 | 20 | 21 | def clean_disambi_redirect(infile="disambi_redirect.csv", outfile="disambi_redirect_out.csv"): 22 | with open(infile, "r",encoding='utf-8') as inf: 23 | lines = inf.readlines() 24 | err_counts = 0 25 | with open(outfile, "w",encoding='utf-8') as ouf: 26 | for line in tqdm(lines): 27 | words = line.strip().split("\",\"") 28 | if len(words) != 2: 29 | err_counts += 1 30 | continue 31 | disambi = Clean.clean_word(words[0], clean_level='disambi') 32 | redirect = Clean.clean_word(words[1], clean_level='redirect') 33 | ouf.write("\"" + disambi + "\",\"" + redirect + "\"\r\n") 34 | print("err_counts for disambi_redirect: ", err_counts) 35 | 36 | 37 | def clean_disambi_subject(infile="disambi_subject.csv", outfile="disambi_subject_out.csv"): 38 | with open(infile, "r",encoding='utf-8') as inf: 39 | lines = inf.readlines() 40 | err_counts = 0 41 | with open(outfile, "w",encoding='utf-8') as ouf: 42 | for line in tqdm(lines): 43 | words = line.strip().split("\",\"") 44 | if len(words) != 2: 45 | err_counts += 1 46 | continue 47 | disambi = Clean.clean_word(words[0], clean_level='disambi') 48 | subject = Clean.clean_word(words[1], clean_level='subject') 49 | ouf.write("\"" + disambi + "\",\"" + subject + "\"\r\n") 50 | print("err_counts for disambi_redirect: ", err_counts) 51 | 52 | 53 | if __name__ == '__main__': 54 | clean_title_disambi(infile="./410_baidu/410_title_disambi.csv", outfile="./410_baidu/410_title_disambi_out.csv") 55 | clean_disambi_redirect(infile="./410_baidu/410_disambi_redirect.csv", outfile="./410_baidu/410_disambi_redirect_out.csv") 56 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/gen_disambi_infobox.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import re 4 | import json 5 | import re 6 | from tqdm import tqdm 7 | from clean import Clean 8 | 9 | def get_word_list(filename): 10 | with open(filename, "r",encoding='utf-8') as inf: 11 | lines = inf.readlines() 12 | # print "type line: ", type(lines[0].encode("utf-8")) 13 | lines = [Clean.clean_word(line, clean_level='title') for line in lines] 14 | return lines 15 | 16 | 17 | print(Clean.clean_word(u"\"你好 呀#\"$%^&*@!,。、;:‘’】季    候【")) 18 | 19 | 20 | def main(): 21 | with open("./410_baidu/410_disambi_infobox.csv",'r',encoding='UTF-8') as inf: 22 | lines = inf.readlines() 23 | f = open("./410_baidu/410_disambi_infobox_out.csv", 
"w",encoding='utf-8') 24 | list_attr = [] 25 | title_list = get_word_list("./410_baidu/410_title.csv") 26 | err_count = 0 27 | counts = {} 28 | for line in tqdm(lines): 29 | words = line.strip().split(",") 30 | disambi = Clean.clean_word(words[0], clean_level='disambi') 31 | infobox = ",".join(words[1:]) 32 | try: 33 | info_dict = json.loads(json.loads(infobox)) 34 | for attr in info_dict.keys(): 35 | clean_attr = Clean.clean_word(attr) 36 | info_dict[clean_attr] = info_dict.pop(attr) 37 | value = info_dict[clean_attr] 38 | clean_attr = clean_attr 39 | counts[clean_attr] = counts.setdefault(clean_attr, 0) + 1 40 | list_attr.append(clean_attr) 41 | value_split = re.split(u"[,。、,/]", value.strip()) 42 | for v in value_split: 43 | v = Clean.clean_word(v).strip(u"等").strip(u"收起") 44 | title_list.append(v) 45 | f.write("\"" + disambi + "\",\"" + clean_attr + "\",\"" + v + "\"" + "\r\n") 46 | except Exception as e: 47 | print(e) 48 | err_count += 1 49 | title_list = [t.strip(u"\\") for t in title_list] 50 | title_list = list(set(title_list)) 51 | list_attr = list(set(list_attr)) 52 | sort_counts = sorted(counts.items(),key = lambda x:x[1],reverse = True) 53 | with open("./sort_counts.txt", "w",encoding='utf-8') as ouf: 54 | for i in sort_counts: 55 | ouf.write(str(i) + "\n") 56 | with open("./all_attr.txt", "w",encoding='utf-8') as ouf: 57 | for word_counts in sort_counts: 58 | if word_counts[1] >= 10: 59 | ouf.write(str(word_counts[0]) + "\n") 60 | with open("./410_baidu/410_title_new.csv", "w",encoding='utf-8') as ouf: 61 | for i in title_list: 62 | ouf.write("\"" + i + "\"\r\n") 63 | with open("./410_baidu/all_attr.txt", "w",encoding='utf-8') as ouf: 64 | for i in list_attr: 65 | ouf.write(i + "\n") 66 | 67 | print("err_count: ", err_count) 68 | 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/get_subject.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from collections import defaultdict 5 | from clean import Clean 6 | from tqdm import tqdm 7 | 8 | with open("./410_baidu/410_disambi_subject.csv", "r",encoding='utf-8') as inf: 9 | lines = inf.readlines() 10 | # all_subject = defaultdict(list) 11 | total_subject = [] 12 | f = open("./410_baidu/disambi_subject.csv", "w",encoding='utf-8') 13 | for line in tqdm(lines): 14 | words = line.strip().split(",") 15 | disambi = Clean.clean_word(words[0], clean_level='disambi') 16 | subjects = words[1:] 17 | subjects = [Clean.clean_word(s, clean_level="subject") for s in subjects] 18 | # subjects = [s.replace("\"", "").strip("\\") for s in subjects] 19 | # subjects = [s.strip() for s in subjects] 20 | total_subject.extend(subjects) 21 | for subject in subjects: 22 | if subject == "": 23 | continue 24 | f.write("\"" + disambi + "\",\"" + subject + "\"\r\n") 25 | # all_subject[disambi].append(subjects) 26 | f.close() 27 | total_subject = list(set(total_subject)) 28 | print("Total subjects: ", len(total_subject)) 29 | with open("./410_baidu/all_subject.csv", "w",encoding='utf-8') as ouf: 30 | ouf.write("\"" + "\"\n\"".join(total_subject) + "\"") 31 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/header_file/disambi_headers.csv: -------------------------------------------------------------------------------- 1 | disambi:ID(Disambi),title,abstract,curLink,exterLink 
-------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/header_file/disambi_infobox_header.csv: -------------------------------------------------------------------------------- 1 | :START_ID(Disambi),role,:END_ID(Title) -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/header_file/disambi_redirect_header.csv: -------------------------------------------------------------------------------- 1 | :START_ID(Disambi),:END_ID(Redirect) -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/header_file/disambi_subject_header.csv: -------------------------------------------------------------------------------- 1 | :START_ID(Disambi),:END_ID(Subject) -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/header_file/redirect_header.csv: -------------------------------------------------------------------------------- 1 | redirect:ID(Redirect) -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/header_file/subject_header.csv: -------------------------------------------------------------------------------- 1 | subject:ID(Subject) -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/header_file/title_disambi_header.csv: -------------------------------------------------------------------------------- 1 | :START_ID(Disambi),:END_ID(Title) -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/header_file/title_header.csv: -------------------------------------------------------------------------------- 1 | title:ID(Title) -------------------------------------------------------------------------------- /ie/struct_to_rdf/baidu2neo4j/remove_disambi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import re 4 | from clean import Clean 5 | from tqdm import tqdm 6 | 7 | with open("./410_baidu/410_disambi.csv", "r",encoding='utf-8') as inf: 8 | title_dict = {} 9 | count = 0 10 | lines = inf.readlines() 11 | for line in tqdm(lines): 12 | words = line.strip().split("\",\"") 13 | if len(words) != 4: 14 | count += 1 15 | continue  # skip malformed rows instead of writing them, matching the other cleaners 16 | clean_disambi = Clean.clean_word(words[0], 'disambi') 17 | title_dict[clean_disambi] = words[1:] 18 | print("Error lines: ", count) 19 | with open("./410_baidu/410_disambi_new.csv", "w",encoding='utf-8') as ouf: 20 | for i in title_dict.keys(): 21 | ouf.write("\"" + i + "\",\"" + "\",\"".join(title_dict[i]) + "\r\n") 22 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/movie_actor/clean_actor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | """ 5 | Clean the actor_back table and re-insert the cleaned rows into the actor table.
6 | """ 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import sys 12 | reload(sys) 13 | sys.setdefaultencoding('utf-8') 14 | 15 | import pymysql 16 | from pymysql import connections 17 | import numpy as np 18 | import re 19 | 20 | class connec_mysql(object): 21 | def __init__(self): 22 | self.conn = pymysql.connect( 23 | host='localhost', 24 | user='root', 25 | passwd='nlp', 26 | db='hudong_baike', 27 | charset='utf8mb4', 28 | use_unicode=True 29 | ) 30 | self.cursor = self.conn.cursor() 31 | 32 | def process_actor_gen(self): 33 | actor_gen_id = 0 34 | self.cursor.execute("SELECT MAX(actor_id) FROM actor_back") 35 | max_actor_id = self.cursor.fetchall()[0][0] 36 | assert isinstance(max_actor_id, int) 37 | for actor_id in range(1, max_actor_id + 1): 38 | # for actor_id in range(1, 1 + 10): 39 | self.cursor.execute("SELECT * FROM actor_back WHERE actor_id = {};".format(actor_id)) 40 | result = self.cursor.fetchall() 41 | if np.shape(result) != (1, 11): 42 | continue 43 | new_actor_list = [ result[0][i].replace(u'title="" href=""', "") if not isinstance(result[0][i], int) else result[0][i] for i in range(0, 11) ] 44 | new_actor_list = [ new_actor_list[i].strip(u' 《》') if not isinstance(new_actor_list[i], int) else new_actor_list[i] for i in range(0, 11) ] 45 | new_actor_tuple = tuple(new_actor_list) 46 | sql = """ 47 | INSERT INTO actor( actor_id, actor_bio, actor_chName, actor_foreName, actor_nationality, actor_constellation, actor_birthPlace, actor_birthDay, actor_repWorks, actor_achiem, actor_brokerage ) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 48 | """ 49 | self.cursor.execute(sql, new_actor_tuple) 50 | self.conn.commit() 51 | 52 | if __name__ == '__main__': 53 | connec = connec_mysql() 54 | connec.process_actor_gen() 55 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/movie_actor/clean_mysql.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | """ 5 | Get the table of actor_to_movie and movie_to_genre. 
6 | """ 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import sys 12 | reload(sys) 13 | sys.setdefaultencoding('utf-8') 14 | 15 | import pymysql 16 | from pymysql import connections 17 | import numpy as np 18 | import re 19 | 20 | class connec_mysql(object): 21 | def __init__(self): 22 | self.conn = pymysql.connect( 23 | host='localhost', 24 | user='root', 25 | passwd='nlp', 26 | db='hudong_baike', 27 | charset='utf8mb4', 28 | use_unicode=True 29 | ) 30 | self.cursor = self.conn.cursor() 31 | 32 | def process_movie_gen(self): 33 | movie_gen_id = 0 34 | self.cursor.execute("SELECT MAX(movie_id) FROM movie_back") 35 | max_movie_id = self.cursor.fetchall()[0][0] 36 | assert isinstance(max_movie_id, int) 37 | for movie_id in range(1, max_movie_id + 1): 38 | # for movie_id in range(1, 1 + 1): 39 | self.cursor.execute("SELECT * FROM movie_back WHERE movie_id = {};".format(movie_id)) 40 | result = self.cursor.fetchall() 41 | print("np.shape(result): ", np.shape(result)) 42 | if np.shape(result) != (1, 14): 43 | continue 44 | new_movie_list = [ result[0][i].strip(u" 《》") if not isinstance(result[0][i], int) else result[0][i] for i in range(0, 14) ] 45 | # new_movie_list = [result[0][i] if i != 2 else movie_name for i in range(0, 14)] 46 | new_movie_tuple = tuple(new_movie_list) 47 | sql = """ 48 | INSERT INTO movie( movie_id, movie_bio, movie_chName, movie_foreName, movie_prodTime, movie_prodCompany, movie_director, movie_screenwriter, movie_genre, movie_star, movie_length, movie_rekeaseTime, movie_language, movie_achiem ) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 49 | """ 50 | self.cursor.execute(sql, new_movie_tuple) 51 | self.conn.commit() 52 | 53 | if __name__ == '__main__': 54 | connec = connec_mysql() 55 | connec.process_movie_gen() 56 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/movie_actor/complete_mysql.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | """ 5 | Get the table of actor_to_movie and movie_to_genre. 
6 | """ 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import sys 12 | reload(sys) 13 | sys.setdefaultencoding('utf-8') 14 | 15 | import pymysql 16 | from pymysql import connections 17 | import numpy as np 18 | import re 19 | 20 | class connec_mysql(object): 21 | def __init__(self): 22 | self.conn = pymysql.connect( 23 | host='localhost', 24 | user='root', 25 | passwd='nlp', 26 | db='hudong_baike', 27 | charset='utf8mb4', 28 | use_unicode=True 29 | ) 30 | self.cursor = self.conn.cursor() 31 | 32 | def process_act_movie(self): 33 | actor_movie_id = 0 34 | self.cursor.execute("SELECT MAX(actor_id) FROM actor") 35 | max_actor_id = self.cursor.fetchall()[0][0] 36 | assert isinstance(max_actor_id, int) 37 | for actor_id in range(1, max_actor_id + 1): 38 | self.cursor.execute("SELECT actor_repworks FROM actor WHERE actor_id = {};".format(actor_id)) 39 | result = self.cursor.fetchall() 40 | assert np.shape(result) == (1, 1) # if didn't exist, return (0, ) 41 | repworks = re.split(u"[,/、 ]", result[0][0] ) 42 | try: 43 | assert len(repworks) > 0 44 | for repwork in repworks: 45 | repwork = repwork.strip(u" 《》") 46 | self.cursor.execute("SELECT movie_id FROM movie WHERE movie_chName = %s", repwork) 47 | check_movie_id = self.cursor.fetchall() 48 | if len(check_movie_id) != 0: 49 | self.cursor.execute("INSERT INTO actor_to_movie (actor_movie_id, actor_id, movie_id) VALUES (%s, %s, %s)", (actor_movie_id, actor_id, check_movie_id[0][0]) ) 50 | self.conn.commit() 51 | actor_movie_id += 1 52 | except Exception as e: 53 | print("Get a error with ", e, "Maybe this actor has no represent works") 54 | continue 55 | 56 | def process_movie_gen(self): 57 | movie_gen_id = 0 58 | self.cursor.execute("SELECT MAX(movie_id) FROM movie") 59 | max_movie_id = self.cursor.fetchall()[0][0] 60 | assert isinstance(max_movie_id, int) 61 | for movie_id in range(1, max_movie_id + 1): 62 | # for movie_id in range(1, 1 + 10): 63 | self.cursor.execute("SELECT movie_genre FROM movie WHERE movie_id = {};".format(movie_id)) 64 | result = self.cursor.fetchall() 65 | if np.shape(result) != (1, 1): 66 | continue 67 | movie_genres = re.split(u"[,/、 ]", result[0][0] ) 68 | # print("movie_genres: ", movie_genres) 69 | try: 70 | assert len(movie_genres) > 0 71 | for movie_genre in movie_genres: 72 | self.cursor.execute("SELECT genre_id FROM genre WHERE genre_name = %s", movie_genre) 73 | check_genre_id = self.cursor.fetchall() 74 | if len(check_genre_id) != 0: 75 | self.cursor.execute("INSERT INTO movie_to_genre (movie_genre_id, movie_id, genre_id) VALUES (%s, %s, %s)", (movie_gen_id, movie_id, check_genre_id[0][0]) ) 76 | self.conn.commit() 77 | movie_gen_id += 1 78 | except Exception as e: 79 | print("Get a error with ", e) 80 | continue 81 | if __name__ == '__main__': 82 | connec = connec_mysql() 83 | # connec.process_act_movie() 84 | connec.process_movie_gen() 85 | -------------------------------------------------------------------------------- /ie/struct_to_rdf/movie_actor/get_ttl.bat: -------------------------------------------------------------------------------- 1 | @echo off&setlocal enabledelayedexpansion 2 | 3 | set db=baidu_baike 4 | set file=kg_demo_mapping_%db%.ttl 5 | 6 | call generate-mapping -u root -p root -o %file% jdbc:mysql:///%db%?useSSL=false 7 | 8 | :: call findstr /i /v /C:"@prefix vocab" "%file%">>%file%.bk 9 | :: move /y %file%.bk %file% 10 | 11 | for /f "tokens=1,* delims=:" %%b in ('findstr /n ".*" "%file%"')do ( 12 | set 
"var=%%c" 13 | if "!var!" neq "@prefix vocab: ." ( 14 | if "!var!" equ "" ( 15 | >>%file%.bk echo,!var!) ^ 16 | else if "!var!" equ "@prefix jdbc: ." ( 17 | >>%file%.bk echo,!var! 18 | >>%file%.bk echo,@prefix : ^ .) ^ 19 | else ( 20 | echo;"!var!"|find "jdbcDSN"&&( 21 | >>%file%.bk echo, d2rq:jdbcDSN ^"jdbc:mysql:///%db%?useUnicode=true^&characterEncoding=utf8^&useSSL=false^";)||( 22 | set "var=!var:vocab= !" 23 | set "var=!var:actor_actor=actor!" 24 | set "var=!var:movie_movie=movie!" 25 | set "var=!var:genre_genre=genre!" 26 | set "var=!var:class :actor=class :Actor!" 27 | set "var=!var:class :movie=class :Movie!" 28 | set "var=!var:class :genre=class :Genre!" 29 | set "var=!var:property :actor_to_movie=property :hasActedIn!" 30 | set "var=!var:property :movie_to_genre=property :hasGenre!" 31 | >>%file%.bk echo,!var!)) 32 | ) 33 | ) 34 | move /y %file%.bk %file% -------------------------------------------------------------------------------- /ie/struct_to_rdf/movie_actor/get_ttl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Downloaing d2rq tools" 4 | wget https://github.com/downloads/d2rq/d2rq/d2rq-0.8.1.tar.gz; 5 | echo"Done" 6 | tar -xvzf d2rq-0.8.1.tar.gz; 7 | cd d2rq-0.8.1; 8 | for x in {hudong_baike,baidu_baike}; do 9 | echo "Generating ttl and nt files for $x" 10 | name_ttl=`echo "kg_demo_mapping_$x.ttl"` 11 | name_nt=`echo "$x.nt"` 12 | ./generate-mapping -u root -p nlp -o $name_ttl jdbc:mysql:///$x; 13 | sed -i '/\@prefix vocab.* \./d' $name_ttl # delete vocab prefix 14 | sed -i 's/vocab/ /g' $name_ttl 15 | sed -i 's/actor_actor/actor/g' $name_ttl 16 | sed -i 's/d2rq\:jdbcDSN "jdbc\:mysql.*;/d2rq\:jdbcDSN "jdbc\:mysql\:\/\/\/hudong_baike\?useUnicode=true\&characterEncoding=utf8";/g' $name_ttl 17 | sed -i '8a \@prefix : .' $name_ttl; 18 | ./dump-rdf -o $name_nt $name_ttl; # get NTriples 19 | done 20 | 21 | if [ $? -ne 0 ]; then 22 | echo "Generate mapping and nt files failed. Terminated." 
23 | exit 1 24 | fi -------------------------------------------------------------------------------- /img/actor_movie_genre.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/img/actor_movie_genre.png -------------------------------------------------------------------------------- /img/baike.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/img/baike.png -------------------------------------------------------------------------------- /img/example_REfO_KBQA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/img/example_REfO_KBQA.png -------------------------------------------------------------------------------- /img/example_d2rq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/img/example_d2rq.png -------------------------------------------------------------------------------- /img/example_elastic_ss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/img/example_elastic_ss.png -------------------------------------------------------------------------------- /knowledge_fusion/silk/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /knowledge_fusion/silk/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /knowledge_fusion/silk/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /knowledge_fusion/silk/.idea/silk.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /knowledge_fusion/silk/.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 27 | 28 | 29 | 30 | 31 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 1573547510765 64 | 69 | 70 | 71 | 72 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /knowledge_fusion/silk/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import requests 5 | import commands 6 | import math 7 | from tqdm import tqdm 8 | from batch_link import * 9 | import time 10 | import os 11 | import subprocess 12 | import argparse 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--fuseki', type=str, 16 | default='/home1/peng/project/apache-jena-fuseki-3.7.0/', help='Path to fuseki-server') 17 |
parser.add_argument('--baiduNt', type=str , 18 | default="/home1/peng/project/d2rq-0.8.1/full_nt/baidu_baike.nt" , help='Path to baidu N-triples ') 19 | parser.add_argument('--hudongNt', type=str , 20 | default="/home1/peng/project/d2rq-0.8.1/full_nt/hudong_baike.nt" , help='Path to hudong N-triples') 21 | parser.add_argument('--maxNtLength', type=float , 22 | default=5000000.0 , help='Max N-triples in each nt file') 23 | parser.add_argument('--ip', type=str , 24 | default='localhost' , help='Ip for Fuseki and Silk server') 25 | parser.add_argument('--projectName', type=str , 26 | default='baike' , help='Silk project name') 27 | args = parser.parse_args() 28 | 29 | 30 | if __name__ == "__main__": 31 | baidu_name = [] 32 | hudong_name = [] 33 | jm = JenaCmd() 34 | nts = [args.baiduNt, args.hudongNt] 35 | # nts = ["/home1/peng/project/d2rq-0.8.1/1_nt/baidu_1.nt", "/home1/peng/project/d2rq-0.8.1/1_nt/hudong_1.nt"] 36 | subprocess.Popen(['sh', os.path.join(args.fuseki, 'fuseki-server')], shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 37 | time.sleep(5) 38 | for idx, nt in enumerate(nts): 39 | print("nt", nt) 40 | out_file_list = seg_nt(nt, max_len=args.maxNtLength) 41 | for file in out_file_list: 42 | dbName = file.split("/")[-1].strip(".nt") 43 | if idx == 0: 44 | baidu_name.append(dbName) 45 | else: 46 | hudong_name.append(dbName) 47 | jm.delete_tdb(dbName=dbName) 48 | jm.add_tdb(dbName=dbName) 49 | JenaCmd.load_nt("./tdb_" + dbName, file) 50 | status, out = commands.getstatusoutput('cp {}/* {}'.format("./tdb_" + dbName, os.path.join(args.fuseki, "run/databases/") + dbName)) 51 | print(out) 52 | # restart fuseki server 53 | _, out = commands.getstatusoutput("netstat -tunlp|grep 3030") 54 | uid = out.split()[-1].strip("/java')") 55 | _, _ = commands.getstatusoutput("kill {}".format(uid)) 56 | subprocess.Popen(['sh', os.path.join(args.fuseki, 'fuseki-server')], shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 57 | time.sleep(10) 58 | 59 | sm = SilkCmd() 60 | # delete old project and build new project 61 | sm.control_project(project_name=args.projectName, action="DELETE") 62 | sm.control_project(project_name=args.projectName) 63 | # add prefixes to project 64 | prefixes = {"baidu": "http://www.kgbaidu.com#", 65 | "hudong": "http://www.kghudong.com#"} 66 | sm.add_prefix(prefixes) 67 | # add Sparql endpoint datasets 68 | for dname in hudong_name + baidu_name: 69 | print(sm.build_endPoint(dname, "http://{}:3030/{}/query".format(args.ip, dname), "500000")) 70 | 71 | # build linking task 72 | linking_rule = '' 73 | for hname in hudong_name: 74 | for bname in baidu_name: 75 | # odata_file means output rdf file 76 | # o_rdf is the name of output dataset name corresponding to odata_file 77 | odata_file = "o_" + hname + bname + ".nt" 78 | o_rdf = "d_" + hname + bname 79 | task_name = "t_" + hname + bname 80 | sm.build_output(odata_file) 81 | sm.build_rdf(o_rdf, odata_file) 82 | sm.build_task(project_name=args.projectName, task_name=task_name, source_data=bname, target_data=hname, output_data=o_rdf) 83 | 84 | for hname in hudong_name: 85 | for bname in baidu_name: 86 | odata_file = "o_" + hname + bname + ".nt" 87 | o_rdf = "d_" + hname + bname 88 | task_name = "t_" + hname + bname 89 | print(sm.add_rule(linking_rule, project_name=args.projectName, task_name=task_name)) 90 | print("task_name: ", type(task_name), task_name) 91 | sm = SilkCmd() 92 | print(sm.control_linking(project_name=args.projectName, task_name=task_name)) 93 | time.sleep(60*60*5) 94 | 
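Note: run.py imports both commands (Python 2 only) and subprocess, and calls commands.getstatusoutput for the copy, netstat and kill steps. Under Python 3 the equivalent calls are subprocess.getoutput / subprocess.getstatusoutput. A minimal sketch of the "restart fuseki server" step under that assumption (restart_fuseki is a hypothetical helper, not part of the repo):

import os
import subprocess
import time

def restart_fuseki(fuseki_home):
    # find the process listening on 3030 and kill it, as run.py does with netstat/kill
    _, out = subprocess.getstatusoutput("netstat -tunlp | grep 3030")
    if out:
        pid = out.split()[-1].split("/")[0]   # last netstat column looks like "1234/java"
        subprocess.getstatusoutput("kill {}".format(pid))
    # relaunch the server in the background and give it time to come up
    subprocess.Popen(["sh", os.path.join(fuseki_home, "fuseki-server")],
                     stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    time.sleep(10)

With the argparse defaults above, the script would be launched roughly as: python run.py --fuseki /home1/peng/project/apache-jena-fuseki-3.7.0/ --ip localhost --projectName baike (the paths are the author's; adjust them to your own layout).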
-------------------------------------------------------------------------------- /requirement.text: -------------------------------------------------------------------------------- 1 | BeautifulSoup4 2 | chardet 3 | pymysql 4 | sparqlwrapper 5 | jieba 6 | refo 7 | -------------------------------------------------------------------------------- /semantic_search/elasticsearch/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/semantic_search/elasticsearch/data/__init__.py -------------------------------------------------------------------------------- /semantic_search/elasticsearch/query.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | from utils import views 8 | 9 | if __name__ == '__main__': 10 | while True: 11 | question = input() 12 | answer = views.search(question.encode('utf-8')) 13 | print("Your question is : ", question, "\nAnswer: ", answer) 14 | -------------------------------------------------------------------------------- /semantic_search/elasticsearch/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myhhub/KnowledgeGraph/035119b7a02ff8e79ec1e2c5ec7b44323d7fd192/semantic_search/elasticsearch/utils/__init__.py -------------------------------------------------------------------------------- /semantic_search/elasticsearch/utils/build_dict.py: -------------------------------------------------------------------------------- 1 | import ahocorasick 2 | import pickle 3 | from collections import defaultdict 4 | 5 | entity_list_file = './data/all_entity.txt' 6 | entity_out_path = './data/ent_ac.pkl' 7 | attr_list_file = './data/attr_mapping.txt' 8 | attr_out_path = './data/attr_ac.pkl' 9 | val_list_file = './data/Person_val.txt' 10 | 11 | 12 | def dump_ac_entity_dict(list_file, out_path): 13 | A = ahocorasick.Automaton() 14 | f = open(list_file) 15 | i = 0 16 | for line in f: 17 | word = line.strip() 18 | A.add_word(word, (i, word)) 19 | i += 1 20 | A.make_automaton() 21 | pickle.dump(A, open(out_path, "wb")) 22 | 23 | 24 | def dump_ac_attr_dict(attr_mapping_file, out_path): 25 | A = ahocorasick.Automaton() 26 | f = open(attr_mapping_file) 27 | i = 0 28 | for line in f: 29 | parts = line.strip().split(" ") 30 | for p in parts: 31 | if p != "": 32 | A.add_word(p, (i, p)) 33 | i += 1 34 | A.make_automaton() 35 | pickle.dump(A, open(out_path, 'wb')) 36 | 37 | 38 | def load_ac_dict(out_path): 39 | A = pickle.load(open(out_path, "rb")) 40 | return A 41 | 42 | 43 | def load_attr_map(attr_mapping_file): 44 | f = open(attr_mapping_file) 45 | mapping = defaultdict(list) 46 | for line in f: 47 | parts = line.strip().split(" ") 48 | for p in parts: 49 | if p != '': 50 | mapping[p].append(parts[0]) 51 | return mapping 52 | 53 | 54 | def load_entity_dict(entity_file): 55 | f = open(entity_file) 56 | ents = {} 57 | for line in f: 58 | ents[line.strip()] = 1 59 | return ents 60 | 61 | 62 | def load_val_dict(val_file): 63 | f = open(val_file) 64 | val_attr_map = {} 65 | for line in f: 66 | parts = line.strip().split(" ") 67 | if line == "\n" or len(parts) < 2: 68 | continue 69 | new_str = u" ".join(parts[0:len(parts) - 1]).encode('utf-8') 70 | val_attr_map[u" 
".join(parts[0:len(parts) - 1]).encode('utf-8')] = parts[-1] 71 | return val_attr_map 72 | 73 | 74 | if __name__ == '__main__': 75 | dump_ac_attr_dict(attr_list_file, attr_out_path) 76 | # load_val_dict(val_list_file) 77 | -------------------------------------------------------------------------------- /semantic_search/elasticsearch/utils/get_ac_attr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import ahocorasick 4 | import pickle 5 | from collections import defaultdict 6 | 7 | 8 | def dump_ac_attr_dict(attr_mapping_file='../data/attr_mapping.txt', out_path='../data/attr_ac.pkl'): 9 | A = ahocorasick.Automaton() 10 | f = open(attr_mapping_file,'r',encoding='UTF-8') 11 | i = 0 12 | for line in f: 13 | parts = line.strip().split(" ") 14 | for p in parts: 15 | if p != "": 16 | A.add_word(p, (i, p)) 17 | i += 1 18 | A.make_automaton() 19 | pickle.dump(A, open(out_path, 'wb')) 20 | 21 | 22 | if __name__ == '__main__': 23 | dump_ac_attr_dict() 24 | -------------------------------------------------------------------------------- /semantic_search/elasticsearch/utils/get_json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | try: 8 | import simplejson as json 9 | except: 10 | import json 11 | 12 | import pymysql 13 | from pymysql import connections 14 | from collections import defaultdict 15 | 16 | 17 | class connec_mysql(object): 18 | def __init__(self): 19 | self.conn = pymysql.connect( 20 | host='localhost', 21 | user='root', 22 | passwd='root', 23 | db='baidu_baike', 24 | charset='utf8mb4', 25 | use_unicode=True 26 | ) 27 | self.cursor = self.conn.cursor() 28 | 29 | def select_from_db(self, target_item, target_table, target_condition, target_value): 30 | self.cursor.execute("SELECT %s FROM %s WHERE %s = %s", 31 | (target_item, target_table, target_condition, target_value)) 32 | result = self.cursor.fetchall() 33 | return result 34 | 35 | def get_json(self): 36 | for cate in ["actor", "movie"]: 37 | cate = cate.strip() 38 | self.cursor.execute("SELECT MAX({}_id) FROM {}".format(cate, cate)) 39 | result = self.cursor.fetchall() 40 | max_id = result[0][0] if result[0][0] != None else 0 41 | print("max_id: ", max_id) 42 | f = open("{}.json".format(cate), "w+") 43 | for id in range(1, max_id + 1): 44 | self.cursor.execute("SELECT * FROM {} WHERE {}_id = {}".format(cate, cate, id)) 45 | item_lists = self.cursor.fetchall() 46 | # self.cursor.execute("SELECT COLUMN FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME='{}'".format(cate)) 47 | actor_column_attr = ["actor_id", "actor_bio", "actor_chName", "actor_foreName", "actor_nationality", 48 | "actor_constellation", "actor_birthPlace", "actor_birthDay", "actor_repWorks", 49 | "actor_achiem", "actor_brokerage"] 50 | movie_column_attr = ["movie_id", "movie_bio", "movie_chName", "movie_foreName", "movie_prodTime", 51 | "movie_prodCompany", "movie_director", "movie_screenwriter", "movie_genre", 52 | "movie_star", "movie_length", "movie_rekeaseTime", "movie_language", 53 | "movie_achiem"] 54 | column_attr = actor_column_attr if cate == "actor" else movie_column_attr 55 | 56 | if item_lists == None and column_attr == None: 57 | continue 58 | try: 59 | assert len(item_lists[0]) == 14 or len(item_lists[0]) == 11 60 | item_dict = defaultdict(list) 61 | item_dict["subj"] 
= str(item_lists[0][2]) 62 | list_po = [] 63 | for i in range(1, len(item_lists[0])): 64 | if column_attr[i] == "{}_chName".format(cate): # skip actor_chName 65 | continue 66 | tmp_dict = {} 67 | tmp_dict["pred"] = column_attr[i] 68 | tmp_dict["obj"] = item_lists[0][i] 69 | list_po.append(tmp_dict) 70 | item_dict["po"] = list_po 71 | item_json = json.dumps(item_dict) 72 | f.write(item_json + "\n") 73 | 74 | except Exception as e: 75 | print(e) 76 | 77 | 78 | if __name__ == "__main__": 79 | connect_sql = connec_mysql() 80 | connect_sql.get_json() 81 | -------------------------------------------------------------------------------- /semantic_search/elasticsearch/utils/get_total_val.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | try: 8 | import simplejson as json 9 | except: 10 | import json 11 | 12 | import pymysql 13 | from pymysql import connections 14 | from collections import defaultdict 15 | 16 | 17 | class connec_mysql(object): 18 | def __init__(self): 19 | self.conn = pymysql.connect( 20 | host='localhost', 21 | user='root', 22 | passwd='root', 23 | db='baidu_baike', 24 | charset='utf8mb4', 25 | use_unicode=True 26 | ) 27 | self.cursor = self.conn.cursor() 28 | 29 | def get_json(self): 30 | for cate in ["actor", "movie"]: 31 | cate = cate.strip() 32 | self.cursor.execute("SELECT MAX({}_id) FROM {}".format(cate, cate)) 33 | result = self.cursor.fetchall() 34 | max_id = result[0][0] if result[0][0] != None else 0 35 | print("max_id: ", max_id) 36 | f = open("../data/{}.txt".format(cate), "w+",encoding='utf-8') 37 | for id in range(1, max_id + 1): 38 | self.cursor.execute("SELECT * FROM {} WHERE {}_id = {}".format(cate, cate, id)) 39 | item_lists = self.cursor.fetchall() 40 | # self.cursor.execute("SELECT COLUMN FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME='{}'".format(cate)) 41 | actor_column_attr = ["actor_id", "actor_bio", "actor_chName", "actor_foreName", "actor_nationality", 42 | "actor_constellation", "actor_birthPlace", "actor_birthDay", "actor_repWorks", 43 | "actor_achiem", "actor_brokerage"] 44 | movie_column_attr = ["movie_id", "movie_bio", "movie_chName", "movie_foreName", "movie_prodTime", 45 | "movie_prodCompany", "movie_director", "movie_screenwriter", "movie_genre", 46 | "movie_star", "movie_length", "movie_rekeaseTime", "movie_language", 47 | "movie_achiem"] 48 | column_attr = actor_column_attr if cate == "actor" else movie_column_attr 49 | 50 | if item_lists == None and column_attr == None: 51 | continue 52 | try: 53 | assert len(item_lists[0]) == 14 or len(item_lists[0]) == 11 54 | for i in range(1, len(item_lists[0])): 55 | if item_lists[0][i] == 'None': 56 | continue 57 | f.write(item_lists[0][i] + " " + column_attr[i] + "\n") 58 | 59 | except Exception as e: 60 | print(e) 61 | 62 | 63 | if __name__ == "__main__": 64 | connect_sql = connec_mysql() 65 | connect_sql.get_json() 66 | -------------------------------------------------------------------------------- /semantic_search/elasticsearch/utils/insert.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | ''' 3 | 将一个知识图谱中的数据导入elastic search,须提前新建index和type 4 | ''' 5 | try: 6 | import simplejson as json 7 | except: 8 | import json 9 | import sys 10 | import requests 11 | 12 | def bulk_insert(base_url, data): 13 | response = requests.post(base_url, 
headers={"Content-Type":"application/x-ndjson"}, data=data) 14 | 15 | def begin_insert_job(index_name, type_name, json_filepath, bulk_size=1000): 16 | base_url = "http://localhost:9200/" + index_name + "/" + type_name + "/_bulk" 17 | f = open(json_filepath) 18 | cnt, es_id = 0, 1 19 | data = "" 20 | for line in f: 21 | action_meta = '{"index": {"_id":"' + str(es_id) + '"}}' 22 | data = data + action_meta + "\n" + line 23 | 24 | es_id += 1 25 | cnt += 1 26 | if cnt >= bulk_size: 27 | bulk_insert(base_url, data) 28 | cnt, data = 0, "" 29 | if not (es_id % bulk_size): 30 | print(es_id) 31 | if cnt: 32 | bulk_insert(base_url, data) 33 | 34 | if __name__ == '__main__': 35 | begin_insert_job("demo", "_doc", "../data/baidu_baike.json") 36 | -------------------------------------------------------------------------------- /semantic_search/elasticsearch/utils/query_cmd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -XGET 'localhost:9200/demo/baidu_baike/_search?&pretty' -H 'Content-Type:application/json' -d' 4 | { 5 | "query":{ 6 | "bool":{ 7 | "filter":{ 8 | "term":{"subj":"朱一龙"} 9 | } 10 | } 11 | } 12 | } 13 | ' 14 | 15 | --------------------------------------------------------------------------------