├── 2048 └── 2048.py ├── .gitignore ├── crawl ├── 暨南大学新闻爬虫 │ ├── jnuxshc │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── items.cpython-35.pyc │ │ │ ├── __init__.cpython-35.pyc │ │ │ └── settings.cpython-35.pyc │ │ ├── spiders │ │ │ ├── __pycache__ │ │ │ │ ├── xzhc.cpython-35.pyc │ │ │ │ ├── __init__.cpython-35.pyc │ │ │ │ └── csv_item_exporter.cpython-35.pyc │ │ │ ├── __init__.py │ │ │ ├── csv_item_exporter.py │ │ │ └── xzhc.py │ │ ├── pipelines.py │ │ ├── items.py │ │ ├── settings.py │ │ └── middlewares.py │ ├── main.py │ ├── scrapy.cfg │ └── readme.md ├── news │ └── news_crawl │ │ ├── crawl │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── items.cpython-35.pyc │ │ │ ├── __init__.cpython-35.pyc │ │ │ ├── pipelines.cpython-35.pyc │ │ │ └── settings.cpython-35.pyc │ │ ├── spiders │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-35.pyc │ │ │ │ └── newsspider.cpython-35.pyc │ │ │ ├── __init__.py │ │ │ └── newsspider.py │ │ ├── maziclib │ │ │ ├── __pycache__ │ │ │ │ └── news_fun.cpython-35.pyc │ │ │ └── news_fun.py │ │ ├── items.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── middlewares.py │ │ ├── readme.md │ │ ├── main.py │ │ ├── scrapy.cfg │ │ └── docs │ │ ├── netease │ │ ├── 20160602 │ │ │ └── BOIMS8PF00014JB5.json │ │ ├── 20160721 │ │ │ └── BSH7V8QF00014JB6.json │ │ ├── 20180116 │ │ │ └── D897H80K0001899O.json │ │ ├── 20180119 │ │ │ ├── D8HD3PFD0001875P.json │ │ │ ├── D8HLN6QA0001875P.json │ │ │ ├── D8H1O67B0001899N.json │ │ │ ├── D8HBI8IF0001875P.json │ │ │ ├── D8HJ2GAK000187VE.json │ │ │ ├── D8HAH1VS0001875P.json │ │ │ ├── D8HIR5JP0001875P.json │ │ │ ├── D8HJ6VRF0001875O.json │ │ │ └── D8GOCKJU0001899N.json │ │ └── 20180120 │ │ │ ├── D8J1VDAJ0001875P.json │ │ │ └── D8IUD7L60001899O.json │ │ └── tencent │ │ ├── 20160418 │ │ └── 023091.json │ │ ├── 20161227 │ │ ├── 012771.json │ │ ├── 014055.json │ │ ├── 007056.json │ │ ├── 012170.json │ │ └── 011065.json │ │ ├── 20171009 │ │ └── 039986.json │ │ ├── 20171129 │ │ └── 013590.json │ │ └── 20180120 │ │ ├── 006763.json │ │ ├── 002903.json │ │ ├── 004328.json │ │ ├── 003365.json │ │ ├── 010551.json │ │ ├── 006769.json │ │ ├── 010301.json │ │ ├── 009612.json │ │ └── 004124.json ├── 简书首页爬虫 │ ├── tutotial │ │ ├── tutotial │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-35.pyc │ │ │ │ └── settings.cpython-35.pyc │ │ │ ├── spiders │ │ │ │ ├── __pycache__ │ │ │ │ │ └── __init__.cpython-35.pyc │ │ │ │ ├── __init__.py │ │ │ │ └── exampleSpider.py │ │ │ ├── pipelines.py │ │ │ ├── items.py │ │ │ ├── settings.py │ │ │ └── middlewares.py │ │ ├── scrapy.cfg │ │ └── readme.md │ └── jian.csv ├── 百度提交关键词.py ├── baidu_search.py ├── search.py └── getImage.py ├── 机器学习入门 ├── 无监督 │ ├── readme.md │ ├── cluster │ │ ├── readme.md │ │ ├── kmeans.py │ │ └── city.txt │ └── decomposition │ │ ├── readme.md │ │ └── PCA.py ├── keras │ ├── load_exist_model.py │ ├── my_model.h5 │ └── mnist.py ├── 强化学习 │ ├── readme.md │ └── Flappy Bird.py ├── readme.md ├── tensorflow │ ├── prac2.py │ └── prac1.py ├── 监督 │ ├── readme.md │ ├── 分类 │ │ ├── Bayes.py │ │ ├── KNN.py │ │ ├── DecisionTree.py │ │ └── 人体运动状态信息评级.py │ └── 回归 │ │ ├── prices.txt │ │ └── 房价预测.py ├── matplotlib使用.py ├── Numpy.py ├── label_propagation.py └── 标签传播算法(LP).py ├── python网络编程学习 ├── chapter1.py ├── chapter2.py ├── chapter3.py ├── chapter4.py ├── chapter3-2.py ├── chapter3-3.py └── chapter2 find.py ├── .idea ├── dictionaries │ └── mazic.xml ├── vcs.xml ├── misc.xml ├── modules.xml └── PycharmStudy.iml ├── grammar ├── readme.md ├── list.py ├── dictionary.py ├── set.py ├── Classes.py ├── 
Numpy │ └── Arrays.py └── liaoxuefeng.py ├── README.md ├── ACM └── cf │ ├── 672A 字符串第n个数.py │ ├── 1A 简单数学.py │ ├── 675A.py │ ├── 227A 叉积.py │ ├── 227B.py │ ├── 208A 字符串.py │ ├── 675B 填格子.py │ └── 675E DP+greedy.py ├── OS平台编程 ├── 遍历文件夹目录.py ├── 修改所有文件名字.py └── 自动调用程序.py ├── 泰迪杯尝试 ├── readability....py ├── re过滤html标签.py ├── 去除换行+空格.py ├── bbs.py ├── 1.py ├── 数据爬取(未处理).py ├── 爬取相似URL │ ├── 3.所有小URL初步信息去标签.py │ ├── 2.从相似URL中下载内容.py │ └── 从主页获得相似URL初步可执行代码.py ├── README.md ├── pyquery取全体文本.py └── 数据爬取(去标签).py ├── data structure ├── quickSort.py └── bubble sort.py ├── xslt提取网页数据.py ├── 验证码处理 ├── crack.py └── ascii.py └── Try cocos └── HelloWorld.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.h5 -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/readme.md: -------------------------------------------------------------------------------- 1 | 主目录在这里 2 | 运行请在该目录调用`python3 main.py` 3 | -------------------------------------------------------------------------------- /机器学习入门/无监督/readme.md: -------------------------------------------------------------------------------- 1 | 无监督两大主要任务 2 | - 聚类 cluster 3 | - 降维 decomposition 4 | -------------------------------------------------------------------------------- /机器学习入门/keras/load_exist_model.py: -------------------------------------------------------------------------------- 1 | from keras.models import load_model 2 | model = load_model('my_model.h5') -------------------------------------------------------------------------------- /python网络编程学习/chapter1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter1.py -------------------------------------------------------------------------------- /python网络编程学习/chapter2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter2.py -------------------------------------------------------------------------------- /python网络编程学习/chapter3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter3.py -------------------------------------------------------------------------------- /python网络编程学习/chapter4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter4.py -------------------------------------------------------------------------------- /机器学习入门/keras/my_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/机器学习入门/keras/my_model.h5 
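A hedged illustration related to the two `keras` entries above: `load_exist_model.py` can only run if `my_model.h5` already exists on disk. The sketch below shows one way such a file could be produced and then reloaded; the layer sizes and toy data are hypothetical and are not taken from `mnist.py`.

```python
# Minimal sketch (assumption: standalone Keras with HDF5/h5py available).
# The toy data below stands in for a real dataset such as MNIST.
import numpy as np
from keras.models import Sequential, load_model
from keras.layers import Dense

x_train = np.random.rand(100, 784)                 # fake flattened images
y_train = np.random.randint(0, 10, size=(100,))    # fake integer labels

model = Sequential([
    Dense(64, activation='relu', input_shape=(784,)),
    Dense(10, activation='softmax'),
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train, epochs=1, verbose=0)

model.save('my_model.h5')           # writes the HDF5 file kept in this folder
model = load_model('my_model.h5')   # the same call used by load_exist_model.py
```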
-------------------------------------------------------------------------------- /python网络编程学习/chapter3-2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter3-2.py -------------------------------------------------------------------------------- /python网络编程学习/chapter3-3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter3-3.py -------------------------------------------------------------------------------- /.idea/dictionaries/mazic.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from scrapy import cmdline 5 | cmdline.execute("scrapy crawl xzhc".split()) 6 | -------------------------------------------------------------------------------- /grammar/readme.md: -------------------------------------------------------------------------------- 1 | ### Some tutorial 2 | #### Stanford 3 | http://cs231n.github.io/python-numpy-tutorial/#python 4 | #### liao 5 | https://www.liaoxuefeng.com/ -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/__pycache__/items.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/暨南大学新闻爬虫/jnuxshc/__pycache__/items.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/暨南大学新闻爬虫/jnuxshc/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/__pycache__/settings.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/暨南大学新闻爬虫/jnuxshc/__pycache__/settings.cpython-35.pyc -------------------------------------------------------------------------------- /机器学习入门/强化学习/readme.md: -------------------------------------------------------------------------------- 1 | # 强化学习(Reinforcement) 2 | 根据环境学习不断调整,例如迷宫 3 | 4 | ### MDP(马尔科夫过程) 5 | - model-base 6 | 7 | 8 | ### 蒙特卡洛强化学习 9 | - model-free 10 | - 多次采样,取平均作为期望累计奖赏 -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/__pycache__/items.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/news/news_crawl/crawl/__pycache__/items.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/news/news_crawl/crawl/__pycache__/__init__.cpython-35.pyc 
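A hedged aside on `机器学习入门/强化学习/readme.md` above, which describes model-free Monte Carlo reinforcement learning as sampling many episodes and averaging the results to estimate the expected cumulative reward. The sketch below illustrates only that averaging step; the episode generator is a made-up stand-in, not Flappy Bird or any real environment.

```python
# Monte Carlo estimate of an expected discounted return (illustrative only).
import random

GAMMA = 0.9  # discount factor

def sample_episode_rewards():
    """Stand-in environment: one episode is just a list of random rewards."""
    return [random.random() for _ in range(10)]

def discounted_return(rewards, gamma=GAMMA):
    g = 0.0
    for r in reversed(rewards):   # accumulate from the last step backwards
        g = r + gamma * g
    return g

# Average the returns of many sampled episodes -> Monte Carlo value estimate.
returns = [discounted_return(sample_episode_rewards()) for _ in range(1000)]
print(sum(returns) / len(returns))
```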
-------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/__pycache__/pipelines.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/news/news_crawl/crawl/__pycache__/pipelines.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/__pycache__/settings.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/news/news_crawl/crawl/__pycache__/settings.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/spiders/__pycache__/xzhc.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/暨南大学新闻爬虫/jnuxshc/spiders/__pycache__/xzhc.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/spiders/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/暨南大学新闻爬虫/jnuxshc/spiders/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/简书首页爬虫/tutotial/tutotial/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/__pycache__/settings.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/简书首页爬虫/tutotial/tutotial/__pycache__/settings.cpython-35.pyc -------------------------------------------------------------------------------- /机器学习入门/无监督/cluster/readme.md: -------------------------------------------------------------------------------- 1 | 2 | ### cluster 3 | - K-means 4 | - DBSCAN 5 | - Gaussian Mixtures 6 | - Birch 7 | 8 | 9 | ```python 10 | from sklearn.cluster import KMeans 11 | ``` 12 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/spiders/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/news/news_crawl/crawl/spiders/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /grammar/list.py: -------------------------------------------------------------------------------- 1 | ## list (the same as array) 2 | nums = list(range(5)) 3 | squares = [x**2 for x in nums] 4 | even_squares = [x**2 for x in nums if x%2==0] 5 | print(squares) 6 | print(even_squares) 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 
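A short usage sketch for the `KMeans` import shown in `机器学习入门/无监督/cluster/readme.md` above; the array here is toy data, not the `city.txt` file that `kmeans.py` later in this dump loads.

```python
# Assumed: scikit-learn is installed; this mirrors the fit_predict pattern
# used elsewhere in this repository, applied to a tiny hand-made dataset.
import numpy as np
from sklearn.cluster import KMeans

X = np.array([[1, 2], [1, 4], [1, 0],
              [10, 2], [10, 4], [10, 0]])

km = KMeans(n_clusters=2)
labels = km.fit_predict(X)       # cluster index assigned to each row of X
print(labels)
print(km.cluster_centers_)       # one centroid per cluster
```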
-------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/maziclib/__pycache__/news_fun.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/news/news_crawl/crawl/maziclib/__pycache__/news_fun.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/spiders/__pycache__/newsspider.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/news/news_crawl/crawl/spiders/__pycache__/newsspider.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/spiders/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/简书首页爬虫/tutotial/tutotial/spiders/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/spiders/__pycache__/csv_item_exporter.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/暨南大学新闻爬虫/jnuxshc/spiders/__pycache__/csv_item_exporter.cpython-35.pyc -------------------------------------------------------------------------------- /机器学习入门/readme.md: -------------------------------------------------------------------------------- 1 | ### 学习视频 2 | >http://www.icourse163.org/course/BIT-1001872001 3 | >https://www.bilibili.com/video/av17204303 4 | 5 | ### numpy 6 | >http://cs231n.github.io/python-numpy-tutorial/#python 7 | 8 | ### -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/maziclib/news_fun.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | def ListCombiner(content): 5 | string = '' 6 | for e in content: 7 | string += e 8 | return string 9 | -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /机器学习入门/tensorflow/prac2.py: -------------------------------------------------------------------------------- 1 | # 创建交互式会话 2 | import tensorflow as tf 3 | 4 | sess = tf.InteractiveSession() 5 | a = tf.Variable([1.0,2.0]) # 变量数组 6 | b = tf.constant([3.0,4.0]) # 常量数组 7 | sess.run(tf.global_variables_initializer()) 8 | ans = tf.add(a,b) 9 | print(ans.eval()) -------------------------------------------------------------------------------- /机器学习入门/无监督/decomposition/readme.md: -------------------------------------------------------------------------------- 1 | 2 | ### decomposition 3 | - PCA (主成分,用鸢尾花的数据集) 4 | - FastICA 5 | - NMF (非负矩阵分解) 6 | - LDA 7 | 8 | ```python 9 | from sklearn.decomposition import PCA 10 | from sklearn.datasets import load_iris 11 | ``` 12 | 13 | ### 应用 14 | - 例如,给一个人脸图片,然后提取特征 15 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from scrapy import cmdline 5 | #cmdline.execute("scrapy crawl netease_news_spider".split()) 6 | #cmdline.execute("scrapy crawl tencent_news_spider".split()) 7 | cmdline.execute("scrapy crawl sina_news_spider".split()) 8 | -------------------------------------------------------------------------------- /crawl/百度提交关键词.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/16 19:35 4 | # @Author : mazicwong 5 | # @File : 百度提交关键词.py 6 | 7 | import requests 8 | kv = {'wd': 'python'} 9 | r = requests.get("http://www.baidu.com/s", params=kv) 10 | print(len(r.text)) 11 | -------------------------------------------------------------------------------- /机器学习入门/监督/readme.md: -------------------------------------------------------------------------------- 1 | # 监督两大主要任务 2 | 3 | ### 分类 (训练集,测试集) 4 | ##### 指标 5 | * 正确率: 针对预测结果, R=T/(T+F) 6 | * 召回率: 针对原来样本, R=T/(T+F) 7 | ##### 相关算法函数 8 | * knn 9 | * naivebayes 10 | * svm 11 | * decision tree 12 | * neural networks 13 | ##### 类别 14 | * 线性分类器 15 | * 非线性分类器 16 | 17 | 18 | ### 回归 19 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # It is some code about my Python Study 2 | ### in Python 3.5 3 | - **Grammar** 4 | - **Code about ACM** 5 | - cf (py3.5) 6 | - zoj (py2.7) 7 | - **data stucture** 8 | - bubble sort 9 | - KMP 10 | - **Spider** 11 | - **Algorithm about ML** 12 | - **Data mining** 13 | - **Machine Learning** 14 | -------------------------------------------------------------------------------- /grammar/dictionary.py: -------------------------------------------------------------------------------- 1 | 
# dictionary (the same as map) 2 | 3 | d = {'cat':'cute', 'dog':'furry'} 4 | print(d['cat']) 5 | for animal, type in d.items(): 6 | print('A %s is %s' % (animal,type)) 7 | 8 | 9 | nums = list(range(5)) 10 | even_num_to_square = {x:x**2 for x in nums if x%2==0} 11 | print(even_num_to_square) -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = jnuxshc.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = jnuxshc 12 | -------------------------------------------------------------------------------- /ACM/cf/672A 字符串第n个数.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/22 15:11 4 | # @Author : mazicwong 5 | # @File : 672A 字符串第n个数.py 6 | 7 | ''' 8 | 字符串1234.... 9 | 打印字符串的第n个数 10 | ''' 11 | k=int(input()) 12 | n='' 13 | x=1 14 | while len(n)<1000: 15 | n+=str(x) 16 | x+=1 17 | print(n[k-1]) -------------------------------------------------------------------------------- /OS平台编程/遍历文件夹目录.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding:utf-8 _*_ 3 | import os 4 | 5 | #打印所有文件的目录 6 | path = input("输入一个需要打印的路径") 7 | #os.walk 很常用,用来遍历一个目录,返回三元组 (路径,目录名,文件名) 8 | for root, dirs, files in os.walk(path): 9 | for name in files: 10 | print(os.path.join(root, name)) #os.path.join可以将路径和名字结合起来形成绝对路径 11 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = crawl.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = crawl 12 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tutotial.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tutotial 12 | -------------------------------------------------------------------------------- /grammar/set.py: -------------------------------------------------------------------------------- 1 | # set (the same as set in cpp) 2 | 3 | from math import sqrt 4 | 5 | animals = {'cat', 'dog'} 6 | print('cat' in animals) 7 | animals.add('fish') # not append in list(array) 8 | 9 | for idx,animal in enumerate(animals): 10 | print('#%d %s' % (idx,animal)) 11 | 12 | nums = {int(sqrt(x)) for x in range(30)} 13 | print(nums) -------------------------------------------------------------------------------- /机器学习入门/tensorflow/prac1.py: -------------------------------------------------------------------------------- 1 | # 做矩阵乘法 2 | import tensorflow as tf 3 | 4 | mat1 = tf.constant([[3.,3.]]) # 1*2矩阵 5 | 
mat2 = tf.constant([[2.],[2.]]) # 2*1矩阵 6 | product = tf.matmul(mat1,mat2) # 创建op执行两个矩阵的乘法 7 | 8 | sess = tf.Session() # 在Session中执行图 9 | ans = sess.run(product) # 在图中执行op操作 10 | 11 | print(ans) 12 | sess.close() -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class JnuxshcPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /OS平台编程/修改所有文件名字.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding:utf-8 _*_ 3 | import os 4 | 5 | path = input("输入一个路径") 6 | for root, dirs, files in os.walk(path): 7 | for name in files: 8 | fname, fext = os.path.splitext(name) # 用splitext分割文件名和扩展名 9 | os.rename(os.path.join(root, name), \ 10 | os.path.join(root, 'hdu ' + fname + fext)) 11 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class TutotialPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /泰迪杯尝试/readability....py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/24 23:27 4 | # @Author : mazicwong 5 | # @File : readability....py 6 | 7 | import requests 8 | from readability import Document 9 | response = requests.get('http://www.bbsmax.com/A/kmzLB4DX5G/') 10 | doc = Document(response.text) 11 | print (doc.title()) 12 | print (doc.summary()) -------------------------------------------------------------------------------- /crawl/baidu_search.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | try: 4 | kv = {'wd': 'Python'} 5 | r = requests.get("http://www.baidu.com/s", params=kv) 6 | print(r.encoding) 7 | r.raise_for_status() 8 | r.enconding = r.apparent_encoding 9 | print(r.enconding) 10 | print("length of the whole source code : %s " %len(r.text)) 11 | except: 12 | print( "there must be a wrong") 13 | -------------------------------------------------------------------------------- /data structure/quickSort.py: -------------------------------------------------------------------------------- 1 | def quicksort(arr): 2 | if len(arr)<=1: 3 | return arr 4 | pivot = arr[len(arr)//2] 5 | left = [x for x in arr if x < pivot] 6 | right = [x for x in arr if x > pivot] 7 | middle = [x for x in arr if x == pivot] 8 | return quicksort(left) + middle + quicksort(right) 9 | 10 | if __name__ == '__main__': 11 | print(quicksort([3,6,7,9,1,3,1])) -------------------------------------------------------------------------------- /机器学习入门/监督/分类/Bayes.py: 
-------------------------------------------------------------------------------- 1 | # 朴素贝叶斯: 生成学习方法 2 | # 学习联合概率分布,求后验概率分布 3 | # 参数 4 | # priors: 先验概率 5 | 6 | import numpy as np 7 | from sklearn.naive_bayes import GaussianNB #朴素bayes 8 | X = np.array([[-1,-1], [-1,-1], [-3,-2], [1,1], [2,1], [3,2]]) 9 | y = np.array([1,1,1,2,2,2]) 10 | 11 | #训练 12 | clf = GaussianNB(priors=None) #默认参数,创建分类器 13 | clf.fit(X,y) 14 | 15 | #预测 16 | print(clf.predict([[-0.8,-1]])) 17 | -------------------------------------------------------------------------------- /data structure/bubble sort.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding:utf-8 _*_ 3 | 4 | # this is a simple Python bubble sort 5 | 6 | array = [1, 2, 3, 5, 4, 6, 9, 8, 7] 7 | for i in range(len(array) - 1, 0, -1): 8 | for j in range(0, i): 9 | if array[j] > array[j + 1]: 10 | array[j], array[j + 1] = array[j + 1], array[j] 11 | # so cool compared to CPP... 12 | 13 | print(array) -------------------------------------------------------------------------------- /grammar/Classes.py: -------------------------------------------------------------------------------- 1 | class Greeter(object): 2 | # Constructor 3 | def __init__(self, name): 4 | self.name = name 5 | # Instance method (实例方法) 6 | def greet(self, loud = False): 7 | if loud: 8 | print('Hello, %s !' % self.name.upper()) 9 | else: 10 | print('Hello, %s' % self.name) 11 | 12 | g = Greeter('Fred') 13 | g.greet() 14 | g.greet(loud=True) 15 | -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy.item import Item,Field 9 | 10 | 11 | class JnuxshcItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = Field() 15 | time = Field() 16 | intro = Field() 17 | 18 | -------------------------------------------------------------------------------- /机器学习入门/监督/分类/KNN.py: -------------------------------------------------------------------------------- 1 | # KNN,取与已知点最近的k个点,看占据哪个类别的比例多 2 | # 参数 3 | # n_neighbors: K(默认5) 4 | # weights: K个点对结果的影响权重(默认平均权重uniform) 5 | # algorithm: 计算临近点方法(ball_tree,kd_tree,brute) 6 | # 7 | 8 | from sklearn.neighbors import KNeighborsClassifier 9 | from sklearn.datasets import load_iris 10 | 11 | # 训练 12 | X = [[0],[1],[2],[3]] 13 | y = [0,0,1,1] 14 | clf = KNeighborsClassifier(n_neighbors=3) # k=3 15 | clf.fit(X,y) # 学习 16 | 17 | # 使用 18 | print(clf.predict([[1.1]])) 19 | -------------------------------------------------------------------------------- /ACM/cf/1A 简单数学.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/22 0:53 4 | # @Author : mazicwong 5 | # @File : 1A 简单数学.py 6 | 7 | ''' 8 | give : n,m,a a retangle with n*m and use how many square with a*a to patch up with it 9 | (can be overlap) 10 | http://blog.csdn.net/chenguolinblog/article/details/12190689 11 | ''' 12 | 13 | myList = input().split() 14 | n=int(myList[0]) 15 | m=int(myList[1]) 16 | a=int(myList[2]) 17 | 18 | print((n//a+(n%a>0))*(m//a+(m%a>0))) -------------------------------------------------------------------------------- /grammar/Numpy/Arrays.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | a = np.array([1,2,3]) 4 | print(type(a)) # all is same type 5 | print(a.shape) # the shape of an array is a tuple of integers giving the size of the array along each dimension. 6 | print(a[0],a[1]) 7 | 8 | b = np.array([[1,2,3],[4,5,6]]) 9 | print(b.shape) 10 | 11 | # functions to create array 12 | c = np.zeros((2,2)) 13 | d = np.ones((1,2)) 14 | e = np.full((2,2), 7) 15 | f = np.eye(2) # identity matrix 16 | g = np.random.rand((2,2)) #random value -------------------------------------------------------------------------------- /ACM/cf/675A.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/22 15:16 4 | # @Author : mazicwong 5 | # @File : 675A.py 6 | ''' 7 | 给出a,b,c,求是否a加若干个c能得到b,是就输出YES,否就输出NO 8 | 解答: (b-a)%c==0 9 | ''' 10 | 11 | a, b, c = map(int, input().split(' ')) 12 | if ((a != b and c == 0) or (b > a and c < 0)): 13 | print("NO") 14 | elif ((a == b) or (b > a and c > 0 and ((b - a) % c == 0)) or (a > b and c < 0 and ((a - b) % c == 0))): 15 | print("YES") 16 | else: 17 | print("NO") 18 | -------------------------------------------------------------------------------- /xslt提取网页数据.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/12 16:39 4 | # @Author : mazicwong 5 | # @File : xslt提取网页数据.py 6 | # https://zhuanlan.zhihu.com/p/20869884 7 | 8 | ''' 9 | https://zhuanlan.zhihu.com/p/20869884 10 | lxml是python的一个库,可以迅速、灵活地处理 XML。 11 | 提取集搜客官网旧版论坛的帖子标题和回复数,把整个列表提取出来,存成xml格式 12 | ''' 13 | from urllib.request import urlopen 14 | from lxml import etree 15 | url="http://www.gooseeker.com/cn/forum/7" 16 | html = urlopen(url) 17 | doc=etree.HTML(html.read()) -------------------------------------------------------------------------------- /ACM/cf/227A 叉积.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/22 12:59 4 | # @Author : mazicwong 5 | # @File : 227A 叉积.py 6 | 7 | ''' 8 | 本题输入三个点坐标,考察叉积,若大于0则right,小于0则left,等于0则towards 9 | ''' 10 | 11 | ax,ay = map(int,input().split(' ')) 12 | bx,by = map(int,input().split(' ')) 13 | cx,cy = map(int,input().split(' ')) 14 | x1=ax-bx 15 | y1=cx-bx 16 | x2=ay-by 17 | y2=cy-by 18 | ans=x1*y2-x2*y1 19 | if ans>0: 20 | print("RIGHT") 21 | elif ans<0: 22 | print("LEFT") 23 | else: 24 | print("TOWARDS") -------------------------------------------------------------------------------- /ACM/cf/227B.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/22 15:00 4 | # @Author : mazicwong 5 | # @File : 227B.py 6 | 7 | ''' 8 | n 9 | a1,a2...an 10 | q 11 | b1,b2...bq 12 | ''' 13 | n = int(input()) 14 | mylist = input().split(' ') 15 | i = 0 16 | zid = {} 17 | for x in mylist: 18 | zid[x] = i 19 | i += 1 20 | q = int(input()) 21 | m = input().split(' ') 22 | ans1 = 0 23 | ans2 = 0 24 | for y in m: 25 | tmp = zid[y] 26 | ans1 += tmp + 1 27 | ans2 += n - tmp 28 | print(ans1, ans2) 29 | -------------------------------------------------------------------------------- /OS平台编程/自动调用程序.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding:utf-8 _*_ 3 | import sched, time 4 | 
5 | 6 | # sched的函数不超过10个,但都很好用 7 | def print_time(msg='default'): 8 | print("当前时间", time.time(), msg) 9 | 10 | 11 | # sched.scheduler() 用来创建一个调度任务 12 | s = sched.scheduler(time.time, time.sleep) 13 | print(time.time()) 14 | s.enter(5, 1, print_time, argument=('延迟5秒,优先级1',)) # 时间间隔,执行优先级,调用的函数,函数参数 15 | s.enter(3, 2, print_time, argument=('延迟3秒,优先级2',)) 16 | s.enter(3, 1, print_time, argument=('延迟3秒,优先级1',)) 17 | s.run() # 执行调度事件 18 | print(time.time()) 19 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/readme.md: -------------------------------------------------------------------------------- 1 | 无聊复习下爬虫,用scrapy爬取了简书的热门文章,后面可以继续添加内容 2 | 3 | `scrapy startproject tutotial`之后,要用到的就是进sina里面修改了 4 | 其中文件: 5 | - items.py: 修改为需要获得的数据 6 | - pipelines.py: 不管 7 | - settings.py: 设置了获取数据储存的地方,修改`robots`,`user_agent`等 8 | - middlewares.py: 9 | - spiders/: 真正爬虫代码,可以用xpath,selector等处理,记得放入item中 10 | 11 | 12 | `scrapy crawl example.py`,spiders文件夹中爬虫代码 13 | 14 | Some Problem: 15 | 1. 一开始运行完空白,看到debug中返回403,然后到settings.py里修改`user_agent`就好了 16 | 2. 然后运行完还是爬不到,在settings把robots.txt修改为False就好了 17 | 3. 第三个错误就是xpath写错的原因了,以后注意就行 18 | -------------------------------------------------------------------------------- /.idea/PycharmStudy.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /机器学习入门/监督/分类/DecisionTree.py: -------------------------------------------------------------------------------- 1 | # 决策树 2 | # 参数 3 | # criterion: gini(基尼系数)/entropy(信息增益) 4 | # max_features: 节点处分裂时,从多少个特征选择最优特征,默认使用所有特征个数 5 | 6 | from sklearn.datasets import load_iris 7 | from sklearn.tree import DecisionTreeClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | clf = DecisionTreeClassifier() #默认gini 11 | iris = load_iris() 12 | data = iris.data # 数据 13 | target = iris.target # 标签作为目标结构 14 | 15 | #训练 16 | # 10则交叉验证 17 | cross_val_score(clf, iris.data, iris.target, cv=10) 18 | clf.fit(X,y) 19 | 20 | #预测 21 | print(clf.predict(X)) -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy.item import Item,Field 9 | 10 | 11 | class TutotialItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = Field() 15 | author = Field() 16 | time = Field() 17 | ''' 18 | url = Field() 19 | readNum = Field() 20 | commentNum = Field() 21 | likeNum = Field() 22 | ''' 23 | -------------------------------------------------------------------------------- /python网络编程学习/chapter2 find.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/12 16:13 4 | # @Author : mazicwong 5 | # @File : chapter2 find.py 6 | 7 | from urllib.request import urlopen 8 | from bs4 import BeautifulSoup 9 | 10 | html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html") 11 | bs0bj = BeautifulSoup(html, "html.parser") 12 | # use findall to get a 'list' containing those only appeared in 13 | nameList = bs0bj.findAll("span", {"class": "green"}) 14 | for name 
in nameList: 15 | print(name.get_text()) 16 | -------------------------------------------------------------------------------- /ACM/cf/208A 字符串.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/21 22:44 4 | # @Author : mazicwong 5 | # @File : 208A 字符串.py 6 | # 将原字符串中的“WUB”子串去掉 7 | ''' 8 | input()输入string,如果要读一个数字的话,要用int()转为数字 int(input()) 9 | a = str.split(sss) 将原串按sss进行分割,然后存到的到子串存到一个集合当中 10 | eg: str ="a$b$c" a = str.split('$') a=[a,'',b,'',c] 11 | ''' 12 | 13 | 14 | print (input().replace('WUB', ' ')) 15 | 16 | ''' 17 | str = input() 18 | str.encode('UTF-8') 19 | a = [] 20 | a = str.split('WUB') 21 | for t in a: 22 | if t != '': 23 | print(t, end=' ')#print默认\n结尾,给换成空格就好 24 | ''' 25 | 26 | -------------------------------------------------------------------------------- /ACM/cf/675B 填格子.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/22 15:29 4 | # @Author : mazicwong 5 | # @File : 675B 填格子.py 6 | 7 | ''' 8 | 有个3*3的九宫格,每个格子能填1~n中任意的数(n由输入给出)。要求其中任意2*2的格子中4个数的和与其他各个2*2格子都相等 9 | 解法: 对中间的数进行枚举 10 | ''' 11 | 12 | 13 | def solve(): 14 | n, a, b, c, d = map(int, input().split()) 15 | ans = 0 16 | for i in range(1, n + 1): 17 | t = i + a + b 18 | if t - a - c > 0 and t - a - c <= n and t - c - d > 0 and t - c - d <= n and t - b - d > 0 and t - b - d <= n: 19 | ans += 1 20 | return ans * n 21 | 22 | 23 | print(solve()) 24 | -------------------------------------------------------------------------------- /机器学习入门/监督/回归/prices.txt: -------------------------------------------------------------------------------- 1 | 1000,168 2 | 792,184 3 | 1260,197 4 | 1262,220 5 | 1240,228 6 | 1170,248 7 | 1230,305 8 | 1255,256 9 | 1194,240 10 | 1450,230 11 | 1481,202 12 | 1475,220 13 | 1482,232 14 | 1484,460 15 | 1512,320 16 | 1680,340 17 | 1620,240 18 | 1720,368 19 | 1800,280 20 | 4400,710 21 | 4212,552 22 | 3920,580 23 | 3212,585 24 | 3151,590 25 | 3100,560 26 | 2700,285 27 | 2612,292 28 | 2705,482 29 | 2570,462 30 | 2442,352 31 | 2387,440 32 | 2292,462 33 | 2308,325 34 | 2252,298 35 | 2202,352 36 | 2157,403 37 | 2140,308 38 | 4000,795 39 | 4200,765 40 | 3900,705 41 | 3544,420 42 | 2980,402 43 | 4355,762 44 | 3150,392 45 | -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/spiders/csv_item_exporter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | #用来规定输出到csv顺序的 5 | from scrapy.conf import settings 6 | from scrapy.contrib.exporter import CsvItemExporter 7 | 8 | class MyProjectCsvItemExporter(CsvItemExporter): 9 | 10 | def __init__(self, *args, **kwargs): 11 | delimiter = settings.get('CSV_DELIMITER', ',') 12 | kwargs['delimiter'] = delimiter 13 | 14 | fields_to_export = settings.get('FIELDS_TO_EXPORT', []) 15 | if fields_to_export : 16 | kwargs['fields_to_export'] = fields_to_export 17 | 18 | super(MyProjectCsvItemExporter, self).__init__(*args, **kwargs) 19 | -------------------------------------------------------------------------------- /crawl/search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding:utf-8 _*_ 3 | import requests 4 | kv = {'wd':'Python'} 5 | q = requests.get("http://www.baidu.com/s",params = kv) 6 | 7 | q.status_code 8 | def get_URL(url): 9 | try: 10 | 
r=requests.get(url,timeout=30) 11 | print(r.encoding) 12 | r.raise_for_status() 13 | r.enconding=r.apparent_encoding 14 | return r.text[:1000] 15 | except: 16 | return "there must be a wrong" 17 | 18 | if __name__=="__main__": 19 | url="https://detail.tmall.com/item.htm?spm=a223c.8145724.1110321729.1.Qz7Kic&acm=lb-zebra-175981-1643283.1003.4.1365015&id=537259409492&scm=1003.4.lb-zebra-175981-1643283.ITEM_537259409492_1365015" 20 | print(get_URL(url)) 21 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy.item import Item,Field 9 | 10 | class NewsItem(Item): 11 | source = Field() 12 | date = Field() 13 | newsId = Field() 14 | cmtId = Field() 15 | contents = Field() 16 | comments = Field() 17 | 18 | class CrawlItem(Item): 19 | # define the fields for your item here like: 20 | # name = scrapy.Field() 21 | pass 22 | 23 | class NeteaseItem(NewsItem): 24 | boardId = Field() 25 | 26 | class TencentItem(NewsItem): 27 | pass 28 | 29 | class SinaItem(NewsItem): 30 | channelId = Field() 31 | -------------------------------------------------------------------------------- /crawl/getImage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding:utf-8 _*_ 3 | import requests 4 | import os 5 | 6 | root = "D://pics//" 7 | url = "http://imgsize.ph.126.net/?enlarge=true&imgurl=http://edu-image.nosdn.127.net/73946898DEFC4EEE8B934F5DA131B905.jpg?imageView&thumbnail=426y240&quality=100_230x130x1x95.png" 8 | path = root + url.split('/')[-1] 9 | try: 10 | if not os.path.exists(root): 11 | os.mkdir(root) 12 | if not os.path.exists(path): 13 | r = requests.get(url) 14 | # 图片是二进制格式,把图片保存为文件 15 | with open(path, 'wb') as f: 16 | f.write(r.content) 17 | f.close() 18 | print("successfully saving") 19 | else: 20 | print ("The file is already existing") 21 | except: 22 | print("a faulty operation ") -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/readme.md: -------------------------------------------------------------------------------- 1 | 2 | ### JNU学生荟萃板块爬虫 3 | *采用scrapy* 4 | URL : https://news.jnu.edu.cn/xshc/ll 5 | 6 | #### 使用方式 7 | `scrapy startproject jnuxshc`之后,要用到的就是进sina里面修改了 8 | `scrapy crawl mazic.py`,spiders中爬虫代码,这里用`main.py`来执行了 9 | *最终接口*,调用`python3 main.py`,会得到一个`jnu.csv`的文件 10 | 11 | #### 需要修改的文件: 12 | - items.py: 修改为需要获得的数据 13 | - pipelines.py: 暂时不管 14 | - settings.py: 数据存储的地方和格式,修改`robots`,`user_agent` 15 | - middlewares.py: 暂时不管 16 | - spiders/***.py: 真正爬虫代码,可以用xpath,selector等处理,记得放入item中 17 | 18 | 19 | #### Some Problem: 20 | 1. 一开始运行完空白,看到debug中返回403,然后到settings.py里修改`user_agent`就好了 21 | 2. 然后运行完还是爬不到,在settings把robots.txt修改为False就好了 22 | 3. 第三个错误就是xpath写错的原因了,以后注意就行 23 | 4. 
由于输出到csv的列是无序的,所以在spiders/中加了`csv_item_exporter.py`,在`settings.py`中添加了`FEED_EXPORTERS`和`FIELDS_TO_EXPORT` 24 | 25 | -------------------------------------------------------------------------------- /泰迪杯尝试/re过滤html标签.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/13 0:22 4 | # @Author : mazicwong 5 | # @File : re过滤html标签.py 6 | 7 | 8 | 9 | from html.parser import HTMLParser 10 | from bs4 import BeautifulSoup 11 | from urllib import request 12 | class MLStripper(HTMLParser): 13 | def __init__(self): 14 | self.reset() 15 | self.strict = False 16 | self.convert_charrefs= True 17 | self.fed = [] 18 | def handle_data(self, d): 19 | self.fed.append(d) 20 | def get_data(self): 21 | return ''.join(self.fed) 22 | 23 | def strip_tags(html): 24 | s = MLStripper() 25 | s.feed(html) 26 | return s.get_data() 27 | 28 | url = "http://x.heshuicun.com/forum.php?mod=viewthread&tid=80" 29 | html = request.urlopen(url) 30 | bsObj = BeautifulSoup(html) 31 | strip_tags(bsObj) -------------------------------------------------------------------------------- /机器学习入门/强化学习/Flappy Bird.py: -------------------------------------------------------------------------------- 1 | # Deep Q-Network 2 | # 深度强化学习进行Flappy Bird游戏的训练 3 | # tensorflow + pygame +cv2 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | from collections import deque 8 | import random 9 | import sys 10 | sys.path.append('/home/mazic/Downloads/FlappyBirdClone') 11 | # import wrapped_flappy_bird as game 12 | import cv2 13 | import pygame 14 | 15 | GAME = 'bird' 16 | ACTIONS = 2 17 | GAMMA = 0.99 18 | OBSERVE = 10000. 19 | EXPLORE = 3000000. 20 | FINAL_EPSILON = 0.0001 21 | INITIAL_EPSILON = 0.0001 22 | REPLAY_MEMORY = 50000 23 | BATCH = 32 24 | FRAME_PER_ACTION = 1 25 | 26 | mat1 = tf.constant([[3.,3.]]) # 1*2矩阵 27 | mat2 = tf.constant([[2.],[2.]]) # 2*1矩阵 28 | product = tf.matmul(mat1,mat2) # 创建op执行两个矩阵的乘法 29 | sess = tf.Session() # 在Session中执行图 30 | res = sess.run(product) # 在图中执行op操作 31 | 32 | print(res) 33 | sess.close() 34 | -------------------------------------------------------------------------------- /grammar/liaoxuefeng.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding:utf-8 _*_ 3 | 4 | print('this is', 'a learning', 'process') 5 | name = input("input your name : ") 6 | a = input("input your age : ") 7 | age = int(a) 8 | print('hello: %s , %d ' % (name, age)) # 用%中间不用逗号...it is not C 9 | 10 | # ****常用数据类型****# 11 | # list [] 数组 append(),insert(1,'ma'),pop(), 12 | classmates = ['mazic'] 13 | classmates.append('cpp') 14 | print(classmates[-1]) 15 | classmates.pop() 16 | L = list(range(100)) #共0~99 17 | L = L[3:10:2] #第三到第十个数,每两个取一次(这种切片操作可用于list,tuple,str) 18 | # tuple () 定长数组 =>就是比较安全而已 19 | 20 | for name in classmates: # for name in range(101) 21 | print(name) 22 | for i,value in enumerate(['A','B','C']): 23 | print(i,value) 24 | 25 | # dict 字典(即map),一组key+value 26 | d = {'mazic': 100, 'java': 6, 'cpp': 99} 27 | # set 一组key,但是没有重复的key #add(3),remove(4) 28 | s = set([1, 2, 3]) 29 | ss 30 | 31 | #from 库 import 函数 -------------------------------------------------------------------------------- /泰迪杯尝试/去除换行+空格.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/28 0:15 4 | # @Author : mazicwong 5 | # @File : 去除换行+空格.py 6 | 7 | ''' 8 | 
源码编码判断用chardet,取出换行和空格用strip 9 | ''' 10 | 11 | import re 12 | 13 | # s = "as, \n asdas \n \n \n \n\nasda" 14 | # print(s) 15 | # print(".............") 16 | # s = ''.join(re.split(' +', s)) 17 | # s = '\n'.join(re.split('\n+', s)) 18 | # print(s) 19 | # print ('\n'.join(re.split(' +',s))) 20 | 21 | 22 | with open(r'C:\Users\ASUS\Desktop\66out-1.txt', 'r') as file: 23 | str = file.read() 24 | str = '\n'.join(re.split(' +', str)) 25 | str = '\n'.join(re.split('\t+', str)) 26 | str = '\n'.join(re.split('\r+', str)) 27 | str = '\n'.join(re.split(' ', str)) 28 | str = '\n'.join(re.split('\n+', str)) 29 | print(str) 30 | 31 | file1 = open(r'C:\Users\ASUS\Desktop\666-1.txt', 'w') 32 | file1.write(str) 33 | file1.close() 34 | -------------------------------------------------------------------------------- /验证码处理/crack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/29 16:21 4 | # @Author : mazicwong 5 | # @File : crack.py 6 | 7 | #https://www.shiyanlou.com/courses/364 8 | 9 | from PIL import Image 10 | im = Image.open("Code.png") 11 | im = im.convert("P") #converting an “RGB” image to an 8-bit palette image 12 | print (im.histogram()) #打印颜色直方图 13 | #发现很多白点,每个点是256色,最后一个显示920,说明有920个白色像素 14 | his = im.histogram() 15 | values={} 16 | for i in range(255): 17 | values[i] = his[i] 18 | 19 | #排序得到有用的颜色,发现 211,741 这个就是我们要的验证码的红色部分了 20 | for j,k in sorted(values.items(),key=lambda x:x[1],reverse = True)[:10]: 21 | print(j,k) 22 | 23 | 24 | #构造黑白二值图片 25 | im2 = Image.new("P",im.size,255) 26 | 27 | for x in range(im.size[1]): 28 | for y in range(im.size[0]): 29 | pix = im.getpixel((y,x)) 30 | if pix == 1 or pix ==2: 31 | im2.putpixel((y,x),0) 32 | 33 | im2.show() 34 | 35 | -------------------------------------------------------------------------------- /机器学习入门/无监督/decomposition/PCA.py: -------------------------------------------------------------------------------- 1 | # 主成分分析 2 | # 矩阵的主成分即协方差矩阵对应的特征向量 3 | # 对鸢尾花数据降维(4->2) 4 | 5 | import matplotlib.pyplot as plt #可视化 6 | import numpy as np 7 | from sklearn.decomposition import PCA 8 | from sklearn.datasets import load_iris #数据集 9 | 10 | data = load_iris() 11 | y = data.target 12 | X = data.data 13 | pca = PCA(n_components=2) 14 | reduced_X = pca.fit_transform(X) 15 | 16 | red_x, red_y = [], [] 17 | blue_x, blue_y = [], [] 18 | green_x, green_y = [], [] 19 | 20 | 21 | for i in range(len(reduced_X)): 22 | if y[i] == 0: 23 | red_x.append(reduced_X[i][0]) 24 | red_y.append(reduced_X[i][1]) 25 | elif y[i] == 1: 26 | blue_x.append(reduced_X[i][0]) 27 | blue_y.append(reduced_X[i][1]) 28 | else: 29 | green_x.append(reduced_X[i][0]) 30 | green_y.append(reduced_X[i][1]) 31 | 32 | plt.scatter(red_x, red_y, c='r', marker='x') 33 | plt.scatter(blue_x, blue_y, c='b', marker='D') 34 | plt.scatter(green_x, green_y, c='g', marker='.') 35 | plt.show() 36 | -------------------------------------------------------------------------------- /机器学习入门/无监督/cluster/kmeans.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.cluster import KMeans 3 | 4 | def loadData(filepath): 5 | f = open(filepath,'r+') 6 | lines = f.readlines() 7 | retData = [] 8 | retCityName = [] 9 | for line in lines: 10 | items = line.strip().split() 11 | retCityName.append(items[0]) 12 | # retData.append([float(items[i])] for i in range(1,len(items))) 13 | retData.append([float(items[i]) for i in range(1, len(items))]) 14 | return 
retData,retCityName 15 | 16 | if __name__ == '__main__': 17 | data,cityName = loadData('city.txt') 18 | km = KMeans(n_clusters=4) # 聚类中心 19 | label = km.fit_predict(data) # 获取每一条数据的聚类标签 20 | expenses = np.sum(km.cluster_centers_, axis=1) 21 | CityCluster = [[], [], [], []] # 城市按label分成簇 22 | for i in range(len(cityName)): 23 | CityCluster[label[i]].append(cityName[i]) 24 | for i in range(len(CityCluster)): 25 | print("Expenses:%.2f" % expenses[i]) 26 | print(CityCluster[i]) -------------------------------------------------------------------------------- /Try cocos/HelloWorld.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/29 15:04 4 | # @Author : mazicwong 5 | # @File : HelloWorld.py 6 | 7 | import cocos 8 | 9 | class HelloWorld(cocos.layer.Layer): 10 | def __init__(self): 11 | super(HelloWorld,self).__init__() 12 | 13 | #新建文字标签用于显示helloworld 14 | label = cocos.text.Label('Hello,world', 15 | font_name = 'Times New Roman', 16 | font_size = 32, 17 | anchor_x='center', 18 | anchor_y='center' 19 | ) 20 | label.position = 320,240 21 | self.add(label) 22 | 23 | cocos.director.director.init() #新建一个窗口 24 | main_scene = cocos.scene.Scene(HelloWorld())#新建场景,场景里只有一个层hello_layer 25 | cocos.director.director.run(main_scene) #开始工作 26 | 27 | # class PPX(cocos.sprite.Sprite): 28 | # def __init__(self): 29 | # super(PPX,self).__init__('ppx.png') 30 | -------------------------------------------------------------------------------- /泰迪杯尝试/bbs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/12 16:55 4 | # @Author : mazicwong 5 | # @File : bbs.py 6 | # datas = file('result_sample.txt').readlines() 7 | 8 | from urllib import request 9 | from bs4 import BeautifulSoup 10 | import re 11 | 12 | url = "http://x.heshuicun.com/forum.php?mod=viewthread&tid=80" 13 | headers = { 14 | 'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36', 15 | 'Referer': r'http://x.heshuicun.com/forum.php?mod=viewthread&tid=80', 16 | } 17 | req = request.Request(url, headers=headers) 18 | page = request.urlopen(req).read() 19 | # page = page.decode('utf-8') 20 | 21 | 22 | # html = urlopen(url) 23 | # page = html.read() 24 | # bs0bj = BeautifulSoup(html, "html.parser") 25 | # print(html) 26 | # pattern = re.compile(r"^\d{4}(-\d\d){2} \d\d(:\d\d){2}") 27 | # match = pattern.match('2015-05-22 17:43:50') 28 | # mmm = re.match(r"^\d{4}(-\d\d){2} \d\d(:\d\d){2}",page) 29 | # print (match.group()) 30 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/jian.csv: -------------------------------------------------------------------------------- 1 | title,time,author 2 | 【陌生人】枫叶,2017-12-22T06:44:01+08:00,原始生命 3 | 如何有效阅读一本书?(附思维导图),2018-01-14T20:03:35+08:00,平白书 4 | 周杰伦:等你下课,勇敢追梦,2018-01-18T07:45:10+08:00,潘城王小古 5 | 请停止无效社交——移动互联网时代,如何建立自己的人脉关系,2018-01-18T08:02:33+08:00,萌薇 6 | 一张画彻底改变了我的后半生,也可能改变你的,2018-01-18T12:08:20+08:00,心蓝丫头 7 | 那些懂得有效学习的人,永远不会被社会淘汰,2018-01-18T08:19:23+08:00,Nicole林小白 8 | 社会如此不公平,教你几种面对竞争的博弈方法,2018-01-18T10:21:57+08:00,道长是名思维贩子 9 | 作为背包客,我是一个像空气一样自由的人,2018-01-18T11:09:31+08:00,有备而来的路人甲 10 | 《十二夜》:爱情所有的样子,这里都有,2018-01-11T16:33:22+08:00,南有南风 11 | 新年“剧”场|琅琊风起,吸海垂虹,2018-01-13T00:51:36+08:00,覃浠 12 | 成长不是站在起点去选择,而是在过程中去把握,2018-01-18T15:48:09+08:00,韩大爷的杂货铺 13 | 僧人与屠夫,2018-01-12T08:49:55+08:00,从心活过 
14 | 诗‖和平下的战争,2018-01-11T21:59:31+08:00,半岛雪 15 | 周杰伦《等你下课》了,能不能把青春还给我?,2018-01-18T08:51:37+08:00,衷曲无闻 16 | 初恋这件小事,2018-01-12T23:56:12+08:00,尊敬的王二 17 | 【古风】帝王的妻姐(47),2018-01-13T07:00:28+08:00,无疾不伤 18 | 1、鸡场奇迹,2018-01-18T11:20:28+08:00,修道院羔羊 19 | 小程序学习笔记2-使用weui开发小程序,2018-01-12T15:50:17+08:00,Doris_Lee 20 | 岁生之初,且听我闲扯,2018-01-04T20:20:46+08:00,一浅疏影 21 | 二十多岁的我们,拥有多少存款?,2018-01-18T07:49:35+08:00,羊达令 22 | -------------------------------------------------------------------------------- /验证码处理/ascii.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/29 17:13 4 | # @Author : mazicwong 5 | # @File : ascii.py 6 | 7 | 8 | from PIL import Image 9 | 10 | # 图片路径/名称 11 | path = "char1.png" 12 | # 字符集 13 | ascii_char = list("$@B%8&WM#*oahkbdpqwmZO0QLCJUYXzcvunxrjft/\|()1{}[]?-_+~<>i!lI;:,\"^`'. ") 14 | 15 | 16 | # RGB值转字符的函数 17 | def get_char(r, g, b, alpha=256): 18 | if alpha == 0: 19 | return ' ' 20 | length = len(ascii_char) 21 | gray = int(0.2126 * r + 0.7152 * g + 0.0722 * b) 22 | unit = (256.0 + 1) / length 23 | return ascii_char[int(gray / unit)] 24 | 25 | 26 | if __name__ == '__main__': 27 | im = Image.open(path) 28 | #WIDTH, HEIGHT = im.size 29 | WIDTH, HEIGHT = 80,80 30 | print(WIDTH, HEIGHT) 31 | im = im.resize((HEIGHT, WIDTH), Image.NEAREST) # 使用最近滤波 32 | txt = "" 33 | for h in range(HEIGHT): 34 | for w in range(WIDTH): 35 | txt += get_char(*im.getpixel((w, h))) 36 | txt += '\n' 37 | print(txt) 38 | 39 | with open("output.txt", "w") as f: 40 | f.write(txt) 41 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/spiders/exampleSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.selector import Selector 4 | from tutotial.items import TutotialItem 5 | 6 | 7 | class Example(scrapy.Spider): 8 | name = 'example' 9 | start_urls=['http://www.jianshu.com'] 10 | url = 'http://www.jianshu.com' 11 | 12 | def parse(self, response): # response即网页数据 13 | item = TutotialItem() 14 | selector = Selector(response) 15 | articles = selector.xpath('//*[@id="list-container"]/ul/li') 16 | print("huangzhiqihuangzhiqi-----") 17 | 18 | for article in articles: 19 | title = article.xpath('div/a/text()').extract() 20 | author = article.xpath('div/div[1]/div/a[1]/text()').extract() 21 | time = article.xpath('div/div[1]/div/span/@data-shared-at').extract() 22 | print('--------------------------------------------------------') 23 | print(author) 24 | 25 | item['title'] = title 26 | item['author'] = author 27 | item['time'] = time 28 | 29 | yield item 30 | 31 | -------------------------------------------------------------------------------- /泰迪杯尝试/1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 13:12 4 | # @Author : mazicwong 5 | # @File : 1.py 6 | 7 | import urllib.request 8 | 9 | 10 | def saveFile(data, cnt): 11 | path = r'E:\泰迪杯\C题样例数据\All_html\out%s.txt' % cnt 12 | f = open(path, 'wb') 13 | f.write(data) 14 | f.close() 15 | 16 | 17 | def getHtml(url, cnt): 18 | headers = { 19 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 20 | } 21 | # opener = urllib.request.build_opener() 22 | # opener.addheaders = [headers] 23 | # html = opener.open(url).read() 24 | 25 | req = 
urllib.request.Request(url=url, headers=headers) 26 | response = urllib.request.urlopen(req, timeout=2) 27 | html = response.read() 28 | # print(html) 29 | saveFile(html, cnt) 30 | 31 | 32 | def getUrl(): 33 | file = open(r"E:\泰迪杯\C题样例数据\All_html\url.txt", "r") 34 | urlList = file.readlines() 35 | cnt = 1 36 | for url in urlList: 37 | getHtml(url, cnt) 38 | cnt += 1 39 | 40 | 41 | def main(): 42 | getUrl() 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /机器学习入门/监督/回归/房价预测.py: -------------------------------------------------------------------------------- 1 | # 数据集: 面积,价格; 进行回归 2 | # 已知面积,预测房屋价格 3 | 4 | import matplotlib.pyplot as plt 5 | from sklearn import linear_model 6 | import numpy as np 7 | 8 | 9 | # 数据预处理 10 | data_x = [] 11 | data_y = [] 12 | f = open('prices.txt','r') 13 | lines = f.readlines() 14 | for line in lines: 15 | items = line.strip().split(',') 16 | # print ("%d %d " % (int(items[0]),int(items[1]))) 17 | data_x.append(int(items[0])) 18 | data_y.append(int(items[1])) 19 | # plt.scatter(data_x,data_y,c='r') 20 | # plt.plot(x, linear.predict(x), c='b') 21 | # plt.xlabel('Area') 22 | # plt.ylabel('Price') 23 | # plt.show() 24 | 25 | length = len(data_x) 26 | data_x = np.array(data_x).reshape([length,1]) # 转化为二维数组(回归函数参数需要) 27 | data_y = np.array(data_y) 28 | minx = min(data_x) 29 | maxx = max(data_x) 30 | print(minx , ' ', maxx) 31 | x = np.arange(minx,maxx).reshape([-1,1]) # 等差数列 32 | 33 | # 训练 34 | linear = linear_model.LinearRegression() 35 | linear.fit(data_x, data_y) 36 | 37 | # 回归方程系数,截距 38 | print('Coefficient:', linear.coef_, '; intercept:', linear.intercept_) 39 | 40 | plt.scatter(data_x,data_y,c='r') 41 | plt.plot(x, linear.predict(x), c='b') 42 | plt.xlabel('Area') 43 | plt.ylabel('Price') 44 | plt.show() -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import os 9 | import json 10 | import codecs 11 | 12 | class CrawlPipeline(object): 13 | def __init__(self): 14 | self.current_dir = os.getcwd() 15 | 16 | def process_item(self, item, spider): 17 | dir_path = self.current_dir + '/docs/' + item['source'] + '/' + item['date'] 18 | print(dir_path) 19 | if not os.path.exists(dir_path): 20 | os.makedirs(dir_path) 21 | 22 | news_file_path = dir_path + '/' + item['newsId'] + '.json' 23 | if os.path.exists(news_file_path) and os.path.isfile(news_file_path): 24 | print("*****************************") 25 | print(item['newsId'] + '.json exists, just skip') 26 | print("*****************************") 27 | 28 | news_file = codecs.open(news_file_path, 'w', 'utf-8') 29 | line = json.dumps(dict(item)) 30 | news_file.write(line) 31 | news_file.close() 32 | return item 33 | -------------------------------------------------------------------------------- /机器学习入门/matplotlib使用.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 10:55 4 | # @Author : mazicwong 5 | # @File : matplotlib使用.py 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | N = 5 11 | menMeans = (20, 35, 30, 35, 27) 12 | 
menStd = (2, 3, 4, 1, 2) 13 | 14 | ind = np.arange(N) # the x locations for the groups 15 | width = 0.35 # the width of the bars 16 | 17 | fig, ax = plt.subplots() 18 | rects1 = ax.bar(ind, menMeans, width, color='r', yerr=menStd) 19 | 20 | womenMeans = (25, 32, 34, 20, 25) 21 | womenStd = (3, 5, 2, 3, 3) 22 | rects2 = ax.bar(ind + width, womenMeans, width, color='y', yerr=womenStd) 23 | 24 | # add some 25 | ax.set_ylabel('Scores') 26 | ax.set_title('Scores by group and gender') 27 | ax.set_xticks(ind + width) 28 | ax.set_xticklabels(('G1', 'G2', 'G3', 'G4', 'G5')) 29 | 30 | ax.legend((rects1[0], rects2[0]), ('Men', 'Women')) 31 | 32 | 33 | def autolabel(rects): 34 | # attach some text labels 35 | for rect in rects: 36 | height = rect.get_height() 37 | ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height, '%d' % int(height), 38 | ha='center', va='bottom') 39 | 40 | 41 | autolabel(rects1) 42 | autolabel(rects2) 43 | 44 | plt.show() -------------------------------------------------------------------------------- /泰迪杯尝试/数据爬取(未处理).py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 12:44 4 | # @Author : mazicwong 5 | # @File : 数据爬取(未处理).py 6 | 7 | import urllib.request 8 | 9 | 10 | # 按顺序放入txt 11 | def saveFile(data, cnt): 12 | path = r'E:\泰迪杯\C题样例数据\All_html\out%s.txt' % cnt 13 | f = open(path, 'wb') 14 | f.write(data) 15 | f.close() 16 | 17 | 18 | # 保存爬取不了的网页下来分析 19 | def saveFail(url, cnt): 20 | path = r'E:\泰迪杯\C题样例数据\All_html\fail.txt' 21 | f = open(path, 'ab+') 22 | f.write(cnt + ' ' + url) 23 | f.close() 24 | 25 | 26 | def getHtml(url, cnt): 27 | headers = { 28 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 29 | } 30 | req = urllib.request.Request(url=url, headers=headers) 31 | try: 32 | response = urllib.request.urlopen(req, timeout=2) 33 | html = response.read() 34 | print('第%s个论坛爬取成功' % cnt) 35 | saveFile(html, cnt) 36 | except: 37 | print('sorry! 
第%s个论坛爬取失败' % cnt) 38 | saveFail(url, cnt) 39 | 40 | 41 | def getUrl(): 42 | file = open(r"E:\泰迪杯\C题样例数据\All_html\url.txt", "r") 43 | urlList = file.readlines() 44 | cnt = 1 45 | for url in urlList: 46 | getHtml(url, cnt) 47 | cnt += 1 48 | 49 | 50 | def main(): 51 | getUrl() 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /2048/2048.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding:utf-8 _*_ 3 | #教程https://www.shiyanlou.com/courses/running 4 | import curses 5 | from random import randrange,choice 6 | from collections import defaultdict 7 | 8 | 9 | ##用户行为 10 | actions = ['Up','Left','Down','Right','Restart','Exit'] 11 | #考虑到大写开启,要获得有效键值列表 12 | letter_codes=[ord(ch) for ch in 'WASDRQwasdrq'] 13 | #输入与行为进行关联 14 | actions_dict=dict(zip(letter_codes,actions*2)) 15 | 16 | ##状态机 17 | 18 | 19 | 20 | def main(stdscr): 21 | def init(): 22 | #init the game 23 | return 'Game' 24 | def not_game(state): 25 | #write down the interface of GAMEOVER/WIN 26 | #get the user's input, judge whether to restart the game or close it 27 | responses=defaultdict(lambda:state) 28 | responses['Restart'],responses['Exit']='Init','Exit' 29 | return responses[action] # 'action' is the user's key press (obtained in a later step of the tutorial) 30 | def game(): 31 | #write down the board status 32 | #get the user's input about 'action' 33 | if action=='Restart': 34 | return 'Init' 35 | if action=='Exit': 36 | return 'Exit' 37 | #if 成功移动一步 ('ying'/'shibai' below are win/lose placeholders from the tutorial) 38 | if ying: 39 | return 'Win' 40 | if shibai: 41 | return 'Gameover' 42 | return 'Game' 43 | state_actions={ 44 | 'Init':init, 45 | 'Win':lambda:not_game('Win'), 46 | 'Gameover':lambda:not_game('Gameover'), 47 | 'Game':game 48 | } 49 | state='Init' 50 | while state != 'Exit': 51 | state=state_actions[state]() 52 | -------------------------------------------------------------------------------- /机器学习入门/Numpy.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import numpy as np 3 | 4 | ### 引用mnist数据 5 | from keras.datasets import mnist 6 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 7 | X_train = X_train.reshape(X_train.shape[0], X_train.shape[1] * X_train.shape[2]) 8 | X_test = X_test.reshape(X_test.shape[0], X_test.shape[1] * X_test.shape[2]) 9 | Y_train = (numpy.arange(10) == y_train[:, None]).astype(int) # 把index转换为一个one hot的矩阵 10 | Y_test = (numpy.arange(10) == y_test[:, None]).astype(int) # Y_test.shape 11 | 12 | ### reshape函数 13 | a = np.array([1,2,3]) 14 | print(a.shape) #(3,) 15 | a = a.reshape((1,-1)) # (1,3) 16 | print(a.shape) #(1,3) 1*3矩阵 17 | 18 | a = np.array([1,2,3,4,5,6]) 19 | print(a.shape) 20 | a = a.reshape((2,-1)) 21 | print(a.shape) #(2,3) 2*3矩阵(二维数组) 22 | 23 | ### full 24 | a = np.full((3,3),0) 25 | 26 | ### eye 27 | a = np.eye(3) #单位矩阵 28 | 29 | ### random.random 30 | a = np.random.random((3,4)) 31 | 32 | ### indexing 33 | a = np.array([[1,2,3,4], 34 | [5,6,7,8], 35 | [9,10,11,12]]) 36 | a[-2:, 1:3] #array[[6,7][10,11]] 37 | 38 | ### arange 39 | np.arange(3,7) 40 | 41 | # 数学运算 42 | a = np.array([[1,2], 43 | [3,4]]) 44 | b = np.array([[5,6], 45 | [7,8]]) 46 | a+b # np.add(a,b) 47 | a*b #对应元素相乘 48 | a.dot(b) # 真正的矩阵乘法 49 | np.dot(a,b) 50 | 51 | # 常用函数 52 | np.sum(a) # 所有元素求和 53 | np.sum(a,axis=0) # 每一列求和 54 | np.sum(a,axis=1) # 每一行求和 55 | 56 | np.mean(a) # 元素和的均值 57 | np.mean(a,axis=0) # 每一列的均值 58 | 59 | np.random.uniform(3,4) # 产生[3,4]随机小数 60 | 61 | a.T #矩阵转置 
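62 | 63 | ### quick sanity check of the operations above (illustrative example only; it assumes nothing beyond the numpy import at the top of this file) 64 | m = np.array([[1,2],[3,4]]) 65 | n = np.array([[5,6],[7,8]]) 66 | print(m * n) # [[ 5 12] [21 32]] -> element-wise product 67 | print(m.dot(n)) # [[19 22] [43 50]] -> true matrix product 68 | print(np.sum(m, axis=0), np.sum(m, axis=1)) # [4 6] column sums, [3 7] row sums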
-------------------------------------------------------------------------------- /泰迪杯尝试/爬取相似URL/3.所有小URL初步信息去标签.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/4/2 0:56 4 | # @Author : mazicwong 5 | # @File : 3.所有小URL初步信息去标签.py 6 | 7 | import re 8 | import os 9 | 10 | for i in range(0, 180): #180个大URL 11 | if os.path.exists(r"E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\%s" % i): #已经有爬取结果的 12 | for cnt in range(0, 30): #对爬取好的相似URL选取不大于30个html代码来去标签 13 | if os.path.isfile(r"E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\%s\%s.txt" % (i, cnt)): 14 | if not os.path.exists("E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\去标签后的\%s" % i): # 创建一个文件夹 (use i here so it matches the output path below) 15 | os.makedirs("E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\去标签后的\%s" % i) 16 | with open(r'E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\%s\%s.txt' % (i, cnt), 'r') as file: 17 | str = file.read() 18 | str = '\n'.join(re.split(' +', str)) 19 | str = '\n'.join(re.split('\t+', str)) 20 | str = '\n'.join(re.split('\r+', str)) 21 | str = '\n'.join(re.split(' ', str)) 22 | str = '\n'.join(re.split('\n+', str)) 23 | print(str) 24 | with open(r"E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\去标签后的\%s\%s去标签.txt" % (i, cnt),"wb") as file1: # 一般用双引号,单引号会出问题 25 | file1.write(str.encode('utf-8')) # the file is opened in "wb" mode, so encode the text to bytes before writing 26 | # with open(r"E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\去标签后的\%s\%s_去标签.txt" % (i, cnt), "w") as file1: 27 | # file1.write(str) 28 | 29 | -------------------------------------------------------------------------------- /泰迪杯尝试/爬取相似URL/2.从相似URL中下载内容.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/4/2 0:09 4 | # @Author : mazicwong 5 | # @File : 2.从相似URL中下载内容.py 6 | 7 | import os 8 | import urllib.request 9 | 10 | 11 | def getHtml(url): 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 14 | } 15 | try: 16 | req = urllib.request.Request(url=url, headers=headers) 17 | response = urllib.request.urlopen(req, timeout=2) 18 | html = response.read() 19 | return html 20 | except: 21 | print("there must be something wrong when crawling") 22 | return b'' # return empty bytes so the caller's f.write() never receives None 23 | 24 | def main(): 25 | for cnt in range(1, 171): 26 | with open("E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\out%s.txt" % cnt, "r") as file: 27 | List = file.readlines() 28 | if len(List) != 0: 29 | if not os.path.exists("E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\%s" % cnt): # 创建一个文件夹 30 | os.makedirs("E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\%s" % cnt) 31 | for i in range(0, len(List)): 32 | if i > 20: 33 | break 34 | with open(r"E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\%s\%s.txt" % (cnt, i), "wb") as f: 35 | f.write(getHtml(List[i].strip())) # strip the trailing newline left by readlines() before requesting the URL 36 | print("第%s个小的url处理成功" % i) 37 | print("第%s个URL处理成功" % cnt) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/spiders/xzhc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import scrapy 5 | from scrapy.selector import Selector 6 | from jnuxshc.items import JnuxshcItem 7 | from scrapy.http import Request 8 | 9 | 10 | class xzhc(scrapy.Spider): 11 | name = 'xzhc' #最后要调用的名字 12 | start_urls=['https://news.jnu.edu.cn/xshc/ll/'] 13 | url = 'https://jnu.edu.cn' 14 | 15 | def parse(self, response): # response即网页数据 16 | item = JnuxshcItem() 17 | selector = Selector(response) 18 | articles = 
selector.xpath('//*[@id="content"]/div[1]/ul/li') 19 | print("huangzhiqihuangzhiqi-----") 20 | 21 | for article in articles: 22 | #if article.xpath('@class/text()').extract() 23 | title = article.xpath('div[2]/div[1]/a/text()').extract() 24 | time = article.xpath('div[2]/div[3]/text()').extract() 25 | intro = article.xpath('div[2]/div[2]/text()').extract() 26 | print('--------------------------------------------------------') 27 | print(title) 28 | 29 | item['title'] = title 30 | item['time'] = time 31 | item['intro'] = intro 32 | 33 | yield item 34 | 35 | #因为有很多页,所以要递归调用 36 | tmp_url = 'https://news.jnu.edu.cn/' 37 | next_link = selector.xpath('//*[@class="pager"]/a[@class="next"]/@href').extract() 38 | if next_link[0] != '/xshc/ll/List_1.html': 39 | next_link = tmp_url+next_link[0] 40 | yield Request(next_link,callback=self.parse) #回调函数为self.parse 41 | 42 | -------------------------------------------------------------------------------- /泰迪杯尝试/README.md: -------------------------------------------------------------------------------- 1 | # explanation about 2 | ### in Python 3.5 3 | - **数据爬取 未去标签** 4 | - saveFile 5 | - saveFail 6 | - getHtml 7 | - getUrl 8 | 9 | - **数据爬取 去除标签** 10 | - replaceCharEntity 11 | - repalce 12 | - saveFile 13 | - get_localfile 14 | 15 | - **爬取相邻url用于去重** 16 | - 考虑添加功能=>判断html总长与原来文本进行对比,避免爬到死链 17 | - 长度相差大于70%? 18 | - getHtml 19 | - RETURN True/False AND url_data 20 | - getSimilarHtml 21 | - FIND the root_url AND get other url among it AND compare it with the previous one 22 | 23 | 24 | Get the similar URL 25 | - 如何主页爬取到相似URL? 26 | 1. 爬取主页所有url,然后进行遍历,用随机数(may be it can accelerate the proceed..who knows..) 27 | 2. DFS遍历,但是最多深入到三层 28 | 3. 判断方法:在当前url对html进行匹配,看看有没有最初的url, 29 | 有的话就找到标签,然后用bs4的find("",xx.next_siblings)找到兄弟标签, 30 | 接着获取url进行判断,就用正则匹配下是否两个url只有数字不同 31 | 32 | - A new method? 33 | 1. try guessing the regular expression of the existing URL, 34 | and then get the root_html from the root_url,so that I can match what I want, 35 | which means its format are familiar with the exist one, 36 | from the html source I have already had. 37 | - 最终实现方式 38 | 1. 根据已有的URL获得主页的html 39 | 2. 然后由URL推导出相同格式的正则表达式 40 | 3. 在主页的html中匹配我的正则表达式,获得相似URL 41 | - 几个坑 42 | 1. 反向推导正则的时候,因为最终是得到string类型的pattern, 43 | 所以要用p1 = p1.encode(encoding="utf-8")转换为bytes类型, 44 | 2. 在推导正则时,如果用p1=r'http://www.baidu.com/\d\d[a-z]', 45 | 接下来在做编码的时候,\d会变成\\d,且由于加了r取消掉转义字符, 46 | 会导致匹配结果错误,还有一点就是最后有一个换行,用str=str[:-1]删掉,以后应该注意 47 | 3. 判断字符串的每个字符,不能用isalpha和isnum,因为全都是字符 48 | 4. 
添加功能:已经存在且不为0的文档就不重复爬取 -------------------------------------------------------------------------------- /泰迪杯尝试/pyquery取全体文本.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 16:38 4 | # @Author : mazicwong 5 | # @File : pyquery取全体文本.py 6 | 7 | from pyquery import PyQuery 8 | import urllib.request 9 | 10 | 11 | # 按顺序放入txt 12 | def saveFile(data, cnt): 13 | path = r'E:\泰迪杯\C题样例数据\All_html\out%s.txt' % cnt 14 | f = open(path, 'wb') 15 | f.write(data) 16 | f.close() 17 | # 上面三句也可以写成 18 | # with open(path,'wb') as f: 19 | # f.write(data) 20 | 21 | 22 | # 保存爬取不了的网页下来分析 23 | def saveFail(url, cnt): 24 | path = r'E:\泰迪杯\C题样例数据\All_html 去标签\fail.txt' 25 | f = open(path, 'ab+') 26 | f.write('%s %s' % cnt % url) 27 | f.close() 28 | 29 | 30 | def getHtml(url, cnt): 31 | headers = { 32 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 33 | } 34 | req = urllib.request.Request(url=url, headers=headers) 35 | try: 36 | response = urllib.request.urlopen(req, timeout=2) 37 | html = response.read() 38 | doc = PyQuery('
tototata
') # 去标签 39 | print(doc.text()) 40 | print('第%s个论坛爬取成功' % cnt) 41 | saveFile(doc, cnt) 42 | except: 43 | print('sorry! 第%s个论坛爬取失败' % cnt) 44 | saveFail(url, cnt) 45 | 46 | 47 | def getUrl(): 48 | file = open(r"E:\泰迪杯\C题样例数据\All_html 去标签\url.txt", "r") 49 | urlList = file.readlines() 50 | cnt = 1 51 | for url in urlList: 52 | getHtml(url, cnt) 53 | cnt += 1 54 | 55 | 56 | def main(): 57 | getUrl() 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20160602/BOIMS8PF00014JB5.json: -------------------------------------------------------------------------------- 1 | {"newsId": "BOIMS8PF00014JB5", "date": "20160602", "source": "netease", "comments": {"link": "http://comment.news.163.com/news3_bbs/BOIMS8PF00014JB5.html"}, "contents": {"title": ["\u7f51\u6613\u65b0\u95fb\u9891\u9053\u9996\u9875\u6539\u7248\u610f\u89c1\u53cd\u9988"], "link": "http://news.163.com/16/0602/16/BOIMS8PF00014JB5.html", "passage": "

\u5404\u4f4d\u4eb2\u7231\u7684\u8bfb\u8005\u76c6\u53cb\u4eec\uff0c\u7f51\u6613\u65b0\u95fb\u9996\u9875\u65b0\u7248\u4e8e7\u67081\u65e5\u4e0a\u7ebf\u3002\u8fd9\u6b21\u6539\u7248\u5168\u9762\u5bf9\u63a5\u79fb\u52a8\u7aef\uff0c\u4e3a\u6ee1\u8db3\u7f51\u53cb\u7684\u9605\u8bfb\u4e60\u60ef\u548c\u9700\u6c42\uff0c\u65b0\u7248\u9875\u9762\u4e0e\u79fb\u52a8\u7aef\u4fdd\u6301\u4e00\u81f4\uff0c\u4f7f\u7528\u6237\r\n\u5728\u6d4f\u89c8PC\u7aef\u9875\u9762\u65f6\uff0c\u4e5f\u80fd\u50cf\u9605\u8bfb\u79fb\u52a8\u7aef\u65b0\u95fb\u4e00\u822c\u4fbf\u6377\u9ad8\u6548\u3002\u540c\u65f6\uff0c\u6211\u4eec\u6269\u5927\u9605\u8bfb\u754c\u9762\uff0c\u4f7f\u5f97\u5927\u5c4f\u5e55\u7684\u7535\u8111\u6709\u66f4\u5bbd\u5e7f\u7684\u53ef\u89c6\u7a7a\u95f4\uff0c\u65b9\u4fbf\u5927\u5bb6\u63a5\u6536\u66f4\u591a\u7684\u4fe1\u606f\u3002\u5404\u7c7b\u7b56\r\n\u5212\u90fd\u5f52\u4e8e\u5de6\u8fb9\u680f\uff0c\u65b9\u4fbf\u5927\u5bb6\u9605\u8bfb\u7f51\u6613\u72ec\u5bb6\u539f\u521b\u3002

\n

\u5f53\u7136\uff0c\u8fd9\u53ea\u662f\u5c0f\u7f16\u4eec\u7684\u60f3\u6cd5\uff0c\u9886\u5bfc\u8bf4\u4e86\u8fd8\u8981\u95ee\u95ee\u4f60\u4eec\u600e\u4e48\u770b\u3002\u6240\u4ee5\u6211\u4eec\u5c31\u51fa\u4e86\u4e00\u4e9b\u95ee\u9898\u8ba9\u4f60\u4eec\u56de\u7b54\u3002\u8fd8\u6709\u5176\u4ed6\u60f3\u6cd5\uff0c\u6b22\u8fce\u5728\u8ddf\u8d34\u91cc\u63d0\u51fa\u54e6\uff01

"}, "cmtId": "BOIMS8PF00014JB5"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20161227/012771.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "012771", "comments": {"link": "http://coral.qq.com/1687685805"}, "date": "20161227", "contents": {"link": "https://news.qq.com/a/20161227/012771.htm", "title": ["\u7f8e\u56fd\u597d\u5fc3\u4eba\u533f\u540d\u4e3a\u65e0\u5bb6\u53ef\u5f52\u8005\u63501.5\u5428\u725b\u6392"], "passage": "\u4e2d\u65b0\u7f5112\u670827\u65e5\u7535 \u636e\u5916\u5a92\u62a5\u9053\uff0c\u7f8e\u56fd\u897f\u96c5\u56fe\u4e00\u4e2a\u6148\u5584\u56e2\u4f53\u8054\u4eca\u5e74\u5723\u8bde\u8282\u4e3a\u65e0\u5bb6\u53ef\u5f52\u8005\u51c6\u5907\u7684\u83dc\u8272\u683c\u5916\u4e30\u5bcc\uff0c\u56e0\u4e3a\u4e00\u4f4d\u533f\u540d\u5584\u5fc3\u4eba\u58eb\u6350\u4e863500\u78c5(\u7ea61589\u516c\u65a4)\u7684\u808b\u773c\u725b\u6392\uff0c\u4e3a\u65e0\u5bb6\u53ef\u5f52\u7684\u6c11\u4f17\u8d34\u5fc3\u52a0\u83dc\u3002\u636e\u62a5\u9053\uff0c\u8be5\u56e2\u4f53\u4e3b\u53a8\u8d39\u96ea(Jordan Fisher)\u63a5\u53d7\u5a92\u4f53\u8bbf\u95ee\u65f6\u8868\u793a\uff0c\u4eca\u5e74\u5723\u8bde\u8282\u6536\u5230\u6709\u4eba\u6350\u8d60\u4e86\u9ad8\u8fbe3500\u78c5\u7684\u808b\u773c\u725b\u6392(rib-eye steak)\uff0c\u201c\u6211\u77e5\u9053\u7684\u65f6\u5019\uff0c\u5413\u4e86\u4e00\u5927\u8df3\u3002\u201d\u8d39\u96ea\u8bf4\uff1a\u201c\u8fd9\u662f\u5f88\u96be\u5f97\u7684\u4e8b\u3002\u50cf\u6211\u4eec\u8fd9\u6837\u7684\u673a\u6784\uff0c\u5e76\u4e0d\u4f1a\u5e38\u5e38\u78b0\u5230\u8fd9\u6837\u7684\u72b6\u51b5\u3002\u201d\u5728\u4eca\u5e74\u5723\u8bde\u8282\u5f53\u5929\uff0c\u524d\u5f80\u897f\u96c5\u56fe\u8be5\u6148\u5584\u56e2\u4f53\u6240\u5c5e\u6551\u6d4e\u7ad9\u5403\u996d\u7684\u6e38\u6c11\uff0c\u4e0d\u7ba1\u5927\u4eba\u6216\u5c0f\u5b69\uff0c\u6bcf\u4e2a\u4eba\u90fd\u5403\u5230\u4e86\u4e00\u4efd\u808b\u773c\u725b\u6392\u3002\u76f8\u5173\u4eba\u58eb\u8868\u793a\uff0c\u6350\u8d60\u725b\u6392\u7684\u597d\u5fc3\u4eba\u8981\u6c42\u533f\u540d\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HD3PFD0001875P.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HD3PFD0001875P", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8HD3PFD0001875P.html"}, "newsId": "D8HD3PFD0001875P", "contents": {"passage": "

\"\u9ad8\u6821\u4e00\u5bbf\u820d\u8d34\u5316\u5b66\u5143\u7d20\u65b0\u6625\u5bf9\u8054

\"\u9ad8\u6821\u4e00\u5bbf\u820d\u8d34\u5316\u5b66\u5143\u7d20\u65b0\u6625\u5bf9\u8054

\n

\u3010\u9ad8\u6821\u4e00\u5bbf\u820d\u8d34\u5316\u5b66\u5143\u7d20\u65b0\u6625\u5bf9\u8054 \u7f51\u53cb\uff1a\u903c\u6b7b\u6587\u79d1\u751f\u3011\u8fd1\u65e5\uff0c@\u5357\u4eac\u6797\u4e1a\u5927\u5b66 \u7684\u5b66\u751f\u5bbf\u820d\u95e8\u53e3\u8d34\u4e86\u4e00\u526f\u7279\u522b\u7684\u5bf9\u8054\uff0c\u5185\u5bb9\u7528\u5316\u5b66\u5143\u7d20\u5468\u671f\u8868\u91cc\u7684\u5143\u7d20\u7b26\u53f7\u62fc\u6210\uff0c\u7f51\u53cb\u76f4\u547c\u770b\u4e0d\u61c2\uff01\u636e\u6089\uff0c\u521b\u4f5c\u5bf9\u8054\u7684\u662f\u8be5\u6821\u751f\u7269\u4e0e\u73af\u5883\u5b66\u9662\u7684\u5927\u4e00\u5b66\u751f\u535e\u6b63\uff0c\u5bf9\u8054\u521b\u610f\u662f\u4ed6\u548c\u9ad8\u4e2d\u540c\u5b66\u4eec\u60f3\u51fa\u6765\u7684\u3002\u53ea\u770b\u56fe1\uff0c\u4f60\u80fd\u731c\u51fa\u662f\u4ec0\u4e48\u5417\uff1f

", "link": "http://news.163.com/18/0119/16/D8HD3PFD0001875P.html", "title": ["\u9ad8\u6821\u4e00\u5bbf\u820d\u8d34\u5316\u5b66\u5143\u7d20\u65b0\u6625\u5bf9\u8054 \u7f51\u53cb:\u903c\u6b7b\u6587\u79d1\u751f"]}} -------------------------------------------------------------------------------- /机器学习入门/无监督/cluster/city.txt: -------------------------------------------------------------------------------- 1 | 北京 2959.19 730.79 749.41 513.34 467.87 1141.82 478.42 457.64 2 | 天津 2459.77 495.47 697.33 302.87 284.19 735.97 570.84 305.08 3 | 河北 1495.63 515.90 362.37 285.32 272.95 540.58 364.91 188.63 4 | 山西 1406.33 477.77 290.15 208.57 201.50 414.72 281.84 212.10 5 | 内蒙古 1303.97 524.29 254.83 192.17 249.81 463.09 287.87 192.96 6 | 辽宁 1730.84 553.90 246.91 279.81 239.18 445.20 330.24 163.86 7 | 吉林 1561.86 492.42 200.49 218.36 220.69 459.62 360.48 147.76 8 | 黑龙江 1410.11 510.71 211.88 277.11 224.65 376.82 317.61 152.85 9 | 上海 3712.31 550.74 893.37 346.93 527.00 1034.98 720.33 462.03 10 | 江苏 2207.58 449.37 572.40 211.92 302.09 585.23 429.77 252.54 11 | 浙江 2629.16 557.32 689.73 435.69 514.66 795.87 575.76 323.36 12 | 安徽 1844.78 430.29 271.28 126.33 250.56 513.18 314.00 151.39 13 | 福建 2709.46 428.11 334.12 160.77 405.14 461.67 535.13 232.29 14 | 江西 1563.78 303.65 233.81 107.90 209.70 393.99 509.39 160.12 15 | 山东 1675.75 613.32 550.71 219.79 272.59 599.43 371.62 211.84 16 | 河南 1427.65 431.79 288.55 208.14 217.00 337.76 421.31 165.32 17 | 湖北 1783.43 511.88 282.84 201.01 237.60 617.74 523.52 182.52 18 | 湖南 1942.23 512.27 401.39 206.06 321.29 697.22 492.60 226.45 19 | 广东 3055.17 353.23 564.56 356.27 811.88 873.06 1082.82 420.81 20 | 广西 2033.87 300.82 338.65 157.78 329.06 621.74 587.02 218.27 21 | 海南 2057.86 186.44 202.72 171.79 329.65 477.17 312.93 279.19 22 | 重庆 2303.29 589.99 516.21 236.55 403.92 730.05 438.41 225.80 23 | 四川 1974.28 507.76 344.79 203.21 240.24 575.10 430.36 223.46 24 | 贵州 1673.82 437.75 461.61 153.32 254.66 445.59 346.11 191.48 25 | 云南 2194.25 537.01 369.07 249.54 290.84 561.91 407.70 330.95 26 | 西藏 2646.61 839.70 204.44 209.11 379.30 371.04 269.59 389.33 27 | 陕西 1472.95 390.89 447.95 259.51 230.61 490.90 469.10 191.34 28 | 甘肃 1525.57 472.98 328.90 219.86 206.65 449.69 249.66 228.19 29 | 青海 1654.69 437.77 258.78 303.00 244.93 479.53 288.56 236.51 30 | 宁夏 1375.46 480.89 273.84 317.32 251.08 424.75 228.73 195.93 31 | 新疆 1608.82 536.05 432.46 235.82 250.28 541.30 344.85 214.40 -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HLN6QA0001875P.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HLN6QA0001875P", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8HLN6QA0001875P.html"}, "newsId": "D8HLN6QA0001875P", "contents": {"passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u6d59\u6c5f\u91d1\u534e\u4e00\u5382\u623f\u5012\u584c8\u4eba\u88ab\u538b\uff0c\u5176\u4e2d2\u4eba\u5f53\u573a\u6b7b\u4ea1\uff09\n

\"\u6d59\u6c5f\u91d1\u534e\u4e00\u5382\u623f\u5012\u584c8\u4eba\u88ab\u538b\u4e8b\u6545\u73b0\u573a

\u592e\u89c6\u65b0\u95fb\u5ba2\u6237\u7aef1\u670819\u65e5\u6d88\u606f\uff0c\u4eca\u5929\u4e2d\u534812\u70b942\u5206\u5de6\u53f3\uff0c\u91d1\u534e\u5e02\u6d88\u9632\u652f\u961f\u6c5f\u5357\u6d88\u9632\u5927\u961f\u91d1\u78d0\u8def\u6d88\u9632\u4e2d\u961f\u63a5\u5230\u62a5\u8b66\u79f0\uff1a\u91d1\u534e\u5e02\u91d1\u4e1c\u533a\u591a\u6e56\u6c40\u6751\u6709\u623f\u5c4b\u53d1\u751f\u5012\u584c\u3002\u91d1\u534e\u5e02\u6d88\u9632\u652f\u961f\u7acb\u5373\u6307\u6d3e6\u8f6630\u4f4d\u6d88\u9632\u5b98\u5175\u8d76\u5f80\u73b0\u573a\u6551\u63f4\u3002\u521d\u6b65\u4f30\u8ba1\u516b\u4eba\u88ab\u538b\uff0c\u5176\u4e2d2\u4eba\u5f53\u573a\u6b7b\u4ea1\uff0c6\u4eba\u88ab\u9001\u5f80\u533b\u9662\u6551\u6cbb\u3002\u622a\u81f3\u76ee\u524d\uff0c\u6551\u63f4\u4ecd\u5728\u8fdb\u884c\u4e2d\u3002

\n

", "link": "http://news.163.com/18/0119/19/D8HLN6QA0001875P.html", "title": ["\u6d59\u6c5f\u91d1\u534e\u4e00\u5382\u623f\u5012\u584c8\u4eba\u88ab\u538b 2\u4eba\u5f53\u573a\u6b7b\u4ea1"]}} -------------------------------------------------------------------------------- /ACM/cf/675E DP+greedy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/22 15:35 4 | # @Author : mazicwong 5 | # @File : 675E DP+greedy.py 6 | 7 | ''' 8 | 英文: buy only tickets to stations from i+1 to ai inclusive (inclusive 表示包含在这个路段内的) 9 | 10 | 题意:有一个一条直线的地铁线路。给出a数组,在每个站点i只能买到去往[i+1, a[i]]内的票。 11 | 设p(i,j)为从i到j所需要的最少票数,求对所有ij的p(i,j)的和。(1== n): 57 | dp[i] = n - i 58 | else: 59 | x = argmax(que, a[i]) 60 | dp[i] = x - i + dp[x] + n - a[i] 61 | while (len(que) > 0 and que[-1]['a'] < a[i]): 62 | que.pop() 63 | que.append({'i': i, 'a': a[i]}) 64 | return sum(dp) 65 | 66 | 67 | n = int(input()) 68 | a = map(int, input().split(' ')) 69 | print(solve(n, a)) 70 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20161227/014055.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "014055", "comments": {"link": "http://coral.qq.com/1687716811"}, "date": "20161227", "contents": {"link": "https://news.qq.com/a/20161227/014055.htm", "title": ["\u6cb3\u5357\u5c0f\u4f19\u627f\u5305\u5343\u8f86\u51fa\u79df\u8f66\u9876\u706f \u6253\u51fa\u6211\u7231\u4f60\u8868\u767d"], "passage": "\u8fd9\u8f86\u51fa\u79df\u8f66\u9876\u706f\u5c4f\u4e0a\u51fa\u73b0\u201c\u4f60\u662f\u6211\u7684\u552f\u4e00\u201d12\u670825\u65e5\uff0c\u662f\u897f\u65b9\u7684\u5723\u8bde\u8282\uff0c\u8bb8\u591a\u60c5\u4fa3\uff0c\u9009\u62e9\u5728\u8fd9\u4e00\u5929\u8868\u767d\u3002\u5f53\u5929\uff0c\u8bb0\u8005\u7684\u670b\u53cb\u5708\u88ab\u8fd9\u6837\u7684\u7167\u7247\u5237\u5c4f\u4e86\uff0c\u5185\u5bb9\u4e3a\u201c\u90ed\u00d7\u00d7\u6211\u7231\u4f60\u201d\u201c\u4f60\u662f\u6211\u7684\u552f\u4e00\u201d\u7684\u8868\u767d\uff0c\u5728\u4fe1\u9633\u7684\u51fa\u79df\u8f66\u9876\u706f\u5c4f\u4e0a\u51fa\u73b0\u3002\u8fd9\u5219\u201c\u571f\u8c6a\u5f0f\u201d\u7684\u8868\u767d\uff0c\u5f15\u8d77\u4e0d\u5c11\u8fc7\u5f80\u8def\u4eba\u7684\u5173\u6ce8\uff0c\u4e0d\u5c11\u7f51\u53cb\u8868\u793a\u770b\u5230\u4e86\u8fd9\u5219\u8868\u767d\u3002\u7f51\u53cb\u7eb7\u7eb7\u8bc4\u8bba\uff1a\u201c\u8c01\u8fd9\u4e48\u571f\u8c6a\uff0c\u627f\u5305\u4e86\u51fa\u79df\u8f66\u9876\u706f\uff1f\u201d\u201c\u8fd9\u4f4d\u53eb\u90ed\u00d7\u00d7\u7684\u59b9\u5b50\u4e5f\u592a\u5e78\u798f\u4e86\u5427\u3002\u201d\u8fd9\u4e2a\u5c0f\u4f19\u7684\u8868\u767d\u4e5f\u5f97\u5230\u7f51\u53cb\u4e00\u81f4\u795d\u798f\u3002\u6628\u65e5\u4e0b\u5348\uff0c\u8bb0\u8005\u4e86\u89e3\u5230\uff0c\u4fe1\u9633\u5e02\u51fa\u79df\u8f66\u4e0a\u7684\u9876\u706f\u5c4f\u5e7f\u544a\u90fd\u662f\u7531\u4fe1\u9633\u67d0\u5bb6\u5e7f\u544a\u516c\u53f8\u7edf\u4e00\u8fd0\u8425\uff0c\u8fd9\u4f4d\u5c0f\u4f19\u4e00\u5171\u5305\u4e861000\u591a\u8f86\u51fa\u79df\u8f66\uff0c\u4ef7\u683c\u4e0a\u5343\u5143\u3002\u201c\u5728\u516c\u53f8\u5e72\u4e86\u8fd9\u4e48\u4e45\uff0c\u7b2c\u4e00\u6b21\u89c1\u8fd9\u6837\u7684\u4e8b\u60c5\uff0c\u8fd9\u5c0f\u4f19\u7684\u60f3\u6cd5\u592a\u65b0\u9896\u4e86\uff0c\u628a\u72d7\u7cae\u6492\u904d\u4e86\u5168\u57ce\u5440\u3002\u201d\u4e00\u540d\u7684\u54e5\u544a\u8bc9\u8bb0\u8005\u3002"}} -------------------------------------------------------------------------------- 
/crawl/news/news_crawl/docs/tencent/20180120/006763.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "006763", "comments": {"link": "http://coral.qq.com/2369396685"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/006763.htm", "title": ["\u6c55\u5934\u8b66\u65b9\uff1a\u731b\u72ee\u5de5\u4e1a\u96c6\u56e2\u603b\u7ecf\u7406\u521d\u67e5\u4e3a\u610f\u5916\u5760\u4ea1 \u5c06\u8ffd\u8d23\u9020\u8c23\u8005"], "passage": "\u6c55\u5934\u5e02\u516c\u5b89\u5c40\u6f84\u6d77\u5206\u5c40\u5fae\u4fe1\u516c\u53f7\u201c\u5e73\u5b89\u6f84\u6d77\u201d2018\u5e741\u670818\u65e5\u6d88\u606f\uff1a1\u670818\u65e5\u51cc\u6668\uff0c\u6211\u5c40\u5e7f\u76ca\u6d3e\u51fa\u6240\u63a5\u5e7f\u4e1c\u731b\u72ee\u5de5\u4e1a\u96c6\u56e2\u6709\u9650\u516c\u53f8\u7ba1\u7406\u4eba\u5458\u6797\u5fb7\u8d35\u62a5\u79f0\uff1a\u5176\u516c\u53f8\u603b\u7ecf\u7406\u9648\u4e50\u5f3a\u4e8e2018\u5e741\u67088\u65e5\u4e0d\u5e78\u901d\u4e16\uff0c\u8fd1\u671f\u7f51\u7edc\u4e0a\u51fa\u73b0\u5bf9\u9648\u4e50\u5f3a\u6b7b\u56e0\u6076\u610f\u4e2d\u4f24\u7684\u5fae\u535a\u548c\u89c6\u9891\u62a5\u9053\uff0c\u5bf9\u9648\u4e50\u5f3a\u7684\u58f0\u8a89\u548c\u516c\u53f8\u6b63\u5e38\u7ecf\u8425\u9020\u6210\u4e0d\u826f\u5f71\u54cd\uff0c\u5e76\u8981\u6c42\u4e25\u60e9\u9020\u8c23\u8005\u3002\u63a5\u62a5\u540e\uff0c\u6211\u5c40\u5e7f\u76ca\u6d3e\u51fa\u6240\u8fc5\u901f\u5f00\u5c55\u8c03\u67e5\u3002\u636e\u9648\u4e50\u5f3a\u5bb6\u5c5e\u53cd\u6620\uff0c\u6839\u636e\u65b0\u52a0\u5761\u8b66\u65b9\u544a\u77e5\u7684\u521d\u6b65\u8c03\u67e5\u7ed3\u679c\uff0c\uff0c\u6b63\u5f0f\u6b7b\u4ea1\u62a5\u544a\u8981\u7b49\u8b66\u65b9\u7ed3\u6848\u540e\uff0c\u62a5\u7ecf\u6cd5\u9662\u88c1\u51b3\u540e\u624d\u6b63\u5f0f\u901a\u77e5\u4e2d\u56fd\u9a7b\u65b0\u52a0\u5761\u5927\u4f7f\u9986\u3002\u9274\u4e8e\u8fd1\u671f\u7f51\u7edc\u5a92\u4f53\u4f20\u64ad\u9648\u4e50\u5f3a\u6b7b\u56e0\u53ca\u5176\u4ed6\u4fe1\u606f\u7684\u60c5\u51b5\uff0c\u8bf7\u5e7f\u5927\u7f51\u6c11\u4e0d\u8981\u4f20\u64ad\u672a\u7ecf\u6838\u5b9e\u7684\u4fe1\u606f\uff0c\u5bf9\u4e8e\u9020\u8c23\u3001\u4f20\u8c23\u6d89\u5acc\u8fdd\u6cd5\u7684\uff0c\u516c\u5b89\u673a\u5173\u5c06\u4f9d\u6cd5\u8ffd\u7a76\u76f8\u5173\u4eba\u5458\u7684\u6cd5\u5f8b\u8d23\u4efb\u3002\u76ee\u524d\uff0c\u6709\u5173\u60c5\u51b5\u6b63\u5728\u8fdb\u4e00\u6b65\u8c03\u67e5\u4e2d\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180120/D8J1VDAJ0001875P.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D8J1VDAJ0001875P", "date": "20180120", "source": "netease", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8J1VDAJ0001875P.html"}, "contents": {"title": ["\u5927\u96fe\u9ec4\u8272\u9884\u8b66\uff018\u7701\u6709\u6d53\u96fe \u5c40\u5730\u80fd\u89c1\u5ea6\u4e0d\u8db350\u7c73"], "link": "http://news.163.com/18/0120/07/D8J1VDAJ0001875P.html", "passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u5927\u96fe\u9ec4\u8272\u9884\u8b66 \u6c5f\u82cf\u5b89\u5fbd\u6cb3\u5357\u6e56\u5317\u7b49\u5730\u90e8\u5206\u5730\u533a\u6709\u6d53\u96fe\uff09\n

\u4e2d\u56fd\u5929\u6c14\u7f51\u8baf \u4e2d\u592e\u6c14\u8c61\u53f01\u670820\u65e506\u65f6\u7ee7\u7eed\u53d1\u5e03\u5927\u96fe\u9ec4\u8272\u9884\u8b66\uff1a

\u9884\u8ba1\uff0c20\u65e5\u65e9\u6668\u81f3\u4e0a\u5348\uff0c\u5c71\u4e1c\u5317\u90e8\u548c\u5357\u90e8\u3001\u6cb3\u5357\u4e1c\u90e8\u3001\u6e56\u5317\u4e2d\u4e1c\u90e8\u3001\u6c5f\u82cf\u5927\u90e8\u3001\u5b89\u5fbd\u5927\u90e8\u3001\u6d59\u6c5f\u5317\u90e8\u3001\u91cd\u5e86\u4e2d\u90e8\u3001\u8d35\u5dde\u5317\u90e8\u548c\u4e2d\u90e8\u7b49\u5730\u6709\u5927\u96fe\uff0c\u5176\u4e2d\u6c5f\u82cf\u4e2d\u5317\u90e8\u3001\u5b89\u5fbd\u4e2d\u5317\u90e8\u3001\u6cb3\u5357\u4e1c\u90e8\u3001\u6e56\u5317\u4e2d\u90e8\u7b49\u5730\u7684\u90e8\u5206\u5730\u533a\u6709\u80fd\u89c1\u5ea6\u4f4e\u4e8e500\u7c73\u7684\u6d53\u96fe\uff0c\u5c40\u5730\u6709\u80fd\u89c1\u5ea6\u4e0d\u8db350\u7c73\u7684\u7279\u5f3a\u6d53\u96fe\u3002

\"\u5927\u96fe\u9ec4\u8272\u9884\u8b66

\n

\u9632\u5fa1\u6307\u5357\uff1a

1\u3001\u7531\u4e8e\u80fd\u89c1\u5ea6\u8f83\u4f4e\uff0c\u9a7e\u9a76\u4eba\u5458\u5e94\u63a7\u5236\u901f\u5ea6\uff0c\u786e\u4fdd\u5b89\u5168;

2\u3001\u673a\u573a\u3001\u9ad8\u901f\u516c\u8def\u3001\u8f6e\u6e21\u7801\u5934\u91c7\u53d6\u63aa\u65bd\uff0c\u4fdd\u4ea4\u901a\u5b89\u5168\u3002

"}, "cmtId": "D8J1VDAJ0001875P"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8H1O67B0001899N.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D8H1O67B0001899N", "date": "20180119", "source": "netease", "comments": {"link": "http://comment.news.163.com/news2_bbs/D8H1O67B0001899N.html"}, "contents": {"title": ["\u7537\u5b50\u5230\u7a97\u53e3\u5904\u74064\u6761\u7f5a\u5355 \u529e\u4e8b\u5458\u5904\u7406\u4e00\u534a\u8bf4\"\u4e0b\u73ed\u4e86\""], "link": "http://news.163.com/18/0119/13/D8H1O67B0001899N.html", "passage": "

\n\t\n\t

\u3010\u56db\u5f20\u7f5a\u5355\u5904\u7406\u4e24\uff0c\u529e\u4e8b\u5458\uff1a\u201c\u6211\u4e0b\u73ed\u4e86\u201d\u3011\u8fd1\u65e5\uff0c\u8d35\u5dde\u8d35\u9633\u7684\u8bb8\u5e08\u5085\u5230\u8f66\u7ba1\u6240\u529e\u7406\u8fdd\u7ae0\uff0c2\u670d\u52a1\u7a97\u53e3\u53ea\u5f001\u4e2a\u30024\u5c0f\u65f6\u540e\u8f6e\u5230\u4ed6\uff0c4\u6761\u8fdd\u7ae0\u521a\u529e2\u6761\uff0c\u529e\u4e8b\u5458\u8bf4\u201c\u6211\u8981\u4e0b\u73ed\u4e86\u201d\u3002\u5176\u95f4\uff0c\u5173\u95ed\u7684\u53e61\u4e2a\u7a97\u53e3\u5374\u4e3a\u201c\u719f\u4eba\u201d\u529e\u4e1a\u52a1\u3002

\n

"}, "cmtId": "D8H1O67B0001899N"} -------------------------------------------------------------------------------- /机器学习入门/keras/mnist.py: -------------------------------------------------------------------------------- 1 | 2 | from keras.models import Sequential 3 | from keras.layers.core import Dense,Dropout,Activation 4 | from keras.optimizers import SGD 5 | from keras.datasets import mnist 6 | import numpy 7 | import h5py # save model 8 | 9 | ''' 10 | 第一步:选择模型 11 | ''' 12 | model = Sequential() 13 | 14 | ''' 15 | 第二步:构建网络层 16 | ''' 17 | model.add(Dense(500,input_shape=(784,))) # 输入层,28*28=784 (输入维度784,输出500个特征) 18 | model.add(Activation('tanh')) # 激活函数是tanh 19 | model.add(Dropout(0.5)) # 采用50%的dropout 20 | 21 | model.add(Dense(500)) # 隐藏层节点500个 22 | model.add(Activation('tanh')) 23 | model.add(Dropout(0.5)) 24 | 25 | model.add(Dense(10)) # 输出结果是10个类别,所以维度是10 26 | model.add(Activation('softmax')) # 最后一层用softmax作为激活函数 27 | 28 | ''' 29 | 第三步:编译 30 | ''' 31 | sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True) # 优化函数,设定学习率(lr)等参数 32 | model.compile(loss='categorical_crossentropy', optimizer=sgd) #, class_mode='categorical') # 使用交叉熵作为loss函数 33 | 34 | ''' 35 | 第四步:训练 36 | .fit的一些参数 37 | batch_size:对总的样本数进行分组,每组包含的样本数量 38 | epochs :训练次数 39 | shuffle:是否把数据随机打乱之后再进行训练 40 | validation_split:拿出百分之多少用来做交叉验证 41 | verbose:屏显模式 0:不输出 1:输出进度 2:输出每次的训练结果 42 | ''' 43 | (X_train, y_train), (X_test, y_test) = mnist.load_data() # 使用Keras自带的mnist工具读取数据(第一次需要联网) 44 | # 由于mist的输入数据维度是(num, 28, 28),这里需要把后面的维度直接拼起来变成784维 45 | X_train = X_train.reshape(X_train.shape[0], X_train.shape[1] * X_train.shape[2]) 46 | X_test = X_test.reshape(X_test.shape[0], X_test.shape[1] * X_test.shape[2]) 47 | Y_train = (numpy.arange(10) == y_train[:, None]).astype(int) # 把index转换为一个one hot的矩阵 48 | Y_test = (numpy.arange(10) == y_test[:, None]).astype(int) # Y_test.shape 49 | 50 | model.fit(X_train,Y_train,batch_size=200,epochs=1,shuffle=True,verbose=1,validation_split=0.3) # loss 0.54 -> 0.22 51 | model.evaluate(X_test, Y_test, batch_size=200, verbose=1) 52 | 53 | ''' 54 | 第五步:输出 55 | ''' 56 | print("test set") 57 | scores = model.evaluate(X_test,Y_test,batch_size=200,verbose=0) 58 | print("") 59 | print("The test loss is %f" % scores) 60 | result = model.predict(X_test,batch_size=200,verbose=0) 61 | 62 | result_max = numpy.argmax(result, axis = 1) 63 | test_max = numpy.argmax(Y_test, axis = 1) 64 | 65 | result_bool = numpy.equal(result_max, test_max) 66 | true_num = numpy.sum(result_bool) 67 | print("") 68 | print("The accuracy of the model is %f" % (true_num/len(result_bool))) 69 | 70 | 71 | ''' 72 | 第六步:保存模型(可选) 73 | ''' 74 | # model.save('my_model.h5') 75 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20161227/007056.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "007056", "comments": {"link": "http://coral.qq.com/1687570251"}, "date": "20161227", "contents": {"link": "https://news.qq.com/a/20161227/007056.htm", "title": ["\u6e56\u5357\u4e00\u5c0f\u5b66\u804c\u5de5\u7325\u4eb5\u5973\u751f \u5973\u5a7f\u7cfb\u8be5\u6821\u6559\u5bfc\u4e3b\u4efb"], "passage": 
"\u6e56\u5357\u90b5\u9633\u4e00\u5c0f\u5b66\u98df\u5802\u7537\u5b50\u7325\u4eb511\u5c81\u5973\u751f\uff0c\u90b5\u9633\u5e02\u516c\u5b89\u5c40\u53cc\u6e05\u5206\u5c4026\u65e522\u65f6\u8bb8\u53d1\u5e03\u901a\u62a5\u79f0\uff0c12\u670824\u65e5\uff0c\u90b5\u9633\u5e02\u516c\u5b89\u5c40\u53cc\u6e05\u5206\u5c40\u7834\u83b7\u4e00\u8d77\u7325\u4eb5\u513f\u7ae5\u6848\uff0c\u72af\u7f6a\u5acc\u7591\u4eba\u88ab\u4f9d\u6cd5\u91c7\u53d6\u5211\u4e8b\u5f3a\u5236\u63aa\u65bd\u3002\u901a\u62a5\u79f0\uff0c\u72af\u7f6a\u5acc\u7591\u4eba\u5218\u67d0\u4eca\u5e7464\u5c81\uff0c\u5c0f\u5b66\u6587\u5316\uff0c\u65b0\u90b5\u53bf\u576a\u4e0a\u9547\u4eba\uff0c\u79df\u4f4f\u5728\u90b5\u9633\u5e02\u53cc\u6e05\u533a\u67d0\u5c0f\u5b66\u5916\u67d0\u6c11\u623f\u5185\uff0c\u7cfb\u8be5\u5c0f\u5b66\u52e4\u6742\u5de5\u300212\u670824\u65e5\u4e0b\u5348\uff0c\u8be5\u5206\u5c40\u77f3\u6865\u6d3e\u51fa\u6240\u63a5\u5230\u62a5\u8b66\uff0c\u8f96\u533a\u67d0\u5c0f\u5b66\u5185\u6709\u4eba\u6253\u67b6\u3002\u6c11\u8b66\u8fc5\u901f\u8d76\u5230\u73b0\u573a\uff0c\u5c06\u53cc\u65b9\u5e26\u56de\u516c\u5b89\u673a\u5173\u8fdb\u884c\u8c03\u67e5\u3002\u7ecf\u67e5\uff0c\u5f53\u65e5\u5973\u751f\u5bb6\u957f\u5f97\u77e5\u5f53\u4e8b\u5973\u751f\u88ab\u5218\u67d0\u591a\u6b21\u7325\u4eb5\u540e\uff0c\u4fbf\u6765\u5230\u5b66\u6821\u627e\u5176\u7406\u8bba\uff0c\u53cc\u65b9\u53d1\u751f\u4e89\u6267\uff0c\u5218\u67d0\u906d\u5973\u751f\u5bb6\u5c5e\u6bb4\u6253\u3002\u7ecf\u5ba1\u8baf\uff0c\u72af\u7f6a\u5acc\u7591\u4eba\u5218\u67d0\u5bf9\u7325\u4eb5\u8be5\u5973\u751f\u7684\u72af\u7f6a\u4e8b\u5b9e\u4f9b\u8ba4\u4e0d\u8bb3\u3002\u6f8e\u6e43\u65b0\u95fb\u6b64\u524d\u62a5\u9053\uff0c\u6d89\u6848\u7537\u5b50\u5218\u67d0\u7cfb\u90b5\u9633\u5e02\u53cc\u6e05\u533a\u4f58\u6e56\u5c0f\u5b66\u98df\u5802\u5de5\u4f5c\u4eba\u5458\uff0c\u6d89\u5acc\u7325\u4eb5\u8be5\u6821\u4e00\u540d11\u5c81\u7684\u4e94\u5e74\u7ea7\u5973\u751f\u3002\u5218\u67d0\u5728\u5b66\u6821\u5de5\u4f5c\u4e00\u5e74\u591a\u65f6\u95f4\uff0c\u5176\u5973\u5a7f\u662f\u4f58\u6e56\u5c0f\u5b66\u6559\u5bfc\u5904\u4e3b\u4efb\u3002\u4f58\u6e56\u5c0f\u5b66\u6821\u957f\u5f20\u98de\u8dc312\u670826\u65e5\u4e0b\u5348\u544a\u8bc9\u6f8e\u6e43\u65b0\u95fb\uff0c\u901a\u62a5\u79f0\uff0c\u76ee\u524d\uff0c\u6848\u4ef6\u6b63\u5728\u8fdb\u4e00\u6b65\u4fa6\u67e5\u4e2d\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/002903.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "002903", "comments": {"link": "http://coral.qq.com/2369176633"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/002903.htm", "title": ["\u4eca\u5e74\u6625\u8fd0\u56de\u7a0b\u706b\u8f66\u7968\u9996\u6b21\u6253\u6298 \u90e8\u5206\u56de\u7a0b\u7968\u6700\u4f4e8\u6298"], "passage": 
"2018\u5e74\u6625\u8fd0\u81ea2\u67081\u53f7\u5f00\u59cb\uff0c3\u670812\u53f7\u7ed3\u675f\uff0c\u517140\u5929\u3002\u4eca\u5e74\u6625\u8fd0\u671f\u95f4\uff0c\u94c1\u8def\u90e8\u95e8\u9996\u6b21\u5bf9\u90e8\u5206\u589e\u5f00\u7684\u5217\u8f66\u56de\u7a0b\u7968\u4ef7\u8bd5\u70b9\u6298\u6263\uff0c\u5728\u73b0\u884c\u7968\u4ef7\u57fa\u7840\u4e0a\u5b9e\u884c8~9\u6298\u4f18\u60e0\u3002\u7531\u4e8e\u6625\u8fd0\u5177\u6709\u5355\u65b9\u5411\u5ba2\u6d41\u7279\u70b9\uff0c\u90e8\u5206\u5217\u8f66\u53bb\u7a0b\u5ba2\u6d41\u96c6\u4e2d\u4f46\u8fd4\u7a0b\u5ba2\u6d41\u8f83\u5c11\u3002\u6b64\u6b21\u56de\u7a0b\u65b9\u5411\u90e8\u5206\u5217\u8f66\u8bd5\u70b9\u7968\u4ef7\u6253\u6298\uff0c\u4e3b\u8981\u56f4\u7ed5\u4eac\u6d25\u3001\u6caa\u676d\u3001\u5e7f\u6df13\u4e2a\u5730\u533a\u59cb\u53d1\u7ec8\u5230\u7684\u5217\u8f66\uff0c\u8282\u524d\u8282\u540e\u5206\u522b\u9009\u53d6\u4e8632\u8d9f\u5217\u8f66\u5b9e\u884c\u6253\u6298\u4f18\u60e0\u30022\u67081\u65e5\u81f32\u670815\u65e5\uff0c\u8282\u524d\u56de\u7a0b\u65b9\u5411\u6253\u6298\u768432\u8d9f\u5217\u8f66\u5206\u522b\u4e3a\uff1a\u4e1c\u5317\u3001\u6210\u6e1d\u3001\u5408\u961c\u7b49\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u4eac\u6d25\u5730\u533a\u768412\u8d9f\uff1b\u6210\u6e1d\u3001\u6cb3\u5357\u3001\u897f\u5b89\u7b49\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u6caa\u676d\u5730\u533a\u768411\u8d9f\uff1b\u6210\u6e1d\u3001\u6e56\u5357\u7b49\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u5e7f\u6df1\u5730\u533a\u76849\u8d9f\u30022\u670816\u65e5\u81f33\u670812\u65e5\uff0c\u8282\u540e\u56de\u7a0b\u65b9\u5411\u6253\u6298\u768432\u8d9f\u5217\u8f66\u5206\u522b\u4e3a\uff1a\u4eac\u6d25\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u4e1c\u5317\u3001\u6210\u6e1d\u3001\u5408\u961c\u7b49\u65b9\u541111\u8d9f\uff1b\u6caa\u676d\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u6210\u6e1d\u3001\u6cb3\u5357\u3001\u897f\u5b89\u7b49\u65b9\u541110\u8d9f\uff1b\u5e7f\u6df1\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u6210\u6e1d\u3001\u6e56\u5357\u3001\u5357\u660c\u7b49\u65b9\u541111\u8d9f\u3002\u94c1\u8def\u90e8\u95e8\u63d0\u793a\uff0c\u65c5\u5ba2\u670b\u53cb\u53ef\u901a\u8fc7\u4e2d\u56fd\u94c1\u8def\u5ba2\u6237\u670d\u52a1\u4e2d\u5fc312306\u7f51\u7ad9\u67e5\u8be2\u5177\u4f53\u6298\u6263\u8f66\u6b21\u76f8\u5173\u4fe1\u606f\uff0c\u5408\u7406\u5b89\u6392\u51fa\u884c\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180120/D8IUD7L60001899O.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D8IUD7L60001899O", "date": "20180120", "source": "netease", "comments": {"link": "http://comment.news.163.com/news2_bbs/D8IUD7L60001899O.html"}, "contents": {"title": ["\u7f8e\u53f8\u6cd5\u90e8\u5c06\u4ee5\u6b7b\u5211\u8d77\u8bc9\u7ae0\u83b9\u9896\u6848\u5acc\u72af \u5bb6\u5c5e\u8868\u793a\u6b23\u6170"], "link": "http://news.163.com/18/0120/06/D8IUD7L60001899O.html", "passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u7f8e\u56fd\u53f8\u6cd5\u90e8\u90e8\u957f\u51b3\u5b9a\u4ee5\u6b7b\u5211\u8d77\u8bc9\u6d89\u5acc\u7ed1\u67b6\u81f4\u6b7b\u7ae0\u83b9\u9896\u7684\u5acc\u72af\uff09\n

\"\u7f8e\u53f8\u6cd5\u90e8\u5c06\u4ee5\u6b7b\u5211\u8d77\u8bc9\u7ae0\u83b9\u9896\u6848\u5acc\u72af

\u3010\u7f8e\u56fd\u53f8\u6cd5\u90e8\u90e8\u957f\u51b3\u5b9a\u4ee5\u6b7b\u5211\u8d77\u8bc9\u6d89\u5acc\u7ed1\u67b6\u81f4\u6b7b\u7ae0\u83b9\u9896\u7684\u5acc\u72af\u3011\u7f8e\u56fd\u8054\u90a6\u653f\u5e9c\u4e8e\u5f53\u5730\u65f6\u95f4\u5468\u4e94\u4e0b\u5348\u53d1\u8868\u7531\u7f8e\u56fd\u53f8\u6cd5\u90e8\u90e8\u957f\u6770\u592b\u00b7\u585e\u7533\u65af\uff08Jeff Sessions\uff09\u7b7e\u7f72\u7684\u6587\u4ef6\uff0c\u51b3\u5b9a\u5bf9\u6d89\u5acc\u7ed1\u67b6\u81f4\u6b7b\u4e2d\u56fd\u8bbf\u95ee\u5b66\u8005\u7ae0\u83b9\u9896\u7684\u5acc\u72af\u5e03\u5170\u767b\u7279\u514b\u91cc\u65af\u6ed5\u68ee\uff08Brendt Christensen\uff09\u5bfb\u6c42\u6b7b\u5211\u3002\u8fd9\u4efd\u6587\u4ef6\u6307\u51fa\u6839\u636e\u5927\u966a\u5ba1\u56e22017\u5e7410\u67083\u65e5\u5bf9\u514b\u91cc\u65af\u6ed5\u68ee\u63d0\u51fa\u7684\u8ffd\u52a0\u8d77\u8bc9\u4e66\u5185\u5bb9 \u2014 \u5acc\u72af\u6545\u610f\u975e\u6cd5\u631f\u6301\u3001\u7981\u9522\u3001\u8bf1\u9a97\u3001\u7ed1\u67b6\u3001\u52ab\u6301\u7ae0\u83b9\u9896\u5e76\u6700\u7ec8\u5bfc\u81f4\u5176\u6b7b\u4ea1\uff0c \u4ee5\u6b7b\u5211\u8d77\u8bc9\u5acc\u72af\u662f\u5408\u7406\u7684\u3002\u7ae0\u83b9\u9896\u5bb6\u4eba\u7684\u4ee3\u7406\u5f8b\u5e08\u738b\u5fd7\u4e1c\u8868\u793a\uff0c\u5bb6\u4eba\u5bf9\u53f8\u6cd5\u90e8\u957f\u7684\u51b3\u5b9a\u8868\u793a\u6b23\u6170\uff0c\u611f\u8c22\u4ed6\u548c\u5f53\u5730\u68c0\u5bdf\u5b98\u8003\u8651\u5e76\u5c0a\u91cd\u5bb6\u4eba\u7684\u8bf7\u6c42\uff0c\u505a\u51fa\u4e86\u4e0e\u5bb6\u4eba\u610f\u613f\u76f8\u7b26\u7684\u51b3\u5b9a\u3002\u76ee\u524d\uff0c\u539f\u5b9a\u4e8e2\u670827\u65e5\u5f00\u5ba1\u7684\u65f6\u95f4\u4e0d\u53d8\u3002

"}, "cmtId": "D8IUD7L60001899O"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/004328.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "004328", "comments": {"link": "http://coral.qq.com/2369236201"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/004328.htm", "title": ["2018\u6625\u8282\u9ec4\u91d1\u5468\u653e\u5047\u53bb\u54ea\u73a9\uff1f\u4e09\u4e9a\u3001\u54c8\u5c14\u6ee8\u4e3a\u70ed\u95e8\u76ee\u7684\u5730"], "passage": "2018\u5e74\u7684\u6625\u8282\u4e00\u5929\u5929\u4e34\u8fd1\uff0c\u867d\u7136\u8fc7\u5e74\u56de\u5bb6\u662f\u4e2d\u56fd\u4eba\u7684\u4f20\u7edf\uff0c\u800c\u636e\u4e2d\u56fd\u65c5\u6e38\u7814\u7a76\u9662\u8c03\u67e5\u663e\u793a\uff0c\u4eca\u5e74\u6625\u8282\uff0c\u5927\u5bb6\u7684\u51fa\u6e38\u610f\u613f\u4e5f\u5f88\u5f3a\u70c8\u3002\u6570\u636e\u663e\u793a\uff0c2018\u5e74\u7b2c\u4e00\u5b63\u5ea6\u5c45\u6c11\u51fa\u6e38\u610f\u613f\u4e3a83%\uff0c\u800c\u9009\u62e9\u5728\u6625\u8282\u671f\u95f4\u51fa\u6e38\u7684\u6e38\u5ba2\u5360\u4e00\u5b63\u5ea6\u6e38\u5ba2\u768448.9%\uff0c\u7814\u5b66\u3001\u6d77\u5c9b\u6e38\u3001\u6e38\u8f6e\u6e38\u3001\u51b0\u96ea\u6e38\u3001\u4eb2\u5b50\u5bb6\u5ead\u6e38\u3001\u4e3b\u9898\u6e38\u5e02\u573a\u70ed\u5ea6\u8f83\u9ad8\u3002\u60a8\u4eca\u5e74\u6709\u4ec0\u4e48\u51fa\u6e38\u8ba1\u5212\u5417\uff1f\u6211\u53ef\u80fd\u4f1a\u53bb\u897f\u5b89\u90a3\u8fb9\uff0c\u56e0\u4e3a\u90a3\u8fb9\u53ef\u80fd\u5e74\u5473\u4f1a\u6bd4\u8f83\u91cd\u3002\u6211\u4e00\u822c\u60f3\u53bb\u4e09\u4e9a\uff0c\u56e0\u4e3a\u5317\u65b9\u7279\u522b\u51b7\uff0c\u5357\u65b9\u6bd4\u8f83\u70ed\uff0c\u6bd4\u8f83\u8212\u670d\u4e00\u70b9\u3002\u6625\u8282\u671f\u95f4\uff0c\u9009\u62e9\u56fd\u5185\u8de8\u7701\u5e02\u65c5\u6e38\u7684\u6bd4\u4f8b\u4e3a65.9%\uff0c\u56fd\u5185\u70ed\u95e8\u57ce\u5e02\u5305\u62ec\u4e09\u4e9a\u3001\u54c8\u5c14\u6ee8\u3001\u676d\u5dde\u3001\u53a6\u95e8\u7b49\uff0c\u9009\u62e9\u8fd1\u90ca\u65c5\u6e38\u7684\u6bd4\u4f8b\u4e3a34.5%\u3002\u8c03\u67e5\u663e\u793a\uff0c\u5ea6\u5047\u4f11\u95f2\u3001\u89c2\u5149\u65c5\u6e38\u548c\u63a2\u9669\u662f\u5c45\u6c11\u6625\u8282\u51fa\u6e38\u7684\u4e3b\u8981\u52a8\u673a\u3002\u517b\u751f\u548c\u8fd0\u52a8\u4e3a\u4e3b\u7684\u5065\u5eb7\u6e38\u5c06\u6210\u4e3a\u4eca\u5e74\u7684\u65b0\u5ba0\uff0c\u65c5\u6e38\u53d1\u5c55\u6b63\u5728\u5411\u4e2d\u9ad8\u7ea7\u6f14\u5316\u3002\u5728\u5168\u57df\u65c5\u6e38\u65f6\u4ee3\uff0c\u90a3\u5b9e\u9645\u4e0a\u5e7f\u5927\u7684\u6e38\u5ba2\uff0c\u8d8a\u6765\u8d8a\u591a\u7684\u6e17\u900f\u5230\u65c5\u6e38\u76ee\u7684\u5730\u7684\u751f\u6d3b\u65b9\u5f0f\u548c\u4f11\u95f2\u7a7a\u95f4\u91cc\u53bb\u4e86\u3002\u5927\u5bb6\u8d8a\u6765\u8d8a\u5f3a\u8c03\u65c5\u6e38\u7684\u54c1\u8d28\u4e86\uff0c\u4e8b\u5b9e\u4e0a\u6211\u4eec2018\u5e74\u56fd\u5bb6\u65c5\u6e38\u5de5\u4f5c\u7684\u4e3b\u9898\u5c31\u662f\u4f18\u8d28\u65c5\u6e38\u5e74\u3002"}} -------------------------------------------------------------------------------- /机器学习入门/监督/分类/人体运动状态信息评级.py: -------------------------------------------------------------------------------- 1 | # SVM 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sklearn.preprocessing import Imputer #预处理模块 7 | from sklearn.model_selection import train_test_split #生成数据模块 8 | from sklearn.metrics import classification_report #评估模块 9 | # 导入分类器模块 10 | from sklearn.neighbors import KNeighborsClassifier 11 | from sklearn.tree import DecisionTreeClassifier 12 | from sklearn.naive_bayes import 
GaussianNB 13 | 14 | # 数据处理,传入特征列表,和标签列表 15 | def load_datasets(feature_paths, label_paths): 16 | feature = np.ndarray(shape=(0,41)) # 列41,特征维度41 (想象成一个41维的列向量) 17 | label = np.ndarray(shape=(0,1)) # 列1,标签维度1 18 | for file in feature_paths: 19 | file = '~/Downloads/mooc课程数据/课程数据/分类/dataset/'+file 20 | df = pd.read_table(file, delimiter=',', na_values='?', header=None) 21 | imp = Imputer(missing_values='NaN', strategy='mean', axis=0) 22 | imp.fit(df) 23 | df = imp.transform(df) 24 | feature = np.concatenate((feature, df)) 25 | 26 | for file in label_paths: 27 | file = '~/Downloads/mooc课程数据/课程数据/分类/dataset/' + file 28 | df = pd.read_table(file, header=None) 29 | label = np.concatenate((label, df)) 30 | 31 | label = np.ravel(label) 32 | return feature, label 33 | 34 | 35 | if __name__ == '__main__': 36 | ''' 数据路径 ''' 37 | featurePaths = ['A/A.feature', 'B/B.feature', 'C/C.feature', 'D/D.feature', 'E/E.feature'] 38 | labelPaths = ['A/A.label', 'B/B.label', 'C/C.label', 'D/D.label', 'E/E.label'] 39 | ''' 读入数据 ''' 40 | x_train, y_train = load_datasets(featurePaths[:4], labelPaths[:4]) 41 | x_test, y_test = load_datasets(featurePaths[4:], labelPaths[4:]) 42 | x_train, x_, y_train, y_ = train_test_split(x_train, y_train, test_size=0.0) 43 | 44 | print('Start training knn') 45 | knn = KNeighborsClassifier().fit(x_train, y_train) 46 | print('Training done') 47 | answer_knn = knn.predict(x_test) 48 | print('Prediction done') 49 | 50 | print('Start training DT') 51 | dt = DecisionTreeClassifier().fit(x_train, y_train) 52 | print('Training done') 53 | answer_dt = dt.predict(x_test) 54 | print('Prediction done') 55 | 56 | print('Start training Bayes') 57 | gnb = GaussianNB().fit(x_train, y_train) 58 | print('Training done') 59 | answer_gnb = gnb.predict(x_test) 60 | print('Prediction done') 61 | 62 | print('\n\nThe classification report for knn:') 63 | print(classification_report(y_test, answer_knn)) 64 | print('\n\nThe classification report for DT:') 65 | print(classification_report(y_test, answer_dt)) 66 | print('\n\nThe classification report for Bayes:') 67 | print(classification_report(y_test, answer_gnb)) -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20161227/012170.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "012170", "comments": {"link": "http://coral.qq.com/1687671711"}, "date": "20161227", "contents": {"link": "https://news.qq.com/a/20161227/012170.htm", "title": ["\u7f8e\u56fd\u7537\u5b50\u6b32\u9886\u517b\u732b\u54aa \u610f\u5916\u4e0e\u8d70\u5931\u6570\u6708\u7231\u732b\u91cd\u9022"], "passage": "\u8d44\u6599\u56fe\uff1a\u732b\u54aa\u3002\u4e2d\u65b0\u7f5112\u670827\u65e5\u7535 \u636e\u5916\u5a92\u62a5\u9053\uff0c\u7f8e\u56fd\u4f5b\u7f57\u91cc\u8fbe\u5dde\u4e00\u540d\u7537\u5b50\u7684\u7231\u732b\u8d70\u5931\u6570\u6708\uff0c\u65e5\u524d\u8fd9\u540d\u7537\u5b50\u5728\u53cb\u4eba\u966a\u540c\u4e0b\u5230\u6d41\u6d6a\u52a8\u7269\u6536\u5bb9\u4e2d\u5fc3\uff0c\u6253\u7b97\u9886\u517b\u732b\u54aa\uff0c\u7ed3\u679c\u7adf\u7136\u610f\u5916\u4e0e\u7231\u732b\u56e2\u5706\u3002\u6770\u514b\u68ee\u7ef4\u5c14\u7684\u6d41\u6d6a\u52a8\u7269\u6536\u5bb9\u4e2d\u5fc3\u65e5\u524d\u5728\u793e\u4ea4\u7f51\u7ad9\u4e0a\u5206\u4eab\u540d\u53eb\u201c\u90a6\u90a6\u201d(Bon 
Bon)\u7684\u732b\u54aa\u4e0e\u4e3b\u4eba\u4e45\u522b\u91cd\u9022\u7684\u6545\u4e8b\uff0c\u83b7\u5f97\u7f51\u53cb\u70ed\u70c8\u56de\u54cd\u3002\u6536\u5bb9\u4e2d\u5fc3\u7684\u52a8\u7269\u534f\u4f1a\u8868\u793a\uff1a\u201c\u90a6\u90a6\u4ece\u4eca\u5e7410\u6708\u521d\u5c31\u5230\u6211\u4eec\u8fd9\u8fb9\u4e86\uff0c\u6211\u4eec\u4e0d\u77e5\u9053\u4e3a\u4ec0\u4e48\uff0c\u5bf9\u5b83\u6765\u8bf4\u4e00\u76f4\u5f88\u96be\u627e\u5230\u9886\u517b\u5bb6\u5ead\u3002\u539f\u6765\uff0c\u8fd9\u5f53\u4e2d\u6709\u4e2a\u975e\u5e38\u7279\u6b8a\u7684\u7406\u7531\u3002\u201d\u4e00\u540d\u5e74\u8f7b\u7537\u5b5021\u65e5\u5728\u53cb\u4eba\u966a\u540c\u4e0b\u6765\u5230\u8be5\u534f\u4f1a\uff0c\u6253\u7b97\u9886\u517b\u4e00\u53ea\u732b\u54aa\uff0c\u56e0\u4e3a\u4ed6\u5fc3\u7231\u7684\u732b\u54aa\u51e0\u4e2a\u6708\u4e4b\u524d\u8d70\u4e22\u4e86\uff0c\u4ed6\u60f3\u8981\u518d\u627e\u4e00\u53ea\u732b\u54aa\u6765\u966a\u4f34\u3002\u7ed3\u679c\uff0c\u8fd9\u540d\u7537\u5b50\u5728\u6536\u5bb9\u4e2d\u5fc3\u7684\u6240\u6709\u732b\u54aa\u5f53\u4e2d\uff0c\u53d1\u73b0\u4e86\u4e00\u53ea\u5bb3\u7f9e\u7684\u6df1\u8272\u5c0f\u732b\uff0c\u770b\u8d77\u6765\u8ddf\u8d70\u4e22\u597d\u51e0\u4e2a\u6708\u7684\u7231\u732b\u957f\u5f97\u5f88\u50cf\uff0c\u7ed3\u679c\u67e5\u8bc1\u4e4b\u4e0b\uff0c\u53d1\u73b0\u88ab\u5de5\u4f5c\u4eba\u5458\u53d6\u540d\u4e3a\u201c\u90a6\u90a6\u201d\u7684\u8fd9\u53ea\u6bcd\u732b\uff0c\u539f\u6765\u5c31\u81ea\u5df1\u517b\u7684\u201c\u5bc6\u65af\u8482\u201d\u3002\u5de5\u4f5c\u4eba\u5458\u8868\u793a\uff0c\u201c\u90a6\u90a6\u201d\u8d70\u5931\u4e0d\u4e45\uff0c\u5c31\u88ab\u70ed\u5fc3\u6c11\u4f17\u6361\u5230\uff0c\u9001\u6765\u6536\u5bb9\u4e2d\u5fc3\uff0c\u201c\u73b0\u5728\u5b83\u7ec8\u4e8e\u53ef\u4ee5\u56de\u5bb6\uff0c\u56e2\u5706\u8fc7\u8282\uff0c\u56de\u5230\u771f\u6b63\u5c5e\u4e8e\u5b83\u7684\u5bb6\u3002\u201d"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HBI8IF0001875P.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HBI8IF0001875P", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8HBI8IF0001875P.html"}, "newsId": "D8HBI8IF0001875P", "contents": {"passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u4ea4\u901a\u8fd0\u8f93\u90e8\uff1a\u201c\u6851\u5409\u201d\u8f6e\u78b0\u649e\u71c3\u7206\u4e8b\u6545\u6551\u63f4\u5de5\u4f5c\u96be\u5ea6\u9ad8 \u6ca1\u6709\u5148\u4f8b\u53ef\u5faa\uff09\n

\"\u4ea4\u901a\u90e8:\u6851\u5409\u6cb9\u8f6e\u4e8b\u6545\u6551\u63f4\u96be\u5ea6\u9ad8

\n

\u4e2d\u9752\u5728\u7ebf\u5317\u4eac1\u670819\u65e5\u7535 \u4eca\u5929\u4e0b\u5348\uff0c\u4ea4\u901a\u8fd0\u8f93\u90e8\u53ec\u5f00\u201c\u6851\u5409\u201d\u8f6e\u78b0\u649e\u71c3\u7206\u4e8b\u6545\u65b0\u95fb\u53d1\u5e03\u4f1a\u3002

\u4e2d\u56fd\u6d77\u4e0a\u641c\u6551\u4e2d\u5fc3\u526f\u4e3b\u4efb\u3001\u4ea4\u901a\u8fd0\u8f93\u90e8\u5e94\u6025\u529e\u4e3b\u4efb\u667a\u5e7f\u8def\u8868\u793a\uff0c\u8fd9\u6b21\u5e94\u6025\u6551\u63f4\u5de5\u4f5c\u96be\u5ea6\u5f88\u9ad8\uff0c\u4e16\u754c\u822a\u8fd0\u53f2\u4e0a\u5c1a\u65e0\u6cb9\u8239\u8f7d\u8fd0\u201c\u51dd\u6790\u6cb9\u201d\u88ab\u649e\u5931\u706b\u7684\u4e8b\u6545\u53d1\u751f\uff0c\u201c\u5e94\u6025\u5904\u7f6e\u65e0\u5148\u4f8b\u53ef\u5faa\u3002\u201d

2018\u5e741\u67086\u65e5\u665a\uff0c\u5df4\u62ff\u9a6c\u7c4d\u6cb9\u8239\u201c\u6851\u5409\u201d\u8f6e\u4e0e\u4e2d\u56fd\u9999\u6e2f\u7c4d\u6563\u8d27\u8239\u201c\u957f\u5cf0\u6c34\u6676\u201d\u8f6e\u5728\u957f\u6c5f\u53e3\u4ee5\u4e1c\u7ea6160\u6d77\u91cc\u5904\u53d1\u751f\u78b0\u649e\u3002\u4e8b\u6545\u5bfc\u81f4\u201c\u6851\u5409\u201d\u8f6e\u8d27\u8239\u8d77\u706b\uff0c32\u540d\u8239\u5458\u5931\u8e2a\uff0c\u201c\u957f\u5cf0\u6c34\u6676\u201d\u8f6e\u53d7\u635f\u8d77\u706b\uff0c21\u540d\u8239\u5458\u5f03\u8239\u9003\u751f\u540e\u88ab\u9644\u8fd1\u6e14\u8239\u6551\u8d77\u3002

", "link": "http://news.163.com/18/0119/16/D8HBI8IF0001875P.html", "title": ["\u4ea4\u901a\u90e8:\"\u6851\u5409\"\u6cb9\u8f6e\u4e8b\u6545\u6551\u63f4\u96be\u5ea6\u9ad8 \u6ca1\u6709\u5148\u4f8b\u53ef\u5faa"]}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HJ2GAK000187VE.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HJ2GAK000187VE", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news2_bbs/D8HJ2GAK000187VE.html"}, "newsId": "D8HJ2GAK000187VE", "contents": {"passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u53a6\u822a\u5c31\u53f0\u6e7e\u9650\u5236\u4e24\u5cb8\u6625\u8282\u52a0\u73ed\u822a\u73ed\u53d1\u58f0\uff1a\u4e25\u91cd\u5f71\u54cd\u540c\u80de\u8fd4\u4e61\u8fc7\u5e74\uff09\n

@\u53a6\u95e8\u822a\u7a7a\u5b98\u65b9\u5fae\u535a1\u670819\u65e5\u6d88\u606f\uff0c\u6625\u8282\u662f\u4e2d\u534e\u6c11\u65cf\u6700\u91cd\u8981\u7684\u4f20\u7edf\u8282\u65e5\u3002\u4e3a\u4e86\u6ee1\u8db32018\u5e74\u6625\u8282\u671f\u95f4\u4f17\u591a\u53f0\u6e7e\u540c\u80de\u8fd4\u4e61\u8fc7\u5e74\u7684\u9700\u6c42\uff0c\u53a6\u95e8\u822a\u7a7a\u6309\u7167\u60ef\u4f8b\uff0c\u7279\u522b\u8c03\u6574\u8fd0\u529b\uff0c\u7533\u8bf7\u589e\u52a070\u73ed\u4e24\u5cb8\u6625\u8282\u52a0\u73ed\u673a\uff0c\u4e3b\u8981\u5305\u62ec\u4ece\u798f\u5dde\u3001\u53a6\u95e8\u3001\u676d\u5dde\u5f80\u8fd4\u53f0\u6e7e\u7684\u822a\u73ed\uff0c\u8ba9\u53f0\u6e7e\u540c\u80de\u53ef\u4ee5\u901a\u8fc7\u6700\u4fbf\u6377\u7684\u65b9\u5f0f\u5f80\u8fd4\u4e24\u5cb8\uff0c\u6b22\u5ea6\u65b0\u6625\u4f73\u8282\u3002\u76ee\u524d\u5df2\u6709\u8d85\u8fc71\u4e07\u540d\u65c5\u5ba2\u9884\u8ba2\u76f8\u5173\u822a\u73ed\u673a\u7968\uff0c\u9884\u8ba1\u6625\u8282\u671f\u95f4\u5c06\u6709\u8d85\u8fc72\u4e07\u540d\u65c5\u5ba2\u4e58\u5750\u53a6\u822a\u4e24\u5cb8\u52a0\u73ed\u822a\u73ed\u3002

\u76ee\u524d\uff0c\u53d7\u53f0\u6e7e\u65b9\u9762\u5e72\u9884\uff0c\u53a6\u822a\u6839\u636e\u4e24\u5cb8\u5e02\u573a\u9700\u6c42\u7533\u8bf7\u768470\u73ed\u4e24\u5cb8\u6625\u8282\u52a0\u73ed\u673a\u53ef\u80fd\u65e0\u6cd5\u6267\u884c\uff0c\u8fd9\u5c06\u4e25\u91cd\u5f71\u54cd\u53f0\u6e7e\u540c\u80de\u8fd4\u4e61\u8fc7\u5e74\u4e0e\u4eb2\u4eba\u56e2\u805a\u7684\u884c\u7a0b\u5b89\u6392\u3002\u6b64\u4e3e\u7ed9\u822a\u4f01\u9020\u6210\u7684\u7ecf\u6d4e\u635f\u5931\u4e8b\u5c0f\uff0c\u7ed9\u4e24\u5cb8\u6c11\u4f17\u5f80\u6765\u5e26\u6765\u7684\u6781\u5927\u4e0d\u4fbf\u4e8b\u5927\u3002

\u53a6\u822a\u81ea\u6210\u7acb\u4ee5\u6765\u4fbf\u4ee5\u201c\u670d\u52a1\u4e24\u5cb8\u201d\u4e3a\u4f7f\u547d\uff0c\u6210\u4e3a\u4e24\u5cb8\u76f4\u822a\u7684\u53c2\u4e0e\u8005\u3001\u89c1\u8bc1\u8005\u548c\u63a8\u8fdb\u8005\uff0c\u5728\u6d77\u5ce1\u4e24\u5cb8\u4e4b\u95f4\u67b6\u8d77\u4e86\u4fbf\u6377\u7684\u7a7a\u4e2d\u6865\u6881\u3002\u5728\u6b64\u5f3a\u70c8\u547c\u5401\u53f0\u6e7e\u6709\u5173\u90e8\u95e8\u80fd\u591f\u987a\u5e94\u6c11\u610f\uff0c\u6ee1\u8db3\u6c11\u4f17\u8feb\u5207\u9700\u6c42\uff0c\u4e3a\u4f17\u591a\u53f0\u6e7e\u540c\u80de\u8fd4\u4e61\u8fc7\u5e74\u63d0\u4f9b\u4fbf\u5229\u3002

\n

", "link": "http://news.163.com/18/0119/18/D8HJ2GAK000187VE.html", "title": ["\u53a6\u822a\u56de\u5e94\u53f0\u6e7e\u9650\u5236\u4e24\u5cb8\u6625\u8282\u52a0\u73ed\u822a\u73ed:\u4e25\u91cd\u5f71\u54cd\u8fd4\u4e61"]}} -------------------------------------------------------------------------------- /泰迪杯尝试/爬取相似URL/从主页获得相似URL初步可执行代码.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 22:50 4 | # @Author : mazicwong 5 | # @File : 1.爬取相似url(最终).py 6 | 7 | import urllib.request 8 | import re 9 | import os 10 | from bs4 import BeautifulSoup 11 | 12 | 13 | # 获得主页html 14 | def get_root_html(url): 15 | # 在主页下面get新的html 16 | headers = { 17 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 18 | } 19 | req = urllib.request.Request(url=url, headers=headers) 20 | response = urllib.request.urlopen(req, timeout=2) 21 | html = response.read() 22 | return html 23 | 24 | 25 | def get_re(url): 26 | url = url[7:] # 去除http:// 27 | Len = len(url) 28 | p = "http://" 29 | i = 0 30 | while i < Len: 31 | if url[i] == '.': 32 | p += '.' 33 | elif 'a' <= url[i] <= 'z': # 不能直接判isplpha,因为str[i]中全都是字符 34 | p += '[a-z]' 35 | elif '0' <= url[i] <= 'z': 36 | p += '\d' 37 | else: 38 | p += url[i] 39 | i += 1 40 | return p 41 | 42 | 43 | # 获取该url数据,分为获取本身和相似url 44 | def main(): 45 | with open(r"E:\泰迪杯\C题样例数据\All_html 相似url\bbs_urls.txt", "r") as file: 46 | urlList = file.readlines() 47 | cnt = 1 48 | # path = r'E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果' #用来判断文件是否已经存在 49 | for url in urlList: 50 | # if os.path.isfile('out%s.txt'%cnt): #存在且不为空就退出 51 | # if os. 52 | # cnt +=1 53 | # continue 54 | 55 | # 以下:get主页url http://www.baidu.com/abc/cc ==>> www.baidu.com 56 | m = url.split('//') 57 | if len(m) == 2: 58 | root_url = m[1] 59 | else: 60 | root_url = m[0] 61 | tt = root_url.split('/') 62 | root_url = tt[0] 63 | root_url = r'http://' + root_url 64 | # getHtml(url, cnt) 65 | # print(root_url) 66 | root_html = get_root_html(root_url) # 获得主页html 67 | p1 = get_re(url) # 获取正则表达式 68 | # print(p1) 69 | # print(type(p1)) 70 | p1 = p1.encode(encoding='utf-8') # it can help transfer the "string" to "bytes" 71 | p1 = p1[:-1] #去掉换行符 72 | # print(p1) 73 | # print(type(p1)) 74 | pat = re.compile(p1) # 编译正则表达式 75 | List = re.findall(pat, root_html) 76 | print(len(List)) 77 | # for i in List: 78 | # print(i) 79 | path = r"E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\out%s.txt" % cnt 80 | with open(path, "w") as f: 81 | for i in List: 82 | i = i.decode() 83 | i = str(i) 84 | f.write(i) 85 | f.write('\n') 86 | cnt += 1 87 | 88 | 89 | if __name__ == "__main__": 90 | main() 91 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/003365.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "003365", "comments": {"link": "http://coral.qq.com/2369196525"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/003365.htm", "title": ["\u6e56\u5357\u4e00\u5973\u533b\u751f\u8fb9\u6253\u70b9\u6ef4\u8fb9\u770b\u75c5\u83b7\u8d5e\u201c\u6700\u7f8e\u201d\uff1a\u5c0f\u75c5\u90fd\u575a\u6301"], "passage": 
"\u6e56\u5357\u5b81\u4e61\u5e02\u4eba\u6c11\u533b\u9662\u6d41\u6c99\u5206\u9662\u4e00\u5973\u533b\u751f1\u670818\u65e5\u8fb9\u6253\u70b9\u6ef4\u8fb9\u770b\u75c5\uff0c\u83b7\u8d5e\u201c\u5b81\u4e61\u6700\u7f8e\u533b\u751f\u201d\u3002\u5f53\u4e8b\u533b\u751f\u7a0b\u52291\u670819\u65e5\u544a\u8bc9\u6f8e\u6e43\u65b0\u95fb\uff08www.thepaper.cn\uff09\uff0c\u201c\u50cf\u8fd9\u79cd\u5c0f\u75c5\uff0c\u53ea\u8981\u4e0d\u5f71\u54cd\u6211\u4eec\u5de5\u4f5c\uff0c\u6211\u4eec\u57fa\u672c\u90fd\u4f1a\u575a\u6301\u4e0a\u73ed\u3002\u201d\u636e\u4e86\u89e3\uff0c\u7a0b\u5229\u662f\u5b81\u4e61\u5e02\u4eba\u6c11\u533b\u9662\u6d41\u6c99\u5206\u9662\u5987\u4ea7\u79d1\u4e3b\u4efb\uff0c\u5728\u8fd9\u91cc\u5df2\u7ecf\u5de5\u4f5c\u4e86\u4e03\u5e74\u30021\u670818\u65e5\uff0c\u56e0\u8eab\u4f53\u6709\u70b9\u4e0d\u8212\u9002\uff0c\u4f46\u53c8\u8f6e\u5230\u503c\u73ed\uff0c\u4e8e\u662f\u5979\u8fb9\u6253\u70b9\u6ef4\u8fb9\u770b\u75c5\u3002\u636e\u6e56\u5357\u7ecf\u89c6\u6b64\u524d\u62a5\u9053\uff0c\u7a0b\u5229\u5de6\u624b\u6253\u7740\u70b9\u6ef4\uff0c\u53f3\u624b\u62ff\u7740\u7b14\u5199\u5b57\uff0c\u5750\u5728\u529e\u516c\u684c\u524d\u7ed9\u75c5\u4eba\u770b\u75c5\u3002\u6b64\u5916\uff0c\u7a0b\u5229\u4f1a\u7528\u53f3\u624b\u4e3e\u7740\u70b9\u6ef4\u74f6\uff0c\u7136\u540e\u5230\u75c5\u623f\u53bb\u67e5\u623f\uff0c\u8be2\u95ee\u60a3\u8005\u60c5\u51b5\u3002\u4e00\u4f4d\u60a3\u8005\u8bf4\uff1a\u201c\u5979\u4e00\u76f4\u575a\u6301\u5728\u8fd9\u8fb9\uff0c\u4e3a\u6211\u4eec\u75c5\u4eba\u7740\u60f3\uff0c\u6211\u89c9\u5f97\u5979\u662f\u5b81\u4e61\u6700\u7f8e\u7684\u533b\u751f\u3002\u201d\u201c\u56e0\u4e3a\u8eab\u4f53\u6709\u70b9\u4e0d\u8212\u9002\uff0c\u6211\u5df2\u7ecf\u6253\u4e86\u56db\u5929\u7684\u70b9\u6ef4\u3002\u521a\u597d\u8fd9\u51e0\u5929\uff0c\u6211\u4eec\u79d1\u5ba4\u6bd4\u8f83\u5fd9\uff0c\u6709\u4e00\u4f4d\u4ea7\u540e\u5927\u51fa\u8840\u7684\u90fd\u5728\u6211\u4eec\u8fd9\u91cc\u62a2\u6551\uff0c\u7d2f\u8fd8\u662f\u6bd4\u8f83\u7d2f\u3002\u6211\u4eec\u5728\u57fa\u5c42\u4e0a\u73ed\uff0c\u4eba\u5458\u90fd\u6bd4\u8f83\u7d27\u5f20\uff0c\u5206\u5de5\u4e5f\u4e0d\u90a3\u4e48\u7ec6\u5316\uff0c\u8981\u505a\u7684\u4e8b\u60c5\u5f88\u591a\uff0c\u50cf\u6211\u4eec\u8fd9\u79cd\u5c0f\u75c5\uff0c\u53ea\u8981\u4e0d\u5f71\u54cd\u5de5\u4f5c\uff0c\u6211\u4eec\u8fd8\u662f\u4f1a\u575a\u6301\u4e0a\u73ed\u7684\u3002\u201d\u7a0b\u5229\u8bf4\u3002\u5bf9\u4e8e\u83b7\u8d5e\u201c\u6700\u7f8e\u533b\u751f\u201d\u79f0\u53f7\uff0c\u7a0b\u5229\u8868\u793a\uff1a\u201c\u6700\u7f8e\u533b\u751f\u771f\u7684\u4e0d\u6562\u5f53\uff0c\u6bcf\u4e00\u4e2a\u804c\u4e1a\u90fd\u6709\u804c\u4e1a\u7684\u672c\u80fd\uff0c\u6211\u4eec\u4e34\u5e8a\u6709\u597d\u591a\u8fd9\u6837\u7684\u533b\u751f\uff0c\u575a\u6301\u4ee5\u75c5\u4eba\u4e3a\u672c\uff0c\u5162\u5162\u4e1a\u4e1a\uff0c\u606a\u5b88\u5c97\u4f4d\u3002\u201d"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/010551.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "010551", "comments": {"link": "http://coral.qq.com/2369832377"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/010551.htm", "title": ["\u5e7f\u897f\u5317\u6d77\u8054\u5408\u884c\u52a8\u961f\u6293\u83b747\u540dA\u7ea7\u4f20\u9500\u5934\u76ee \u51bb\u7ed3\u8d854\u5343\u4e07\u5143"], "passage": 
"\u6628\u5929\uff0819\u65e5\uff09\u51cc\u66683\u70b9\uff0c\u5728\u5e7f\u897f\u5317\u6d77\u5e02\uff0c\u7531\u516c\u5b89\u3001\u5de5\u5546\u3001\u57ce\u7ba1\u7b49\u90e8\u95e8680\u4f59\u540d\u6267\u6cd5\u4eba\u5458\u7ec4\u6210\u8054\u5408\u884c\u52a8\u961f\uff0c\u91cd\u70b9\u6e05\u67e5\u6d89\u5acc\u7ec4\u7ec7\u4f20\u9500\u6d3b\u52a8\u7684\u5934\u76ee\u548c\u4f20\u9500\u9aa8\u5e72\u5206\u5b50\u3001\u53c2\u52a0\u201c\u8d44\u672c\u8fd0\u4f5c\u201d\u3001\u201c\u4e00\u65e5\u6e38\u201d\u7b49\u4f20\u9500\u6d3b\u52a8\u7684\u6d89\u4f20\u4eba\u5458\uff0c\u6b64\u6b21\u4e13\u9879\u884c\u52a8\u5171\u6293\u83b7A\u7ea7\u53ca\u4ee5\u4e0a\u4f20\u9500\u5934\u76ee47\u540d\uff0c\u51bb\u7ed3\u8d44\u91d1\u7ea64200\u4e07\u5143\u3002\u5f53\u5929\u51cc\u6668\uff0c\u6267\u6cd5\u4eba\u5458\u8fdb\u5165\u5317\u6d77\u5e02\u590f\u65e5\u6d77\u6e7e\u5c0f\u533a\uff0c\u5bf9\u524d\u671f\u6478\u6392\u51fa\u7684\u6d89\u5acc\u4ece\u4e8b\u4f20\u9500\u6d3b\u52a8\u7684100\u591a\u4e2a\u623f\u95f4\u8fdb\u884c\u6e05\u67e5\u6574\u6cbb\u884c\u52a8\u3002\u6b64\u5916\uff0c\u6267\u6cd5\u4eba\u5458\u8fd8\u5206\u522b\u5bf9\u5317\u6d77\u5e02\u533a\u7684\u5317\u6d77\u5723\u7687\u5e7f\u573a\u3001\u6850\u6d0b\u65b0\u57ce\u4e24\u4e2a\u5c0f\u533a\u8fdb\u884c\u6e05\u67e5\u884c\u52a8\u3002\u5171\u6e05\u67e5\u51fa\u79df\u5c4b80\u591a\u95f4\uff0c\u67e5\u83b7\u6d89\u5acc\u4f20\u9500\u4eba\u5458100\u591a\u540d\uff0c\u4ee5\u53ca\u4e00\u6279\u6d89\u5acc\u4f20\u9500\u8fdd\u6cd5\u884c\u4e3a\u7684\u4e66\u7c4d\u548c\u7269\u54c1\u3002\u57fa\u672c\u6bcf\u4e00\u4e2a\u623f\u95f4\u6211\u4eec\u90fd\u6e05\u67e5\u51fa\u6d89\u4f20\u4eba\u5458\uff0c\u4e24\u4e2a\u5c0f\u533a\u4e00\u5171\u6e05\u67e5\u4e86100\u591a\u4e2a\u6d89\u4f20\u4eba\u5458\uff0c\u4e0b\u4e00\u6b65\u6211\u4eec\u6839\u636e\u72af\u7f6a\u7684\uff0c\u6d89\u53ca\u7ec4\u7ec7\u9886\u5bfc\u4f20\u9500\u7f6a\u7684 \u4f9d\u6cd5\u6253\u51fb\uff0c\u6e05\u67e5\u6ca1\u6709\u6784\u6210\u72af\u7f6a\u7684\u6211\u4eec\u7ecf\u8fc7\u6559\u80b2\u3001\u8bad\u8beb\u7136\u540e\u505a\u5176\u4ed6\u76f8\u5e94\u7684\u5904\u7406\u3002\u636e\u4e86\u89e3\uff0c\u5728\u8fd9\u6b21\u4e13\u9879\u884c\u52a8\u4e2d\uff0c\u6267\u6cd5\u4eba\u5458\u9664\u4e86\u5bf9\u4f20\u9500\u4eba\u5458\u805a\u96c6\u8f83\u591a\u7684\u5c0f\u533a\u8fdb\u884c\u5730\u6bef\u5f0f\u6e05\u67e5\u5916\uff0c\u8fd8\u7ec4\u7ec7\u8b66\u529b\u5206\u522b\u5728\u5185\u8499\u53e4\u3001\u4e91\u5357\u3001\u5e7f\u897f\u540c\u65f6\u8fdb\u884c\u6293\u6355\u884c\u52a8\u3002\u622a\u81f3\u6628\u5929\uff0819\u65e5\uff09\u4e0a\u5348\uff0c\u5df2\u6293\u83b7A\u7ea7\u4ee5\u4e0a\u4f20\u9500\u5934\u76ee47\u540d\uff1b\u5317\u6d77\u5e02\u5171\u6e05\u67e5\u51fa\u79df\u5c4b212\u95f4\uff0c\u67e5\u5c01\u6d89\u4f20\u51fa\u79df\u5c4b148\u6237\uff0c\u67e5\u83b7\u6d89\u4f20\u4eba\u5458459\u540d\u4ee5\u53ca\u5927\u91cf\u624b\u673a\u3001\u7535\u8111\u3001\u4f20\u9500\u4e66\u7c4d\u7b49\u6d89\u6848\u7269\u54c1\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for crawl project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'crawl' 13 | 14 | SPIDER_MODULES = ['crawl.spiders'] 15 | NEWSPIDER_MODULE = 'crawl.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'crawl.middlewares.CrawlSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'crawl.middlewares.CrawlDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'crawl.pipelines.CrawlPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/settings.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tutotial project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'tutotial' 13 | 14 | SPIDER_MODULES = ['tutotial.spiders'] 15 | NEWSPIDER_MODULE = 'tutotial.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'oozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'tutotial.middlewares.TutotialSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'tutotial.middlewares.TutotialDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'tutotial.pipelines.TutotialPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES 
= [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | FEED_URI = u'/home/mazic/pp/jian.csv' 92 | FEED_FORMAT = 'CSV' 93 | -------------------------------------------------------------------------------- /泰迪杯尝试/数据爬取(去标签).py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # @Time : 2017/3/25 15:53
4 | # @Author : mazicwong
5 | # @File : 数据爬取(去标签).py
6 |
7 | # 用正则表达式简单过滤html的标签
8 | import re
9 |
10 |
11 | def filter_tags(htmlstr):
12 |     re_cdata = re.compile('//<!\[CDATA\[[^>]*//\]\]>', re.I) # 匹配CDATA
13 |     re_script = re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I) # Script
14 |     re_style = re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I) # style
15 |     re_br = re.compile('<br\s*?/?>') # 处理换行
16 |     re_h = re.compile('</?\w+[^>]*>') # HTML标签
17 |     re_comment = re.compile('<!--[^>]*-->') # HTML注释
18 |     s = re_cdata.sub('', htmlstr) # 去掉CDATA
19 |     s = re_script.sub('', s) # 去掉SCRIPT
20 |     s = re_style.sub('', s) # 去掉style
21 |     s = re_br.sub('\n', s) # 将br转换为换行
22 |     s = re_h.sub('', s) # 去掉HTML 标签
23 |     s = re_comment.sub('', s) # 去掉HTML注释
24 |     # 去掉多余的空行
25 |     blank_line = re.compile('\n+')
26 |     s = blank_line.sub('\n', s)
27 |     s = replaceCharEntity(s) # 替换实体
28 |     return s
29 |
30 |
31 | ##替换常用HTML字符实体.
32 | # 使用正常的字符替换HTML中特殊的字符实体.
33 | # 你可以添加新的实体字符到CHAR_ENTITIES中,处理更多HTML字符实体.
34 | # @param htmlstr HTML字符串.
35 | def replaceCharEntity(htmlstr):
36 |     CHAR_ENTITIES = {'nbsp': ' ', '160': ' ',
37 |                      'lt': '<', '60': '<',
38 |                      'gt': '>', '62': '>',
39 |                      'amp': '&', '38': '&',
40 |                      'quot': '"''"', '34': '"', }
41 |
42 |     re_charEntity = re.compile(r'&#?(?P<name>\w+);')
43 |     sz = re_charEntity.search(htmlstr)
44 |     while sz:
45 |         entity = sz.group() # entity全称,如&gt;
46 |         key = sz.group('name') # 去除&;后entity,如&gt;为gt
47 |         try:
48 |             htmlstr = re_charEntity.sub(CHAR_ENTITIES[key], htmlstr, 1)
49 |             sz = re_charEntity.search(htmlstr)
50 |         except KeyError:
51 |             # 以空串代替
52 |             htmlstr = re_charEntity.sub('', htmlstr, 1)
53 |             sz = re_charEntity.search(htmlstr)
54 |     return htmlstr
55 |
56 |
57 | def repalce(s, re_exp, repl_string):
58 |     return re_exp.sub(repl_string, s)
59 |
60 |
61 | '''
62 | def saveFile(news,cnt):
63 |     path = r'E:\泰迪杯\C题样例数据\All_html 去标签\out%s.txt' % cnt
64 |     file = open(path, 'w+')
65 |     file.write(news)
66 |     file.close()
67 |
68 | if __name__ == '__main__':
69 |     for cnt in range(1, 178):
70 |         try:
71 |             path1 = r'E:\泰迪杯\C题样例数据\All_html\out%s.txt' % cnt
72 |             file = open(path1, 'r')
73 |             text = file.read()
74 |             news = filter_tags(text)
75 |             saveFile(news,cnt)
76 |             file.close()
77 |         except:
78 |             print("第%s文件不存在"%cnt)
79 | '''
80 |
81 |
82 | def saveFile(news, cnt):
83 |     path = r'E:\泰迪杯\C题样例数据\All_html 相似url\66out%d.txt' % cnt
84 |     file = open(path, 'w+')
85 |     file.write(news)
86 |     file.close()
87 |
88 |
89 | #UnicodeDecodeError: 'gbk' codec can't decode byte 0xaf in position 641: illegal multibyte sequence
90 | #上面在liaoxuefeng提到了,可以直接忽略他
91 |
92 | if __name__ == '__main__':
93 |     for cnt in [1,-1]:
94 |         try:
95 |             path1 = r'E:\泰迪杯\C题样例数据\All_html 相似url\out%s.txt' % cnt
96 |             #读取一直错误。。改了半个小时终于改成功了
97 |             #把下面mode = 'r' 改成 'rb', 因为r的时候读进来是gbk..但是也不知道为什么转换不了。。直接读二进制文件吧
98 |             #明天再把编码问题好好看一看
99 |             #第二个改动是decode('utf-8')
100 |             file = open(path1, 'rb')
101 |             text = file.read().decode('utf-8')
102 |             news = filter_tags(text)
103 |             saveFile(news, cnt)
104 |             file.close()
105 |         except:
106 |             print("第%s文件不存在" % cnt)
107 |
--------------------------------------------------------------------------------
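A quick usage sketch for the filter_tags() helper above — this is not part of the repository: the sample HTML string and the 'out1.txt' file name are made up for illustration, and filter_tags()/replaceCharEntity() are assumed to be defined in (or imported from) the module above. It also shows the byte-read-then-decode workaround described in the comments of 数据爬取(去标签).py, with errors='ignore' so a stray non-UTF-8 byte cannot abort the run:

import os

# Assumes filter_tags() from 数据爬取(去标签).py above is in scope.
sample = '<div class="news"><!-- ad --><p>A&nbsp;&gt;&nbsp;B<br/>line&nbsp;2</p></div>'
print(filter_tags(sample))
# Expected output, roughly:
# A > B
# line 2

# Reading a crawled page: open in binary mode and decode explicitly, ignoring
# undecodable bytes instead of letting a platform-default codec (e.g. GBK)
# raise UnicodeDecodeError.
if os.path.exists('out1.txt'):  # 'out1.txt' is a hypothetical crawler output file
    with open('out1.txt', 'rb') as f:
        text = f.read().decode('utf-8', errors='ignore')
        print(filter_tags(text))

--------------------------------------------------------------------------------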
/crawl/news/news_crawl/docs/netease/20180119/D8HAH1VS0001875P.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D8HAH1VS0001875P", "date": "20180119", "source": "netease", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8HAH1VS0001875P.html"}, "contents": {"title": ["\u7537\u5b50\u6df1\u591c\u5c06\u5973\u5b50\u62b1\u81f3\u575f\u5730\u5f3a\u5978 \u4e8b\u540e\u6b32\u706d\u53e3\u7528\u7816\u7838\u5934"], "link": "http://news.163.com/18/0119/15/D8HAH1VS0001875P.html", "passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u7537\u5b50\u6df1\u591c\u730e\u8273\u5f3a\u5978\u5987\u5973 \u62c5\u5fc3\u88ab\u544a\u53d1\u7528\u6c34\u6ce5\u7816\u7838\u5934\u706d\u53e3\uff09\n

\u6b63\u4e49\u7f511\u670819\u65e5\u7535 \u201c\u539f\u5ba1\u88ab\u544a\u4eba\u949f\u67d0\u67d0,\u4f60\u5bf9\u4e00\u5ba1\u5224\u51b3\u8ba4\u5b9a\u7684\u72af\u7f6a\u4e8b\u5b9e\u548c\u8bc1\u636e\u662f\u5426\u6709\u5f02\u8bae?\u201d\u8fd9\u662f\u4e00\u8d77\u7531\u6842\u6797\u5e02\u4eba\u6c11\u68c0\u5bdf\u9662\u4ee5\u5ba1\u5224\u76d1\u7763\u7a0b\u5e8f\u5411\u6842\u6797\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u63d0\u51fa\u6297\u8bc9\u7684\u66b4\u529b\u5211\u4e8b\u6848\u4ef6\u3002\u4e00\u5ba1\u6cd5\u9662\u4ee5\u5f3a\u5978\u7f6a\u5224\u5904\u949f\u67d0\u67d0\u6709\u671f\u5f92\u5211\u4e09\u5e74\u4e03\u4e2a\u6708\u3001\u4ee5\u6545\u610f\u6740\u4eba\u7f6a\u4ec5\u5224\u5904\u5176\u6709\u671f\u5f92\u5211\u516b\u5e74\u56db\u4e2a\u6708,\u51b3\u5b9a\u6267\u884c\u6709\u671f\u5f92\u5211\u5341\u4e00\u5e74\u516d\u4e2a\u6708\u3002

\u8bf4\u8d77\u8fd9\u8d77\u6848\u4ef6\u90a3\u5c31\u8981\u8ffd\u6eaf\u5230\u51e0\u5e74\u524d\u4e86\u30022015\u5e742\u670825\u65e5\u51cc\u6668,\u56db\u5904\u6e38\u8361\u51c6\u5907\u730e\u8273\u7684\u539f\u5ba1\u88ab\u544a\u4eba\u949f\u67d0\u67d0\u6ee1\u8138\u5931\u671b,\u9a7e\u9a76\u6469\u6258\u8f66\u6162\u60a0\u60a0\u5730\u5f80\u5bb6\u8d70,\u884c\u81f3\u8354\u6d66\u53bf\u67d0\u9547\u67d0\u8857,\u949f\u67d0\u67d0\u773c\u524d\u7a81\u7136\u4e00\u4eae,\u524d\u9762\u4ece\u9ebb\u5c06\u9986\u51fa\u95e8\u6b63\u72ec\u81ea\u6b65\u884c\u56de\u5bb6\u7684\u9ec4\u67d0\u67d0\u6b63\u9002\u5408\u4e0b\u624b\u554a!

\n

\u949f\u67d0\u67d0\u9042\u8d76\u4e0a\u524d\u4e3b\u52a8\u63d0\u51fa\u642d\u8f7d\u9ec4\u67d0\u67d0\u56de\u5bb6\u3002\u884c\u81f3\u504f\u50fb\u8def\u6bb5\u949f\u67d0\u67d0\u4fbf\u63d0\u51fa\u8981\u4e0e\u9ec4\u67d0\u67d0\u53d1\u751f\u6027\u5173\u7cfb,\u88ab\u62d2\u7edd\u540e\u949f\u67d0\u67d0\u76f4\u63a5\u5c06\u6b32\u9003\u8dd1\u7684\u9ec4\u67d0\u67d0\u6402\u62b1\u81f3\u8def\u8fb9\u575f\u5730,\u4e0d\u987e\u5bd2\u98ce\u51db\u51bd,\u5f3a\u884c\u5bf9\u9ec4\u67d0\u67d0\u5b9e\u65bd\u5978\u6deb\u3002\u4e8b\u6bd5,\u5fc3\u6ee1\u610f\u8db3\u7684\u949f\u67d0\u67d0\u62c5\u5fc3\u9ec4\u67d0\u67d0\u544a\u53d1,\u9042\u51b3\u5b9a\u706d\u53e3\u3002\u5728\u5c06\u9ec4\u67d0\u67d0\u6390\u6655\u540e,\u949f\u67d0\u67d0\u53cc\u624b\u4ece\u575f\u5806\u65c1\u642c\u8d77\u4e00\u5757\u6c34\u6ce5\u7816\u5f84\u76f4\u8fde\u7eed\u7838\u5411\u9ec4\u67d0\u67d0\u7684\u5934\u90e8\u2026\u2026

\u5341\u4f59\u5c0f\u65f6\u540e\u9ec4\u67d0\u67d0\u88ab\u4eba\u53d1\u73b0\u5e76\u83b7\u6551\u3002\u7ecf\u6cd5\u533b\u9274\u5b9a,\u88ab\u5bb3\u4eba\u9ec4\u67d0\u67d0\u7684\u4eba\u4f53\u635f\u4f24\u7a0b\u5ea6\u6784\u6210\u91cd\u4f24\u4e8c\u7ea7,\u5934\u9762\u90e8\u7684\u4eba\u4f53\u635f\u4f24\u6b8b\u75be\u7a0b\u5ea6\u5c5e\u516d\u7ea7\u6b8b\u75be\u3002

\u5ead\u540e\u4e00\u5468,\u6842\u6797\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u4f5c\u51fa\u7ec8\u5ba1\u5224\u51b3,\u7ef4\u6301\u539f\u5ba1\u6cd5\u9662\u5bf9\u949f\u67d0\u67d0\u5f3a\u5978\u7f6a\u7684\u91cf\u5211,\u5c06\u5176\u6545\u610f\u6740\u4eba\u7f6a\u7684\u91cf\u5211\u7531\u6709\u671f\u5f92\u5211\u516b\u5e74\u56db\u4e2a\u6708\u6539\u5224\u4e3a\u6709\u671f\u5f92\u5211\u5341\u4e94\u5e74,\u5265\u593a\u653f\u6cbb\u6743\u5229\u4e09\u5e74,\u51b3\u5b9a\u6267\u884c\u6709\u671f\u5f92\u5211\u5341\u4e03\u5e74,\u5265\u593a\u653f\u6cbb\u6743\u5229\u4e09\u5e74\u3002

"}, "cmtId": "D8HAH1VS0001875P"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20160721/BSH7V8QF00014JB6.json: -------------------------------------------------------------------------------- 1 | {"newsId": "BSH7V8QF00014JB6", "date": "20160721", "source": "netease", "comments": {"link": "http://comment.news.163.com/news3_bbs/BSH7V8QF00014JB6.html"}, "contents": {"title": ["\u8fbd\u5b81\u906d\u66b4\u96e8\u4fb5\u88ad\u81f4\u57ce\u5e02\u5185\u6d9d \u7d27\u6025\u8f6c\u79fb\u903e12\u4e07\u4eba"], "link": "http://news.163.com/16/0721/19/BSH7V8QF00014JB6.html", "passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u8fbd\u5b81\u906d\u9047\u66b4\u96e8\u4fb5\u88ad\u7d27\u6025\u8f6c\u79fb\u903e12\u4e07\u6c11\u4f17\uff09\n

\u4e2d\u65b0\u793e\u6c88\u96337\u670821\u65e5\u7535 2016\u5e74\u5165\u6c5b\u4ee5\u6765\u6700\u5f3a\u964d\u96e821\u65e5\u4fb5\u88ad\u8fbd\u5b81\uff0c\u9020\u6210\u519c\u7530\u53d7\u707e\u57ce\u5e02\u5185\u6d9d\uff0c\u8be5\u7701\u5df2\u7d27\u6025\u8f6c\u79fb\u903e12\u4e07\u6c11\u4f17\u3002

\u4ece7\u670820\u65e5\u665a\u5f00\u59cb\uff0c\u672c\u8f6e\u5927\u8303\u56f4\u66b4\u96e8\u5f00\u59cb\u5728\u8fbd\u5b81\u897f\u90e8\u5730\u533a\u8086\u8650\uff0c\u6cbf\u6d77\u90e8\u5206\u6cb3\u6d41\u53d1\u751f\u6d2a\u6c34\uff0c\u81f321\u65e5\u964d\u96e8\u8303\u56f4\u6269\u6563\u5230\u8fbd\u5b81\u5168\u5883\u3002

\u4e2d\u65b0\u793e\u8bb0\u800521\u65e5\u5728\u7701\u4f1a\u6c88\u9633\u770b\u5230\uff0c\u5929\u7a7a\u9634\u6c89\u72b9\u5982\u508d\u665a\uff0c\u5927\u96e8\u503e\u76c6\u800c\u4e0b\uff0c\u5728\u4e00\u4e9b\u79ef\u6c34\u4e25\u91cd\u7684\u8857\u8def\u4e0a\uff0c\u6d88\u9632\u4eba\u5458\u51fa\u52a8\u76ae\u5212\u8247\u8fd0\u8f7d\u53d7\u56f0\u6c11\u4f17\u3002\u5728\u846b\u82a6\u5c9b\u5e02\uff0c\u90e8\u5206\u5730\u533a\u964d\u96e8\u91cf\u7a81\u7834\u6709\u6c14\u8c61\u8bb0\u5f55\u4ee5\u6765\u7684\u5386\u53f2\u6781\u503c\uff0c\u4e0d\u65ad\u6709\u8f66\u8f86\u5728\u79ef\u6c34\u91cc\u629b\u951a\uff0c\u5f53\u5730\u8fb9\u9632\u5b98\u5175\u8fde\u591c\u8f6c\u79fb\u4e8688\u540d\u8f96\u533a\u6c11\u4f17\u3002

\u636e\u8fbd\u5b81\u7701\u9632\u6c5b\u6297\u65f1\u6307\u6325\u90e8\u4ecb\u7ecd\uff0c\u622a\u81f3\u76ee\u524d\uff0c\u6c14\u8c61\u90e8\u95e8\u5df2\u63a5\u8fde\u53d1\u5e03\u66b4\u96e8\u7ea2\u8272\u9884\u8b667\u4e2a\uff0c\u66b4\u96e8\u6a59\u8272\u9884\u8b6616\u4e2a\uff0c\u5168\u7701\u6700\u5927\u964d\u6c34\u91cf\u51fa\u73b0\u5728\u846b\u82a6\u5c9b\u5e02\u7ee5\u4e2d\u53bf\uff0c\u8fbe\u5230396\u6beb\u7c73\u3002

\n

\u53d7\u5f3a\u964d\u96e8\u5f71\u54cd\uff0c\u622a\u81f37\u670821\u65e515\u65f6\u8bb8\uff0c\u8fbd\u5b81\u5168\u7701\u8d85\u6c5b\u9650\u6c34\u4f4d\u8fd0\u884c\u7684\u6c34\u5e93\u670930\u5ea7\uff0c\u5176\u4e2d\u5927\u4e2d\u578b\u6c34\u5e933\u5ea7\u300221\u65e5\uff0c\u8fbd\u5b8130\u5ea7\u5927\u578b\u6c34\u5e93\u603b\u84c4\u6c34\u91cf\u4e3a33.41\u4ebf\u7acb\u65b9\u7c73\uff0c\u6bd42015\u5e74\u540c\u671f\u591a5.76\u4ebf\u7acb\u65b9\u7c73\u3002

\u76ee\u524d\uff0c\u8fbd\u5b81\u846b\u82a6\u5c9b\u5e02\u670925\u4e2a\u4e61\u9547\u53d7\u707e\uff0c\u5012\u584c\u623f\u5c4b28\u95f4\uff0c\u519c\u4f5c\u7269\u53d7\u707e\u9762\u79ef39.2\u4e07\u4ea9\uff0c\u635f\u6bc1\u5824\u96320.8\u516c\u91cc\uff0c\u76f4\u63a5\u7ecf\u6d4e\u635f\u59311900\u4e07\u5143\u4eba\u6c11\u5e01\u3002\u5176\u4ed6\u5730\u533a\u707e\u60c5\u6b63\u5728\u8fdb\u4e00\u6b65\u6838\u5b9e\u4e2d\u3002\u672c\u8f6e\u66b4\u96e8\u8fbd\u5b81\u5171\u8f6c\u79fb12\u4e2a\u5e02\u7684\u6c11\u4f1712.59\u4e07\u4eba\uff0c\u6682\u65f6\u6ca1\u6709\u6536\u5230\u4eba\u5458\u4f24\u4ea1\u62a5\u544a\u3002

\u7a81\u5982\u5176\u6765\u7684\u66b4\u96e8\u4ea6\u4f7f\u4ea4\u901a\u51fa\u884c\u53d7\u5230\u4e25\u91cd\u5f71\u54cd\uff0c\u8fbd\u5b81\u5883\u518516\u6761\u9ad8\u901f\u516c\u8def\u5c01\u95ed\u6216\u9650\u884c\uff1b39\u8d9f\u65c5\u5ba2\u5217\u8f66\u4e34\u65f6\u505c\u8fd0\uff1b\u6cbf\u6d77\u6e2f\u53e3\u53d7\u5927\u98ce\u5f71\u54cd\u90e8\u5206\u73ed\u6b21\u505c\u822a\u3002

\u6c14\u8c61\u90e8\u95e8\u9884\u8ba1\uff0c22\u65e5\u8fbd\u4e1c\u5730\u533a\u7684\u672c\u6eaa\u3001\u4e39\u4e1c\u7b49\u5730\u8fd8\u5c06\u7ee7\u7eed\u906d\u9047\u66b4\u96e8\u3002

"}, "cmtId": "BSH7V8QF00014JB6"} -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for jnuxshc project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'jnuxshc' 13 | 14 | SPIDER_MODULES = ['jnuxshc.spiders'] 15 | NEWSPIDER_MODULE = 'jnuxshc.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'oozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | #CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | #DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | #CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | #COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | #TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | #DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | #} 45 | 46 | # Enable or disable spider middlewares 47 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 48 | #SPIDER_MIDDLEWARES = { 49 | # 'jnuxshc.middlewares.JnuxshcSpiderMiddleware': 543, 50 | #} 51 | 52 | # Enable or disable downloader middlewares 53 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 54 | #DOWNLOADER_MIDDLEWARES = { 55 | # 'jnuxshc.middlewares.JnuxshcDownloaderMiddleware': 543, 56 | #} 57 | 58 | # Enable or disable extensions 59 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 60 | #EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | #} 63 | 64 | # Configure item pipelines 65 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 66 | #ITEM_PIPELINES = { 67 | # 'jnuxshc.pipelines.JnuxshcPipeline': 300, 68 | #} 69 | 70 | # Enable and configure the AutoThrottle extension (disabled by default) 71 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 72 | #AUTOTHROTTLE_ENABLED = True 73 | # The initial download delay 74 | #AUTOTHROTTLE_START_DELAY = 5 75 | # The maximum download delay to be set in case of high latencies 76 | #AUTOTHROTTLE_MAX_DELAY = 60 77 | # The average number of requests Scrapy should be sending in parallel to 78 | # each remote server 79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 80 | # Enable showing throttling stats for every response received: 81 | #AUTOTHROTTLE_DEBUG = False 82 | 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See 
https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | #HTTPCACHE_ENABLED = True 86 | #HTTPCACHE_EXPIRATION_SECS = 0 87 | #HTTPCACHE_DIR = 'httpcache' 88 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 90 | FEED_URI = u'./jnu.csv' 91 | FEED_FORMAT = 'CSV' 92 | 93 | FEED_EXPORTERS = { 94 | 'csv': 'jnuxshc.spiders.csv_item_exporter.MyProjectCsvItemExporter', 95 | } #jnuxshc为工程名 96 | FIELDS_TO_EXPORT = [ 97 | 'time', 98 | 'title', 99 | 'intro' 100 | ] 101 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20160418/023091.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "023091", "comments": {"link": "http://coral.qq.com/1373761671"}, "date": "20160418", "contents": {"link": "https://news.qq.com/a/20160418/023091.htm", "title": ["\u5df4\u897f\u4f17\u9662\u5f39\u52be\u603b\u7edf\u6848\u83b7\u901a\u8fc7 \u7f57\u585e\u592b\u653f\u515a\u627f\u8ba4\u843d\u8d25"], "passage": "\n\n\n\n\n\n\n\n\n\r\n\r\n\r\n\r\n \r\n\r\n\r\n\u4e2d\u65b0\u7f514\u670818\u65e5\u7535 \u7efc\u5408\u5916\u5a92\u62a5\u9053\uff0c\u5df4\u897f\u4f17\u8bae\u966217\u65e5\u9488\u5bf9\u662f\u5426\u5f39\u52be\u603b\u7edf\u7f57\u585e\u592b\u8fdb\u884c\u8868\u51b3\uff0c\u5230\u76ee\u524d\u4e3a\u6b62\uff0c513\u540d\u8bae\u5458\u4e2d\u5df2\u6709\u81f3\u5c11342\u540d\u8bae\u5458\u5bf9\u5f39\u52be\u603b\u7edf\u7f57\u585e\u592b\u6295\u4e86\u8d5e\u6210\u7968\uff0c\u8fd9\u610f\u5473\u7740\u5f39\u52be\u6848\u5728\u4f17\u9662\u83b7\u5f97\u901a\u8fc7\uff0c\u5f39\u52be\u603b\u7edf\u7a0b\u5e8f\u5c06\u7ee7\u7eed\u3002\u5f39\u52be\u62a5\u544a\u5c06\u9012\u4ea4\u7ed9\u53c2\u8bae\u9662\u505a\u51fa\u6700\u7ec8\u8868\u51b3\u3002\u800c\u7f57\u585e\u592b\u6240\u5c5e\u653f\u515a\u8868\u793a\u5927\u52bf\u5df2\u53bb\uff0c\u65e0\u6cd5\u907f\u514d\u603b\u7edf\u906d\u5f39\u52be\u3002\u62a5\u9053\u79f0\uff0c\u5df4\u897f\u6267\u653f\u515a\u52b3\u5de5\u515a\u515a\u56e2\u9886\u8896\u5b63\u9a6c\u745e\u65af\u4e5f\u8868\u793a\uff0c\u5bf9\u4f17\u8bae\u9662\u5f39\u52be\u7f57\u585e\u592b\u7684\u8868\u51b3\u627f\u8ba4\u5931\u8d25\u3002\u4ed6\u5728\u4f17\u9662\u53d7\u8bbf\u8bf4\uff1a\u201c\u73b0\u5728\u8981\u5728\u53c2\u9662\u7eed\u6218\u4e86\u3002\u201d\u62a5\u9053\u6307\u51fa\uff0c\u6839\u636e\u5df4\u897f\u6cd5\u5f8b\uff0c\u4e3b\u5f20\u5f39\u52be\u4e00\u65b9\u5fc5\u987b\u5728\u6b64\u8f6e\u8868\u51b3\u4e2d\u83b7\u5f97\u4e09\u5206\u4e4b\u4e8c\u7684\u6295\u7968\uff0c\u5373\u5728513\u5f20\u6295\u7968\u4e2d\u4e89\u53d6\u5230342\u7968\uff0c\u624d\u80fd\u5c06\u52a8\u8bae\u63d0\u4ea4\u5230\u53c2\u8bae\u9662\uff0c\u5e76\u7531\u53c2\u8bae\u9662\u51b3\u5b9a\u603b\u7edf\u662f\u5426\u4ece\u4e8b\u4e86\u975e\u6cd5\u884c\u4e3a\u3002\u5f39\u52be\u6848\u5728\u4f17\u9662\u901a\u8fc7\u540e\uff0c\u53c2\u8bae\u9662\u5c06\u5bf9\u5176\u8fdb\u884c\u9996\u8f6e\u8868\u51b3\uff0c\u65f6\u95f4\u53ef\u80fd\u57285\u6708\u3002\u5982\u679c\u53c2\u8bae\u9662\u5728\u9996\u8f6e\u8868\u51b3\u4e2d\u83b7\u5f97\u7b80\u5355\u591a\u6570\u652f\u6301\uff0c\u7f57\u585e\u592b\u987b\u79bb\u804c180\u5929\uff0c\u5176\u95f4\u603b\u7edf\u4e00\u804c\u7531\u526f\u603b\u7edf\u4ee3\u7406\u3002\u53c2\u8bae\u9662\u4e4b\u540e\u5c06\u542c\u53d6\u8bc1\u636e\uff0c\u518d\u8fdb\u884c\u7b2c\u4e8c\u8f6e\u8868\u51b3\uff0c\u5982\u679c2/3\u4ee5\u4e0a\u7684\u8bae\u5458\u652f\u6301\u5f39\u52be\uff0c\u5219\u7f57\u585e\u592b\u4e0b\u53f0\uff0c\u526f\u603b\u7edf\u7279\u6885\u5c14\u63a5\u4efb\u
ff1b\u5982\u679c\u53c2\u8bae\u9662\u652f\u6301\u5f39\u52be\u7684\u8bae\u5458\u4e0d\u52302/3\uff0c\u7f57\u585e\u592b\u6062\u590d\u603b\u7edf\u804c\u4f4d\u3002\u62a5\u9053\u79f0\uff0c\u56e0\u4e3a\u5df4\u897f\u53c2\u8bae\u9662\u548c\u4f17\u8bae\u9662\u7684\u6784\u6210\u6781\u4e3a\u76f8\u4f3c\uff0c\u6240\u4ee5\u53c2\u8bae\u9662\u53ef\u80fd\u5f97\u51fa\u4e0e\u4f17\u8bae\u9662\u76f8\u540c\u7684\u7ed3\u8bba\u3002\u5982\u679c\u7f57\u585e\u592b\u6700\u7ec8\u88ab\u5f39\u52be\u4e0b\u53f0\uff0c\u7279\u6885\u5c14\u5c06\u63a5\u4efb\u603b\u7edf\u804c\u4f4d\uff0c\u4f46\u662f\u56e0\u4e3a\u7279\u6885\u5c14\u4e5f\u5377\u5165\u8d2a\u8150\u6848\u4ef6\u4e2d\uff0c\u7f57\u585e\u592b\u7684\u652f\u6301\u8005\u5df2\u7ecf\u5f00\u59cb\u5bf9\u4ed6\u8fdb\u884c\u5f39\u52be\u884c\u52a8\u3002\u8fd9\u4e5f\u5c31\u610f\u5473\u7740\uff0c\u5728\u4eca\u5e748\u67085\u65e5\u81f321\u65e5\u5df4\u897f\u9996\u6b21\u4e3e\u884c\u590f\u5b63\u5965\u8fd0\u4f1a\u65f6\uff0c\u5176\u653f\u5c40\u4ecd\u7136\u5728\u6df7\u4e71\u4e4b\u4e2d\u3002\u636e\u6089\uff0c\u4ece\u5df4\u897f\u5f53\u5730\u65f6\u95f44\u670815\u65e5\u65e9\u4e0a\u5f00\u59cb\u4e00\u76f4\u523017\u65e5\u6e05\u6668\uff0c\u6709120\u540d\u8bae\u5458\u53c2\u52a0\u4e86\u5173\u4e8e\u662f\u5426\u5f39\u52be\u7f57\u585e\u592b\u7684\u8fa9\u8bba\uff0c\u8fa9\u8bba\u65f6\u95f4\u8d85\u8fc743\u4e2a\u5c0f\u65f6\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20161227/011065.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "011065", "comments": {"link": "http://coral.qq.com/1687646782"}, "date": "20161227", "contents": {"link": "https://news.qq.com/a/20161227/011065.htm", "title": ["\u5c0f\u4f19\u51fa\u5dee\u90d1\u5dde\u9047\u96fe\u973e\u8bc9\u653f\u5e9c\u88ab\u9a73\uff1a\u5e94\u5148\u7533\u8bf7\u653f\u5e9c\u8d54\u507f\u53e3\u7f69\u94b1"], "passage": 
"\u56e0\u51fa\u5dee\u90d1\u5dde\u53d1\u73b0\u5f53\u5730\u96fe\u973e\u4e25\u91cd\uff0c\u65e5\u524d\uff0c\u6cb3\u5357\u7701\u65b0\u4e61\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u9a73\u56de\u539f\u544a\u8d77\u8bc9\uff0c\u7406\u7531\u662f\u5176\u8d77\u8bc9\u524d\u5e76\u672a\u5411\u90d1\u5dde\u5e02\u653f\u5e9c\u63d0\u51fa\u8fc7\u8d54\u507f\u7533\u8bf7\u3002\u6cb3\u5357\u7701\u65b0\u4e61\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u884c\u653f\u8d54\u507f\u88c1\u5b9a\u4e66\u3002\u6f8e\u6e43\u65b0\u95fb\uff08www.thepaper.cn\uff09\u83b7\u5f97\u7684\u88c1\u5b9a\u4e66\u663e\u793a\uff0c\u65b0\u4e61\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u8ba4\u4e3a\uff0c\u4f9d\u636e\u300a\u4e2d\u534e\u4eba\u6c11\u5171\u548c\u56fd\u56fd\u5bb6\u8d54\u507f\u6cd5\u300b\u7b2c\u4e5d\u6761\u7b2c\u4e8c\u6b3e\u89c4\u5b9a\uff0c\u6b64\u5916\uff0c\u4f9d\u636e\u300a\u6700\u9ad8\u4eba\u6c11\u6cd5\u9662\u5173\u4e8e\u5ba1\u7406\u884c\u653f\u8d54\u507f\u6848\u4ef6\u82e5\u5e72\u95ee\u9898\u7684\u89c4\u5b9a\u300b\u7b2c\u56db\u6761\u7b2c\u4e8c\u6b3e\uff0c\u65b0\u4e61\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u8ba4\u4e3a\uff0c\u539f\u544a\u5728\u63d0\u8d77\u8bc9\u8bbc\u524d\uff0c\u5176\u8d54\u507f\u8bf7\u6c42\u5c1a\u672a\u7ecf\u8fc7\u90d1\u5dde\u5e02\u4eba\u6c11\u653f\u5e9c\u5148\u884c\u5904\u7406\u3002\u56e0\u6b64\uff0c\u6cd5\u9662\u5e94\u5f53\u9a73\u56de\u539f\u544a\u8d77\u8bc9\u300212\u670826\u65e5\u665a\uff0c\u5b59\u6d2a\u5f6c\u544a\u8bc9\u6f8e\u6e43\u65b0\u95fb\uff0c\u4ed6\u4e8e26\u65e5\u5f53\u5929\u6536\u5230\u4e86\u8be5\u88c1\u5b9a\u4e66\u3002\u5b59\u6d2a\u5f6c\u8bf4\uff0c\u8be5\u88c1\u5b9a\u5728\u4ed6\u7684\u610f\u6599\u4e4b\u4e2d\u3002\u201c\u73b0\u5728\u8fd8\u6ca1\u51b3\u5b9a\u8981\u4e0d\u8981\u4e0a\u8bc9\uff0c\u4f46\u662f\u54a8\u8be2\u4e86\u5f8b\u5e08\u4e5f\u8bf4\u4e0a\u8bc9\u4e5f\u6ca1\u6709\u610f\u4e49\uff0c\u4f30\u8ba1\u4e0d\u4f1a\u7ee7\u7eed\uff08\u4e0a\u8bc9\uff09\u4e86\u201d\u3002\u6f8e\u6e43\u65b0\u95fb\u6ce8\u610f\u5230\uff0c\u300a\u8d54\u507f\u6cd5\u300b\u89c4\u5b9a\uff0c\u8d54\u507f\u4e49\u52a1\u673a\u5173\u53ef\u4ee5\u5728\u4e24\u4e2a\u6708\u5185\u505a\u51fa\u662f\u5426\u8d54\u507f\u7684\u51b3\u5b9a\u3002\u4ed6\u8bf4\uff0c\u5728\u5411\u5e02\u653f\u5e9c\u63d0\u51fa\u8d54\u507f\u7533\u8bf7\u4e4b\u540e\uff0c\u81ea\u5df1\u53c8\u5411\u6cd5\u9662\u9012\u4ea4\u4e86\u53e6\u5916\u4e00\u4efd\u8bc9\u8bbc\u72b6\uff0c\u8981\u6c42\u786e\u8ba4\u90d1\u5dde\u5e02\u653f\u5e9c\u6cbb\u973e\u4e0d\u4f5c\u4e3a\uff0c\u672a\u4e25\u683c\u5c65\u884c\u5927\u6c14\u6c61\u67d3\u9632\u6cbb\u6cd5\u5b9a\u804c\u8d23\uff0c\u201c\u73b0\u5728\u4e3b\u8981\u770b\u8fd9\u4e2a\u8bc9\u8bbc\u80fd\u5426\u7acb\u6848\u4e86\u3002\u201d\u6f8e\u6e43\u65b0\u95fb\u6b64\u524d\u62a5\u9053\uff0c11\u670820\u65e5\uff0c\u5b59\u6d2a\u5f6c\u5728\u90d1\u5dde\u51fa\u5dee\u65f6\uff0c\u5728\u8be5\u5e02\u5730\u6807\u5efa\u7b51\u4e8c\u4e03\u5854\u9644\u8fd1\u611f\u89c9\u201c\u7279\u522b\u545b\u201d\uff0c\u4ed6\u4fbf\u4e70\u4e86\u4e00\u526f\u4ef7\u503c32\u5143\u7684\u9632\u973e\u53e3\u7f69\u3002\u5f53\u5929\u90d1\u5dde\u5e02AQI\u4e3a253\uff0c\u5c5e\u4e8e\u91cd\u5ea6\u6c61\u67d3\u3002\u5f53\u665a\uff0c\u5b59\u6d2a\u5f6c\u62df\u51fa\u4e00\u4efd\u8bc9\u8bbc\u72b6\uff0c\u79f0\u4f9d\u636e\u300a\u73af\u5883\u4fdd\u62a4\u6cd5\u300b\u53ca\u300a\u5927\u6c14\u6c61\u67d3\u9632\u6cbb\u6cd5\u300b\u89c4\u5b9a\uff0c\u90d1\u5dde\u5e02\u653f\u5e9c\u5e94\u5bf9\u672c\u884c\u653f\u533a\u57df\u7684\u73af\u5883\u8d28\u91cf\u8d1f\u8d23\u3002\u5b59\u6d2a\u5f6c\u8bf7\u6c42\u4f9d\u6cd5\u5224\u4ee4\u88ab\u544a\u8d54\u507f\u572811\u670820\u65e5\u90d1\u5dde\u96fe\u973e\u671f\u95f4\u7684\u53e3\u7f69\u8d2d\u4e70\
u8d39\u7528\uff0c\u5e76\u5224\u4ee4\u88ab\u544a\u627f\u62c5\u672c\u6848\u8bc9\u8bbc\u8d39\u3002\u65b0\u4e61\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u66fe\u4e8e11\u670825\u65e5\u7ec4\u6210\u4e86\u5408\u8bae\u5ead\uff0c\u53d7\u7406\u6b64\u6848\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class CrawlSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class CrawlDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 
85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class JnuxshcSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class JnuxshcDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 
74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /机器学习入门/label_propagation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 11:28 4 | # @Author : mazicwong 5 | # @File : label_propagation.py 6 | 7 | import time 8 | import numpy as np 9 | 10 | 11 | # return k neighbors index 12 | def navie_knn(dataSet, query, k): 13 | numSamples = dataSet.shape[0] 14 | 15 | ## step 1: calculate Euclidean distance 16 | diff = np.tile(query, (numSamples, 1)) - dataSet 17 | squaredDiff = diff ** 2 18 | squaredDist = np.sum(squaredDiff, axis=1) # sum is performed by row 19 | 20 | ## step 2: sort the distance 21 | sortedDistIndices = np.argsort(squaredDist) 22 | if k > len(sortedDistIndices): 23 | k = len(sortedDistIndices) 24 | 25 | return sortedDistIndices[0:k] 26 | 27 | 28 | # build a big graph (normalized weight matrix) 29 | def buildGraph(MatX, kernel_type, rbf_sigma=None, knn_num_neighbors=None): 30 | num_samples = MatX.shape[0] 31 | affinity_matrix = np.zeros((num_samples, num_samples), np.float32) 32 | if kernel_type == 'rbf': 33 | if rbf_sigma == None: 34 | raise ValueError('You should input a sigma of rbf kernel!') 35 | for i in range(num_samples): 36 | row_sum = 0.0 37 | for j in range(num_samples): 38 | diff = MatX[i, :] - MatX[j, :] 39 | affinity_matrix[i][j] = np.exp(sum(diff ** 2) / (-2.0 * rbf_sigma ** 2)) 40 | row_sum += affinity_matrix[i][j] 41 | affinity_matrix[i][:] /= row_sum 42 | elif kernel_type == 'knn': 43 | if knn_num_neighbors == None: 44 | raise ValueError('You should input a k of knn kernel!') 45 | for i in range(num_samples): 46 | k_neighbors = navie_knn(MatX, MatX[i, :], knn_num_neighbors) 47 | affinity_matrix[i][k_neighbors] = 1.0 / knn_num_neighbors 48 | else: 49 | raise NameError('Not support kernel type! 
You can use knn or rbf!') 50 | 51 | return affinity_matrix 52 | 53 | 54 | # label propagation 55 | def labelPropagation(Mat_Label, Mat_Unlabel, labels, kernel_type='rbf', rbf_sigma=1.5, \ 56 | knn_num_neighbors=10, max_iter=500, tol=1e-3): 57 | # initialize 58 | num_label_samples = Mat_Label.shape[0] 59 | num_unlabel_samples = Mat_Unlabel.shape[0] 60 | num_samples = num_label_samples + num_unlabel_samples 61 | labels_list = np.unique(labels) 62 | num_classes = len(labels_list) 63 | 64 | MatX = np.vstack((Mat_Label, Mat_Unlabel)) 65 | clamp_data_label = np.zeros((num_label_samples, num_classes), np.float32) 66 | for i in range(num_label_samples): 67 | clamp_data_label[i][labels[i]] = 1.0 68 | 69 | label_function = np.zeros((num_samples, num_classes), np.float32) 70 | label_function[0: num_label_samples] = clamp_data_label 71 | label_function[num_label_samples: num_samples] = -1 72 | 73 | # graph construction 74 | affinity_matrix = buildGraph(MatX, kernel_type, rbf_sigma, knn_num_neighbors) 75 | 76 | # start to propagation 77 | iter = 0; 78 | pre_label_function = np.zeros((num_samples, num_classes), np.float32) 79 | changed = np.abs(pre_label_function - label_function).sum() 80 | while iter < max_iter and changed > tol: 81 | if iter % 1 == 0: 82 | print 83 | "---> Iteration %d/%d, changed: %f" % (iter, max_iter, changed) 84 | pre_label_function = label_function 85 | iter += 1 86 | 87 | # propagation 88 | label_function = np.dot(affinity_matrix, label_function) 89 | 90 | # clamp 91 | label_function[0: num_label_samples] = clamp_data_label 92 | 93 | # check converge 94 | changed = np.abs(pre_label_function - label_function).sum() 95 | 96 | # get terminate label of unlabeled data 97 | unlabel_data_labels = np.zeros(num_unlabel_samples) 98 | for i in range(num_unlabel_samples): 99 | unlabel_data_labels[i] = np.argmax(label_function[i + num_label_samples]) 100 | 101 | return unlabel_data_labels 102 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TutotialSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 
41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class TutotialDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20171129/013590.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "013590", "comments": {"link": "http://coral.qq.com/2259249504"}, "date": "20171129", "contents": {"link": "https://news.qq.com/a/20171129/013590.htm", "title": ["\u8054\u901a\u7545\u6e38\u51b0\u6fc0\u51cc\u5957\u9910\uff0c\u8bed\u97f3\u3001\u6d41\u91cf\u5168\u56fd\u7545\u723d\u4f7f\u7528"], "passage": "\u4e2d\u56fd\u8054\u901a\u6b63\u5f0f\u63a8\u51fa\u7545\u723d\u51b0\u6fc0\u51cc\u5957\u9910\uff0c\u542b\u8d85\u5927\u6d41\u91cf\u3001\u8d85\u591a\u8bed\u97f3\uff0c\u53ef\u5728\u5168\u56fd\u8303\u56f4\u5185\u7545\u723d\u4f7f\u7528\u3002\u4e0d\u9650\u6d41\u91cf\u3001\u4e0d\u9650\u8bed\u97f3\u7684\u51b0\u6fc0\u51cc\u5957\u9910\uff0c\u5c06\u7ed9\u7528\u6237\u5e26\u6765\u590f\u5929\u4eab\u53d7\u51b0\u6fc0\u51cc\u4e00\u6837\u7684\u7545\u723d\u611f\uff0c\u662f\u4e00\u6b3e\u5f70\u663e\u8054\u901a\u4e2a\u6027\u7684\u4ea7\u54c1\uff0c\u4f5c\u4e3a\u4e1a\u754c\u9996\u521b\uff0c\u51b0\u6fc0\u51cc\u5957\u9910\u4ea7\u54c1\u663e\u793a\u4e86\u4e2d\u56fd\u8054\u901a\u4e00\u76f4\u4ee5\u6765\u4fdd\u6301\u7740\u7684\u6d3b\u529b\u4e0e\u521b\u65b0\u3002\u6d41\u91cf\u4e0d\u9650\u91cf\uff0c\u7d27\u8ddf\u5f53\u4e0b\u5e74\u8f7b\u4eba\u7231\u8ffd\u5267\u3001\u7231\u76f4\u64ad\u7684\u6d88\u8d39\u4e60\u60ef\uff0c\u540c\u65f6\u6ee1\u8db3\u7ecf\u5e38\u51fa\u5dee\u3001\u65c5\u6e38\u7b49\u5546\u65c5\u4eba\u58eb\u7684\u9700\u6c42\u3002\u901a\u8bdd\u4e0d\u9650\u91cf\uff0c\u8ba9\u7528\u6237\u4e0e\u5bb6\u4eba\u670b\u53cb\u8fdb\u884c\u901a\u8bdd\u65f6\uff0c\u4e0d\u518d\u957f\u8bdd\u77ed\u8bf4\uff0c\u5b9e\u73b0\u771f\u6b63\u610f\u4e49\u4e0a\u7684\u7545\u723d\u804a\u5929\uff0c\u5168\u56fd\u901a\u7528\uff0c\u65e0\u6f2b\u6e38\u3001\u957f\u9014\u8d39\u7528\u4ea7\u751f\u3002\u73b0\u767b\u5f55\u8054\u901a\u7f51\u4e0a\u8425\u4e1a\u5385\uff0c\u5373\u53ef\u9996\u670899\u5143\u4eab\u53d7\u4e0d\u9650\u91cf\u7684\u51b0\u6fc0\u51cc\u5957\u9910\uff1b\u9884\u5b5899\u5143\u9001100\u5143\uff0c\u6708\u8d39\u6c38\u4e455\u6298\uff08\u539f\u4ef7398\uff0c\u73b0\u4ec5\u9700\u6708\u8d39199\uff09\uff1b\u4ec5\u9650\u8054\u901a\u7f51\u4e0a\u5546\u57ce\u529e\u7406\u7528\u6237\u3002\u751f\u65e5\u53f7\u3001\u60c5\u4fa3\u53f7\u7b49\u968f\u610f\u9009\uff0c\u8ba9\u4f60\u7684\u624b\u673a\u53f7\u4e0d\u518d\u662f\u51b7\u51b0\u51b0\u7684\u4e00\u7ec4\u6570\u5b57\u3002\u4e2d\u56fd\u8054\u901a\u4ee5\u7528\u6237\u5229\u76ca\u4e3a\u6838\u5fc3\uff0c\u5df2\u5b8c\u6210\u4e00\u7cfb\u5217\u521b\u65b0\u52a8\u4f5c\uff0c\u6b64\u524d\uff0c\u8054\u5408\u4e92\u8054\u7f51\u516c\u53f8\u63a8\u51fa\u4e86\u8682\u8681\u5b9d\u5361\u3001\u817e\u8baf\u738b\u5361\u7b49\u521b\u65b0\u4ea7\u54c1\uff0c\u6b64\u6b21\uff0c\u63a8\u51fa\u7684\u5168\u56fd\u7545\u723d\u51b0\u6fc0\u51cc\u5957\u9910\uff0c 
\u4e5f\u662f\u54cd\u5e94\u56fd\u5bb6\u63d0\u901f\u964d\u8d39\u653f\u7b56\uff0c\u8df5\u884c\u201c\u6d41\u91cf\u653e\u5fc3\u7528\u201d\u7684\u53c8\u4e00\u529b\u4e3e\u3002\u672a\u6765\uff0c\u4e2d\u56fd\u8054\u901a\u5c06\u628a\u51b0\u6fc0\u51cc\u5957\u9910\u4f5c\u4e3a\u4ea7\u54c1\u4f18\u5316\u7684\u6807\u6746\uff0c\u4ee5\u96f6\u6346\u7ed1\u3001\u6d41\u91cf\u8d85\u591a\u3001\u64cd\u4f5c\u7b80\u5355\u3001\u65b9\u4fbf\u7528\u6237\u4f7f\u7528\u7b49\u4e3a\u4ea7\u54c1\u4f18\u5316\u7684\u539f\u5219\uff0c\u63a8\u51fa\u66f4\u591a\u201c\u7c7b\u51b0\u6fc0\u51cc\u5957\u9910\u201d\u4ea7\u54c1\uff0c\u5728\u8bed\u97f3\u3001\u6d41\u91cf\u4eab\u53d7\u8d85\u7ea7\u989d\u5ea6\u7684\u57fa\u7840\u4e0a\uff0c\u5b9e\u73b0\u7ec8\u7aef\u5957\u9910\u4e0d\u6346\u7ed1\u3001\u6863\u4f4d\u968f\u610f\u66f4\u6362\u3001\u5957\u9910\u6863\u4f4d\u7cbe\u7b80\u3001\u65b0\u8001\u7528\u6237\u4f18\u60e0\u540c\u4eab\u7b49\u7279\u70b9\u7684\u4ea7\u54c1\u4f18\u5316\uff0c\u4e3a\u7528\u6237\u5e26\u6765\u66f4\u52a0\u653e\u5fc3\u7684\u4f7f\u7528\u4f53\u9a8c\uff0c\u5e76\u4ece\u591a\u4e2a\u5c42\u9762\u4e30\u5bcc\u8054\u901a\u201c\u6c834G+\u201d\u6781\u901f\u7f51\u7edc\u7684\u5320\u5fc3\u610f\u4e49\u3002\u51b0\u6fc0\u51cc\u5957\u9910\u5df2\u5728\u5168\u56fd\u8303\u56f4\u5185\u9646\u7eed\u4e0a\u5e02\u53d1\u552e\uff0c\u8be6\u8be210010\u6216\u54a8\u8be2\u5f53\u5730\u8425\u4e1a\u5385\u3002http://www.10010.com/goodsdetail/111711031180.html\uff08\u6ce8\uff1a\u6b64\u6587\u5c5e\u4e8e\u817e\u8baf\u7f51\u767b\u8f7d\u7684\u5546\u4e1a\u4fe1\u606f\uff0c\u6587\u7ae0\u5185\u5bb9\u4e0d\u4ee3\u8868\u672c\u7f51\u89c2\u70b9\uff0c\u4ec5\u4f9b\u53c2\u8003\uff09"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HIR5JP0001875P.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HIR5JP0001875P", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8HIR5JP0001875P.html"}, "newsId": "D8HIR5JP0001875P", "contents": {"passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u7537\u5b50\u5192\u5145\u6559\u80b2\u5c40\u957f\u642d\u8baa\u5973\u751f\u79f0\u5176\u53ef\u5b89\u6392\u5de5\u4f5c \u804c\u6821\u5973\u5b69\u88ab\u9a974000\u5143\uff09\n

\"\u7537\u5b50\u5192\u5145\u6559\u80b2\u5c40\u957f\u81ea\u79f0\u53ef\u5b89\u6392\u5de5\u4f5c

\u5c01\u9762\u65b0\u95fb\u8baf 1\u670818\u65e5\uff0c\u7ef5\u9633\u67d0\u804c\u682117\u5c81\u5973\u751f\u8def\u8fc7\u9752\u5e74\u5e7f\u573a\u65f6\uff0c\u4e00\u7537\u5b50\u642d\u8baa\u81ea\u79f0\u662f\u6559\u80b2\u5c40\u526f\u5c40\u957f\uff0c\u53ef\u4ee5\u4e3a\u5176\u5b89\u6392\u5de5\u4f5c\uff0c\u9a97\u5f97\u5973\u5b69\u4fe1\u4efb\u3002\u4ea4\u8c08\u540e\u8be5\u7537\u5b50\u9a6c\u4e0a\u53c2\u52a0\u8001\u5c40\u957f\u751f\u65e5\u5bb4\uff0c\u9a97\u5f97\u5973\u5b694000\u5143\u751f\u6d3b\u8d39\u3002

18\u65e5\u4e0b\u53483\u70b9\uff0c\u7279\u5de1\u8b66\u652f\u961f\u5de1\u903b\u4e00\u5927\u961f\u63a5\u5230\u62a5\u8b66\u79f0\uff0c\u5728\u9752\u5e74\u5e7f\u573a\u6709\u4e00\u5973\u5b69\u88ab\u9a97\u3002\u6c11\u8b66\u8d76\u5230\u73b0\u573a\u4e86\u89e3\u5230\u5973\u5b69\u59d3\u656c\uff0c\u4eca\u5e7417\u5c81\uff0c\u7ef5\u9633\u67d0\u804c\u6821\u5b66\u751f\uff0c\u5973\u5b69\u54ed\u8bc9\u5979\u88ab\u4e00\u4e2a\u81ea\u79f0\u6559\u80b2\u5c40\u526f\u5c40\u957f\u7684\u9a97\u5b50\u9a97\u8d70\u4e864000\u5143\u3002

\u201c\u4eca\u5929\u4e0b\u5348\u5979\u8def\u8fc7\u9752\u5e74\u5e7f\u573a\u65f6\uff0c\u4e00\u4e2d\u5e74\u7537\u5b50\u548c\u5979\u642d\u8baa\uff0c\u8bf4\u5979\u5f88\u50cf\u540c\u4e8b\u7684\u5973\u513f\uff0c\u8fd8\u8868\u626c\u5979\u957f\u5f97\u6f02\u4eae\uff0c\u7537\u5b50\u53c8\u95ee\u5c0f\u656c\u591a\u5927\u4e86\uff0c\u662f\u5b66\u751f\u5417\uff1f\u5728\u90a3\u4e2a\u5b66\u6821\u4e0a\u5b66\uff1f\u201d\u5c0f\u656c\u544a\u8bc9\u8b66\u65b9\uff0c\u5979\u6ca1\u6709\u9632\u5907\uff0c\u90fd\u4e00\u4e00\u56de\u7b54\uff0c\u63a5\u7740\u7537\u5b50\u8bf4\u81ea\u5df1\u662f\u6559\u80b2\u5c40\u7684\u674e\u526f\u5c40\u957f\uff0c\u7b49\u5c0f\u656c\u6bd5\u4e1a\u4e86\u53ef\u4ee5\u5e2e\u52a9\u5979\u5b89\u6392\u5de5\u4f5c\u3002

\n

\u542c\u8bf4\u53ef\u4ee5\u5b89\u6392\u5de5\u4f5c\uff0c\u5c0f\u656c\u89c9\u5f97\u81ea\u5df1\u9047\u5230\u8d35\u4eba\u4e86\uff0c\u5f7c\u6b64\u76f8\u8c08\u751a\u6b22\u3002\u6b64\u65f6\u8fd9\u540d\u674e\u526f\u5c40\u957f\u8bf4\uff0c\u4ed6\u4e0a\u5348\u521a\u5f00\u5b8c\u4f1a\u8fd9\u4f1a\u8981\u53bb\u53c2\u52a0\u8001\u5c40\u957f\u7684\u751f\u65e5\u5bb4\uff0c\u7531\u4e8e\u6ca1\u5e26\u5361\u6ca1\u6cd5\u53d6\u94b1\uff0c\u8bf7\u5c0f\u656c\u5e2e\u4ed6\u5148\u62ff\u70b9\u94b1\u3002\u201c\u4ed6\u95ee\u6211\u6709\u591a\u5c11\u94b1\uff0c\u6b63\u597d\u8eab\u4e0a\u67094000\u5143\u751f\u6d3b\u8d39\u3002\u201d\u6beb\u65e0\u9632\u5907\u7684\u5973\u5b69\u76f8\u4fe1\u4e86\u526f\u5c40\u957f\u6682\u65f6\u501f\u7528\u4f1a\u8fd8\u94b1\u7684\u8bf4\u6cd5\uff0c\u5c064000\u5143\u94b1\u5168\u90e8\u62ff\u7ed9\u4e86\u4ed6\uff0c\u770b\u7740\u5f88\u5feb\u6d88\u5931\u5728\u4eba\u7fa4\u4e2d\u7684\u526f\u5c40\u957f\uff0c\u5c0f\u656c\u624d\u5f00\u59cb\u6000\u7591\uff0c\u8d8a\u60f3\u8d8a\u4e0d\u5bf9\u52b2\uff0c\u4e8e\u662f\u7acb\u5373\u62a5\u8b66\uff0c\u76ee\u524d\u8b66\u65b9\u5df2\u5c55\u5f00\u8fdb\u4e00\u6b65\u8c03\u67e5\u3002

\u8b66\u65b9\u63d0\u9192\u5e02\u6c11\uff0c\u9a97\u5b50\u4f1a\u7279\u610f\u7784\u51c6\u90a3\u4e9b\u6d89\u4e8b\u4e0d\u6df1\uff0c\u5584\u826f\u7684\u5c0f\u5973\u5b69\u884c\u9a97\uff0c\u5b66\u6821\u548c\u5bb6\u957f\u8981\u591a\u52a0\u5f3a\u8fd9\u65b9\u9762\u7684\u6559\u80b2\uff0c\u5c0f\u5b69\u81ea\u5df1\u4e5f\u6700\u597d\u4e0d\u8981\u56de\u5e94\u964c\u751f\u4eba\u4e3b\u52a8\u642d\u8baa\u3002

", "link": "http://news.163.com/18/0119/18/D8HIR5JP0001875P.html", "title": ["\u7537\u5b50\u5192\u5145\u6559\u80b2\u5c40\u957f\u81ea\u79f0\u53ef\u5b89\u6392\u5de5\u4f5c \u5973\u5b69\u88ab\u9a974000\u5143"]}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180116/D897H80K0001899O.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D897H80K0001899O", "date": "20180116", "source": "netease", "comments": {"link": "http://comment.news.163.com/news2_bbs/D897H80K0001899O.html"}, "contents": {"title": ["\u4e60\u8fd1\u5e73\u5e94\u7ea6\u540c\u7f8e\u56fd\u603b\u7edf\u7279\u6717\u666e\u901a\u7535\u8bdd"], "link": "http://news.163.com/18/0116/12/D897H80K0001899O.html", "passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u4e60\u8fd1\u5e73\u5e94\u7ea6\u540c\u7f8e\u56fd\u603b\u7edf\u7279\u6717\u666e\u901a\u7535\u8bdd\uff09\n

\u592e\u89c6\u65b0\u95fb\u5ba2\u6237\u7aef1\u670816\u65e5\u6d88\u606f\uff0c\u56fd\u5bb6\u4e3b\u5e2d\u4e60\u8fd1\u5e7316\u65e5\u5e94\u7ea6\u540c\u7f8e\u56fd\u603b\u7edf\u7279\u6717\u666e\u901a\u7535\u8bdd\u3002

\u4e60\u8fd1\u5e73\u6307\u51fa\uff0c\u8fc7\u53bb\u7684\u4e00\u5e74\uff0c\u4e2d\u7f8e\u5173\u7cfb\u603b\u4f53\u4fdd\u6301\u7a33\u5b9a\u5e76\u53d6\u5f97\u91cd\u8981\u8fdb\u5c55\u3002\u4fdd\u6301\u4e2d\u7f8e\u5173\u7cfb\u5065\u5eb7\u7a33\u5b9a\u53d1\u5c55\uff0c\u7b26\u5408\u4e24\u56fd\u548c\u4e24\u56fd\u4eba\u6c11\u5229\u76ca\uff0c\u4e5f\u662f\u56fd\u9645\u793e\u4f1a\u5171\u540c\u671f\u5f85\u3002\u53cc\u65b9\u8981\u4fdd\u6301\u9ad8\u5c42\u53ca\u5404\u7ea7\u522b\u4ea4\u5f80\uff0c\u5145\u5206\u53d1\u63254\u4e2a\u9ad8\u7ea7\u522b\u5bf9\u8bdd\u673a\u5236\u4f5c\u7528\u5e76\u9002\u65f6\u4e3e\u529e\u7b2c\u4e8c\u8f6e\u5bf9\u8bdd\u3002\u4e2d\u7f8e\u7ecf\u8d38\u5408\u4f5c\u7ed9\u4e24\u56fd\u4eba\u6c11\u5e26\u6765\u8bb8\u591a\u5b9e\u5b9e\u5728\u5728\u7684\u5229\u76ca\u3002\u53cc\u65b9\u5e94\u8be5\u91c7\u53d6\u5efa\u8bbe\u6027\u65b9\u5f0f\uff0c\u901a\u8fc7\u5bf9\u5f7c\u6b64\u5f00\u653e\u5e02\u573a\u3001\u505a\u5927\u5408\u4f5c\u86cb\u7cd5\uff0c\u59a5\u5584\u89e3\u51b3\u53cc\u65b9\u5173\u5207\u7684\u7ecf\u8d38\u95ee\u9898\u3002\u8981\u79ef\u6781\u63a8\u8fdb\u4e24\u519b\u3001\u6267\u6cd5\u3001\u7981\u6bd2\u3001\u4eba\u6587\u3001\u5730\u65b9\u7b49\u5408\u4f5c\uff0c\u5c31\u91cd\u5927\u56fd\u9645\u548c\u5730\u533a\u95ee\u9898\u4fdd\u6301\u5bc6\u5207\u6c9f\u901a\u534f\u5546\u3002\u53cc\u65b9\u8981\u76f8\u5411\u800c\u884c\u3001\u76f8\u4e92\u5c0a\u91cd\u3001\u805a\u7126\u5408\u4f5c\uff0c\u4ee5\u5efa\u8bbe\u6027\u65b9\u5f0f\u5904\u7406\u654f\u611f\u95ee\u9898\uff0c\u5c0a\u91cd\u5f7c\u6b64\u6838\u5fc3\u5229\u76ca\u548c\u91cd\u5927\u5173\u5207\uff0c\u7ef4\u62a4\u4e2d\u7f8e\u5173\u7cfb\u5065\u5eb7\u7a33\u5b9a\u53d1\u5c55\u52bf\u5934\u3002

\n

\u7279\u6717\u666e\u8868\u793a\uff0c\u7f8e\u65b9\u9ad8\u5ea6\u91cd\u89c6\u5bf9\u534e\u5173\u7cfb\u548c\u7f8e\u4e2d\u5408\u4f5c\uff0c\u613f\u540c\u4e2d\u65b9\u4e00\u9053\uff0c\u52a0\u5f3a\u9ad8\u5c42\u53ca\u5404\u7ea7\u522b\u4ea4\u5f80\uff0c\u62d3\u5c55\u52a1\u5b9e\u9886\u57df\u5408\u4f5c\uff0c\u5904\u7406\u597d\u4e24\u56fd\u7ecf\u8d38\u4e2d\u7684\u95ee\u9898\uff0c\u63a8\u52a8\u53cc\u8fb9\u5173\u7cfb\u53d6\u5f97\u66f4\u5927\u53d1\u5c55\u3002

\u4e60\u8fd1\u5e73\u5e94\u8be2\u4ecb\u7ecd\u4e86\u5bf9\u5f53\u524d\u671d\u9c9c\u534a\u5c9b\u5c40\u52bf\u7684\u770b\u6cd5\uff0c\u6307\u51fa\u671d\u9c9c\u534a\u5c9b\u5f62\u52bf\u51fa\u73b0\u4e00\u4e9b\u79ef\u6781\u53d8\u5316\u3002\u5404\u65b9\u5e94\u8be5\u5171\u540c\u52aa\u529b\u628a\u6765\u4e4b\u4e0d\u6613\u7684\u7f13\u548c\u52bf\u5934\u5ef6\u7eed\u4e0b\u53bb\uff0c\u4e3a\u91cd\u542f\u5bf9\u8bdd\u8c08\u5224\u521b\u9020\u6761\u4ef6\u3002\u5b9e\u73b0\u671d\u9c9c\u534a\u5c9b\u65e0\u6838\u5316\uff0c\u7ef4\u62a4\u671d\u9c9c\u534a\u5c9b\u548c\u5e73\u7a33\u5b9a\u7b26\u5408\u5404\u65b9\u5171\u540c\u5229\u76ca\uff0c\u7ef4\u62a4\u56fd\u9645\u793e\u4f1a\u5728\u8fd9\u4e2a\u95ee\u9898\u4e0a\u7684\u56e2\u7ed3\u5341\u5206\u91cd\u8981\u3002\u4e2d\u65b9\u613f\u7ee7\u7eed\u540c\u5305\u62ec\u7f8e\u65b9\u5728\u5185\u7684\u56fd\u9645\u793e\u4f1a\u4e00\u9053\uff0c\u5bc6\u5207\u6c9f\u901a\u3001\u76f8\u4e92\u4fe1\u4efb\u3001\u76f8\u4e92\u5c0a\u91cd\u3001\u52a0\u5f3a\u5408\u4f5c\uff0c\u63a8\u52a8\u671d\u9c9c\u534a\u5c9b\u95ee\u9898\u671d\u7740\u59a5\u5584\u89e3\u51b3\u7684\u65b9\u5411\u4e0d\u65ad\u53d6\u5f97\u8fdb\u5c55\u3002

\u7279\u6717\u666e\u8868\u793a\uff0c\u7f8e\u65b9\u91cd\u89c6\u4e2d\u65b9\u5728\u671d\u9c9c\u534a\u5c9b\u95ee\u9898\u4e0a\u7684\u91cd\u8981\u4f5c\u7528\uff0c\u613f\u7ee7\u7eed\u52a0\u5f3a\u540c\u4e2d\u65b9\u7684\u6c9f\u901a\u534f\u8c03\u3002

"}, "cmtId": "D897H80K0001899O"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/006769.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "006769", "comments": {"link": "http://coral.qq.com/2369397397"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/006769.htm", "title": ["\u7fa4\u4f17\u53cd\u6620\u996e\u6c34\u95ee\u9898\u88ab\u603c\u201c\u4f60\u4e0d\u662f\u4eba\u6c11\u201d \u5f53\u4e8b\u793e\u533a\u4e66\u8bb0\u88ab\u514d\u804c"], "passage": "\u5468\u65ed \u622a\u5c4f\u56fe2018\u5e741\u670819\u65e5\u665a8\u65f6\u8bb8\uff0c\u6210\u90fd\u5e02\u6e29\u6c5f\u533a\u6c38\u5b81\u9547\u4eba\u6c11\u653f\u5e9c\u5b98\u65b9\u5fae\u535a\u53d1\u5e03\u6d88\u606f\u79f0\uff1a\u7ecf\u6838\u5b9e\uff0c\u5929\u738b\u793e\u533a\u515a\u603b\u652f\u4e66\u8bb0\u5468\u65ed\u5728\u63a5\u5f85\u7fa4\u4f17\u8fc7\u7a0b\u4e2d\u6001\u5ea6\u751f\u786c\uff0c\u8a00\u8bed\u4e0d\u5f53\uff0c\u9020\u6210\u8d1f\u9762\u5f71\u54cd\uff0c\u6709\u635f\u57fa\u5c42\u515a\u5458\u5e72\u90e8\u5f62\u8c61\u30021\u670819\u65e5\uff0c\u7ecf\u9547\u515a\u59d4\u7814\u7a76\uff0c\u51b3\u5b9a\u514d\u53bb\u5468\u65ed\u5929\u738b\u793e\u533a\u515a\u603b\u652f\u4e66\u8bb0\u804c\u52a1\u3002\u4e00\u6bb5\u88ab\u66dd\u5149\u7684\u89c6\u9891\u663e\u793a\uff0c\u8fd1\u65e5\uff0c\u5728\u6210\u90fd\u5e02\u6c38\u5b81\u9547\u4eba\u6c11\u653f\u5e9c\u503c\u73ed\u5ba4\u5185\uff0c\u6709\u7fa4\u4f17\u53cd\u6620\u996e\u7528\u6c34\u76f8\u5173\u95ee\u9898\uff0c\u5929\u738b\u793e\u533a\u515a\u603b\u652f\u4e66\u8bb0\u5468\u65ed\u5728\u63a5\u5f85\u8fc7\u7a0b\u4e2d\u5bf9\u7fa4\u4f17\u79f0\uff0c\u201c\u4e3a\u4eba\u6c11\u670d\u52a1\u4e0d\u662f\u4e3a\u516c\u6c11\u670d\u52a1\uff0c\u4f60\u4e0d\u662f\u4eba\u6c11\u201d\uff0c\u5f15\u53d1\u5e7f\u6cdb\u8206\u8bba\u5173\u6ce8\u3002\u89c6\u9891\u4e2d\uff0c\u5468\u65ed\u75285\u5206\u949f\u7ed9\u6765\u8bbf\u7fa4\u4f17\u8bb2\u89e3\u201c\u516c\u6c11\u201d\u4e0e\u201c\u4eba\u6c11\u201d\u7684\u533a\u522b\uff0c\u4e0d\u65f6\u7fd8\u7740\u4e8c\u90ce\u817f\uff0c\u6001\u5ea6\u968f\u610f\uff0c\u5e76\u79f0\u201c\u4f60\u76d1\u7763\u4e0d\u5230\u6211\u201d\u30021\u670819\u65e5\u665a\uff0c\u5f53\u4e8b\u4eba\u9648\u5973\u58eb\u544a\u8bc9\u6f8e\u6e43\u65b0\u95fb\u8bb0\u8005\uff0c\u7531\u4e8e\u5de5\u7a0b\u65bd\u5de5\uff0c\u5979\u6240\u5c45\u4f4f\u7684\u5730\u65b9\u51e0\u5e74\u524d\u5730\u4e0b\u6c34\u67af\u7aed\uff0c\u540e\u7531\u793e\u533a\u534f\u8c03\u9001\u6c34\u89e3\u51b3\u65e5\u5e38\u7528\u6c34\u30022017\u5e7412\u670831\u65e5\uff0c\u5979\u8ba1\u5212\u5f53\u65e5\u5728\u5bb6\u4e3e\u529e\u751f\u65e5\u5bb4\u5e2d\uff0c\u5e76\u63d0\u524d\u4e24\u5929\u5411\u793e\u533a\u63d0\u51fa\u7528\u6c34\u7533\u8bf7\uff0c\u4f4612\u670830\u65e5\u4e2d\u5348\uff0c\u996e\u6c34\u4ecd\u6ca1\u6709\u9001\u5230\u3002\u201c31\u53f7\u65e9\u4e0a5\u70b9\u53a8\u5e08\u5c31\u8981\u8fc7\u6765\uff0c\u6ca1\u529e\u6cd5\u53ea\u80fd\u53c8\u8dd1\u8fc7\u53bb\u53cd\u6620\u60c5\u51b5\u3002\u201d\u9648\u5973\u58eb\u8bf4\uff0c\u5979\u548c\u5bb6\u4eba\u5148\u5230\u5929\u738b\u793e\u533a\uff0c\u540e\u53c8\u5230\u6e29\u6c5f\u533a\u653f\u5e9c\uff0c\u4e00\u76f4\u7b49\u523031\u65e5\u51cc\u6668\uff0c\u88ab\u544a\u77e5\u793e\u533a\u5c06\u5b89\u6392\u4eba\u5904\u7406\uff0c\u8ba9\u5979\u4eec\u5230\u6c38\u5b81\u9547\u4eba\u6c11\u653f\u5e9c\u53bb\u7b49\u5f85\u3002\u5230\u8fbe\u6c38\u5b81\u9547\u4eba\u6c11\u653f\u5e9c\u540e\uff0c\u5929\u738b\u793e\u533a\u515a\u603b\u652f\u4e66\u8bb0\u5468\u65ed\u63a5\u5f85\u4e86\u5979\u4eec\u3002\u201c\u4e0
d\u662f\u6765\u89e3\u51b3\u95ee\u9898\uff0c\u4e00\u5f00\u59cb\u5c31\u7ed9\u6211\u4eec\u2018\u666e\u6cd5\u2019\uff0c\u8bf4\u6211\u4eec\u4e0d\u662f\u4eba\u6c11\u3002\u201d\u9648\u5973\u58eb\u79f0\uff0c\u996e\u6c34\u6700\u7ec8\u6ca1\u6709\u9001\u6765\uff0c\u5979\u53ea\u597d\u8ba9\u4eb2\u4eba\u5e2e\u5fd9\u81ea\u5df1\u8fd0\u6c34\u8fc7\u6765\uff0c\u53c8\u4e70\u4e86\u4e9b\u6876\u88c5\u6c34\u56de\u6765\u3002\u56e0\u4e3a\u6c34\u4e0d\u591f\u7528\uff0c\u539f\u8ba1\u5212\u8bf7\u5ba220\u684c\uff0c\u6700\u540e\u53ea\u529e\u4e8613\u684c\u3002\u89c6\u9891\u66dd\u5149\u540e\uff0c\u5f15\u53d1\u5e7f\u6cdb\u70ed\u8bae\u3002 \u9648\u5973\u58eb\u544a\u8bc9\u6f8e\u6e43\u65b0\u95fb\u8bb0\u8005\uff0c\u5f53\u5730\u653f\u5e9c\u4e5f\u76f8\u5f53\u91cd\u89c6\uff0c19\u65e5\u4e0b\u53482\u65f6\u8bb8\uff0c\u6e29\u6c5f\u533a\u7eaa\u59d4\u76d1\u5bdf\u5c40\u7684\u5de5\u4f5c\u4eba\u5458\u8054\u7cfb\u5979\uff0c\u5c31\u4e8b\u60c5\u7684\u7ecf\u8fc7\u8fdb\u884c\u4e86\u8be2\u95ee\uff0c\u5e76\u505a\u4e86\u7b14\u5f55\uff0c\u8be2\u95ee\u6301\u7eed\u4e863\u4e2a\u591a\u5c0f\u65f6\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/010301.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "010301", "comments": {"link": "http://coral.qq.com/2369810132"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/010301.htm", "title": ["\u7f8e\u8230\u8fdb\u5165\u4e2d\u56fd\u5357\u6d77\u9ec4\u5ca9\u5c9b\u9644\u8fd1\u9886\u6d77 \u5916\u4ea4\u90e8\u3001\u56fd\u9632\u90e8\u5f3a\u786c\u8868\u6001"], "passage": "\u65b0\u534e\u793e\u5317\u4eac1\u670820\u65e5\u6d88\u606f\uff0c\u5916\u4ea4\u90e8\u53d1\u8a00\u4eba\u9646\u6177\u5f53\u65e5\u5c31\u7f8e\u56fd\u4e00\u8258\u5bfc\u5f39\u9a71\u9010\u8230\u8fdb\u5165\u9ec4\u5ca9\u5c9b12\u6d77\u91cc\u5185\u6d77\u57df\u7b54\u8bb0\u8005\u95ee\u65f6\u8868\u793a\uff0c\u4e2d\u56fd\u6d77\u519b\u4f9d\u6cd5\u5bf9\u7f8e\u8230\u8fdb\u884c\u4e86\u8bc6\u522b\u67e5\u8bc1\uff0c\u4e88\u4ee5\u8b66\u544a\u9a71\u79bb\u3002\u6709\u8bb0\u8005\u95ee\uff1a\u636e\u4e86\u89e3\uff0c1\u670817\u65e5\u665a\uff0c\u7f8e\u56fd\u201c\u970d\u73c0\u201d\u53f7\u5bfc\u5f39\u9a71\u9010\u8230\u4ece\u9ec4\u5ca9\u5c9b\u897f\u5357\u4fa7\u8fdb\u5165\u8be5\u5c9b12\u6d77\u91cc\u8303\u56f4\u3002\u4e2d\u65b9\u5bf9\u6b64\u6709\u4f55\u8bc4\u8bba\uff1f\u9646\u6177\u8bf4\uff0c1\u670817\u65e5\u665a\uff0c\u7f8e\u56fd\u201c\u970d\u73c0\u201d\u53f7\u5bfc\u5f39\u9a71\u9010\u8230\u672a\u7ecf\u4e2d\u56fd\u653f\u5e9c\u5141\u8bb8\uff0c\u64c5\u81ea\u8fdb\u5165\u4e2d\u56fd\u9ec4\u5ca9\u5c9b12\u6d77\u91cc\u5185\u6d77\u57df\u3002\u4e2d\u56fd\u6d77\u519b\u4f9d\u6cd5\u5bf9\u7f8e\u8230\u8fdb\u884c\u4e86\u8bc6\u522b\u67e5\u8bc1\uff0c\u4e88\u4ee5\u8b66\u544a\u9a71\u79bb\u3002\u9646\u6177\u8868\u793a\uff0c\u7f8e\u65b9\u519b\u8230\u6709\u5173\u884c\u4e3a\u635f\u5bb3\u4e2d\u56fd\u7684\u4e3b\u6743\u548c\u5b89\u5168\u5229\u76ca\uff0c\u5bf9\u4e2d\u65b9\u5728\u6709\u5173\u6d77\u57df\u5f00\u5c55\u6b63\u5e38\u516c\u52a1\u6d3b\u52a8\u7684\u8239\u53ea\u548c\u4eba\u5458\u5b89\u5168\u9020\u6210\u4e25\u91cd\u5a01\u80c1\uff0c\u8fdd\u80cc\u56fd\u9645\u5173\u7cfb\u57fa\u672c\u51c6\u5219\u3002\u4e2d\u65b9\u5bf9\u6b64\u8868\u793a\u5f3a\u70c8\u4e0d\u6ee1\uff0c\u5c06\u91c7\u53d6\u5fc5\u8981\u63aa\u65bd\uff0c\u575a\u5b9a\u7ef4\u62a4\u4e2d\u56fd\u4e3b\u6743\u3002\u9646\u6177\u8868\u793a\uff0c\u4e2d\u56fd\u5bf9\u9ec4\u5ca9\u5c9b\u53ca\u5176\u9644\u8fd1\u6d77\u57df\u62e5\u6709\u65e0\u53ef\u4e89\u8fa9\u7684\u4e3b\u6743\u3002\u4e2d\u65b9
\u4e00\u5411\u5c0a\u91cd\u548c\u7ef4\u62a4\u5404\u56fd\u4f9d\u636e\u56fd\u9645\u6cd5\u5728\u5357\u6d77\u4eab\u6709\u7684\u822a\u884c\u548c\u98de\u8d8a\u81ea\u7531\uff0c\u4f46\u575a\u51b3\u53cd\u5bf9\u4efb\u4f55\u56fd\u5bb6\u4ee5\u822a\u884c\u548c\u98de\u8d8a\u81ea\u7531\u4e3a\u540d\uff0c\u635f\u5bb3\u4e2d\u56fd\u7684\u4e3b\u6743\u548c\u5b89\u5168\u5229\u76ca\u3002\u201c\u6211\u4eec\u5f3a\u70c8\u6566\u4fc3\u7f8e\u65b9\u7acb\u5373\u7ea0\u6b63\u9519\u8bef\uff0c\u505c\u6b62\u6b64\u7c7b\u6311\u8845\u884c\u4e3a\uff0c\u4ee5\u514d\u635f\u5bb3\u4e2d\u7f8e\u5173\u7cfb\u548c\u5730\u533a\u548c\u5e73\u7a33\u5b9a\u3002\u201d\u56fd\u9632\u90e8\u7f511\u670820\u65e5\u6d88\u606f\uff0c1\u670817\u65e5\uff0c\u7f8e\u56fd\u6d77\u519b\u201c\u970d\u73c0\u201d\u53f7\u5bfc\u5f39\u9a71\u9010\u8230\u64c5\u81ea\u8fdb\u5165\u4e2d\u56fd\u9ec4\u5ca9\u5c9b\u90bb\u8fd1\u6d77\u57df\uff0c\u4e2d\u56fd\u6d77\u519b\u201c\u9ec4\u5c71\u201d\u53f7\u5bfc\u5f39\u62a4\u536b\u8230\u5f53\u5373\u884c\u52a8\uff0c\u5bf9\u7f8e\u8230\u8fdb\u884c\u8bc6\u522b\u67e5\u8bc1\uff0c\u5e76\u4e88\u4ee5\u8b66\u544a\u9a71\u79bb\u3002\u5f53\u524d\uff0c\u5728\u4e2d\u56fd\u548c\u4e1c\u76df\u56fd\u5bb6\u7684\u5171\u540c\u52aa\u529b\u4e0b\uff0c\u5357\u6d77\u5c40\u52bf\u4e0d\u65ad\u8d8b\u7a33\u5411\u597d\u3002\u5728\u6b64\u5f62\u52bf\u4e0b\uff0c\u7f8e\u65b9\u4e00\u518d\u6d3e\u9063\u519b\u8230\u975e\u6cd5\u8fdb\u5165\u4e2d\u56fd\u5357\u6d77\u5c9b\u7901\u90bb\u8fd1\u6d77\u57df\uff0c\u5371\u53ca\u53cc\u65b9\u8230\u673a\u548c\u4eba\u5458\u5b89\u5168\uff0c\u5a01\u80c1\u4e2d\u56fd\u4e3b\u6743\u548c\u5b89\u5168\uff0c\u7834\u574f\u5730\u533a\u548c\u5e73\u7a33\u5b9a\uff0c\u4e0e\u4e24\u56fd\u4e24\u519b\u5173\u7cfb\u7a33\u5b9a\u53d1\u5c55\u7684\u52bf\u5934\u80cc\u9053\u800c\u9a70\u3002\u6211\u4eec\u5e0c\u671b\u7f8e\u65b9\u5c0a\u91cd\u4e2d\u65b9\u4e3b\u6743\uff0c\u5c0a\u91cd\u57df\u5185\u56fd\u5bb6\u7684\u52aa\u529b\uff0c\u4e0d\u8981\u65e0\u4e8b\u751f\u975e\uff0c\u5174\u98ce\u4f5c\u6d6a\u3002\u4e2d\u56fd\u519b\u961f\u5c06\u7ee7\u7eed\u5c65\u884c\u9632\u536b\u804c\u8d23\uff0c\u52a0\u5927\u6d77\u7a7a\u5de1\u903b\u8b66\u6212\u529b\u5ea6\uff0c\u575a\u5b9a\u634d\u536b\u56fd\u5bb6\u7684\u4e3b\u6743\u548c\u5b89\u5168\uff0c\u575a\u5b9a\u7ef4\u62a4\u5730\u533a\u548c\u5e73\u7a33\u5b9a\u3002"}} -------------------------------------------------------------------------------- /机器学习入门/标签传播算法(LP).py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 11:28 4 | # @Author : mazicwong 5 | # @File : 标签传播算法(LP).py 6 | import time 7 | import math 8 | import numpy as np 9 | from label_propagation import labelPropagation 10 | 11 | 12 | # show 13 | def show(Mat_Label, labels, Mat_Unlabel, unlabel_data_labels): 14 | import matplotlib.pyplot as plt 15 | 16 | for i in range(Mat_Label.shape[0]): 17 | if int(labels[i]) == 0: 18 | plt.plot(Mat_Label[i, 0], Mat_Label[i, 1], 'Dr') 19 | elif int(labels[i]) == 1: 20 | plt.plot(Mat_Label[i, 0], Mat_Label[i, 1], 'Db') 21 | else: 22 | plt.plot(Mat_Label[i, 0], Mat_Label[i, 1], 'Dy') 23 | 24 | for i in range(Mat_Unlabel.shape[0]): 25 | if int(unlabel_data_labels[i]) == 0: 26 | plt.plot(Mat_Unlabel[i, 0], Mat_Unlabel[i, 1], 'or') 27 | elif int(unlabel_data_labels[i]) == 1: 28 | plt.plot(Mat_Unlabel[i, 0], Mat_Unlabel[i, 1], 'ob') 29 | else: 30 | plt.plot(Mat_Unlabel[i, 0], Mat_Unlabel[i, 1], 'oy') 31 | 32 | plt.xlabel('X1'); 33 | plt.ylabel('X2') 34 | plt.xlim(0.0, 12.) 35 | plt.ylim(0.0, 12.) 
36 | plt.show() 37 | 38 | 39 | def loadCircleData(num_data): 40 | center = np.array([5.0, 5.0]) 41 | radiu_inner = 2 42 | radiu_outer = 4 43 | num_inner = num_data // 3 44 | num_outer = num_data - num_inner 45 | 46 | data = [] 47 | theta = 0.0 48 | for i in range(int(num_inner)): 49 | pho = (theta % 360) * math.pi / 180 50 | tmp = np.zeros(2, np.float32) 51 | tmp[0] = radiu_inner * math.cos(pho) + np.random.rand(1) + center[0] 52 | tmp[1] = radiu_inner * math.sin(pho) + np.random.rand(1) + center[1] 53 | data.append(tmp) 54 | theta += 2 55 | 56 | theta = 0.0 57 | for i in range(int(num_outer)): 58 | pho = (theta % 360) * math.pi / 180 59 | tmp = np.zeros(2, np.float32) 60 | tmp[0] = radiu_outer * math.cos(pho) + np.random.rand(1) + center[0] 61 | tmp[1] = radiu_outer * math.sin(pho) + np.random.rand(1) + center[1] 62 | data.append(tmp) 63 | theta += 1 64 | 65 | Mat_Label = np.zeros((2, 2), np.float32) 66 | Mat_Label[0] = center + np.array([-radiu_inner + 0.5, 0]) 67 | Mat_Label[1] = center + np.array([-radiu_outer + 0.5, 0]) 68 | labels = [0, 1] 69 | Mat_Unlabel = np.vstack(data) 70 | return Mat_Label, labels, Mat_Unlabel 71 | 72 | 73 | def loadBandData(num_unlabel_samples): 74 | # Mat_Label = np.array([[5.0, 2.], [5.0, 8.0]]) 75 | # labels = [0, 1] 76 | # Mat_Unlabel = np.array([[5.1, 2.], [5.0, 8.1]]) 77 | 78 | Mat_Label = np.array([[5.0, 2.], [5.0, 8.0]]) 79 | labels = [0, 1] 80 | num_dim = Mat_Label.shape[1] 81 | Mat_Unlabel = np.zeros((num_unlabel_samples, num_dim), np.float32) 82 | Mat_Unlabel[:num_unlabel_samples // 2, :] = (np.random.rand(num_unlabel_samples // 2, num_dim) - 0.5) * np.array( 83 | [3, 1]) + Mat_Label[0] 84 | Mat_Unlabel[num_unlabel_samples // 2: num_unlabel_samples, :] = (np.random.rand(num_unlabel_samples // 2, 85 | num_dim) - 0.5) * np.array([3, 1]) + \ 86 | Mat_Label[1] 87 | return Mat_Label, labels, Mat_Unlabel 88 | 89 | 90 | # main function 91 | if __name__ == "__main__": 92 | num_unlabel_samples = 800 93 | # Mat_Label, labels, Mat_Unlabel = loadBandData(num_unlabel_samples) 94 | Mat_Label, labels, Mat_Unlabel = loadCircleData(num_unlabel_samples) 95 | 96 | ## Notice: when using 'rbf' as the kernel, the choice of the hyperparameter 'sigma' is very important! It should be 97 | ## chosen according to your dataset, specifically the typical distance between two data points: it should ensure that 98 | ## each point has roughly 10 effective neighbours (i.e. w_i,j is large enough). It also influences the speed of 99 | ## convergence, so the 'knn' kernel may be the better choice! 100 | # unlabel_data_labels = labelPropagation(Mat_Label, Mat_Unlabel, labels, kernel_type = 'rbf', rbf_sigma = 0.2) 101 | unlabel_data_labels = labelPropagation(Mat_Label, Mat_Unlabel, labels, kernel_type='knn', knn_num_neighbors=10, 102 | max_iter=400) 103 | show(Mat_Label, labels, Mat_Unlabel, unlabel_data_labels) -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HJ6VRF0001875O.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HJ6VRF0001875O", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news_guoji2_bbs/D8HJ6VRF0001875O.html"}, "newsId": "D8HJ6VRF0001875O", "contents": {"passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u5370\u5a92\u79f0\u4e2d\u56fd\u5728\u6d1e\u6717\u9644\u8fd1\u4fee\u5efa\u5e9e\u5927\u519b\u4e8b\u8bbe\u65bd \u4e2d\u65b9\u56de\u5e94\uff09\n

\u6d77\u5916\u7f511\u670819\u65e5\u7535\u00a0\u5916\u4ea4\u90e8\u53d1\u8a00\u4eba\u9646\u617719\u65e5\u4e3b\u6301\u4f8b\u884c\u8bb0\u8005\u4f1a\uff0c\u5c31\u8fd1\u671f\u70ed\u70b9\u8fdb\u884c\u56de\u5e94\u3002\u76f8\u5173\u5185\u5bb9\u5982\u4e0b\uff1a

\u95ee\uff1a\u5a92\u4f53\u62ab\u9732\u7684\u6700\u65b0\u536b\u661f\u56fe\u50cf\u663e\u793a\uff0c\u4e2d\u56fd\u6b63\u5728\u8ddd\u6d1e\u6717\u5bf9\u5cd9\u53d1\u751f\u5730\u5f88\u8fd1\u7684\u5730\u65b9\u4fee\u5efa\u5e9e\u5927\u7684\u519b\u4e8b\u8bbe\u65bd\u3002\u5370\u5ea6\u5916\u4ea4\u90e8\u53d1\u8868\u58f0\u660e\u91cd\u7533\u8be5\u8bbe\u65bd\u5e76\u4e0d\u5728\u5bf9\u5cd9\u5730\u533a\u3002\u4f46\u8fd9\u5728\u5370\u5ea6\u653f\u515a\u4e2d\u5f15\u53d1\u4e86\u62c5\u5fe7\u3002\u5370\u5ea6\u5916\u4ea4\u90e8\u8fd8\u79f0\u201c\u6b64\u524d\u5bf9\u5cd9\u5730\u70b9\u7684\u73b0\u72b6\u5e76\u672a\u53d1\u751f\u6539\u53d8\u201d\u3002\u4e2d\u65b9\u5bf9\u6709\u5173\u62a5\u9053\u6709\u4f55\u8bc4\u8bba\uff1f

\u7b54\uff1a\u6211\u521a\u521a\u6ce8\u610f\u5230\u6709\u5173\u62a5\u9053\uff0c\u4e0d\u4e86\u89e3\u5177\u4f53\u60c5\u51b5\uff0c\u4e5f\u4e0d\u6e05\u695a\u4f60\u6240\u8bf4\u7684\u536b\u661f\u56fe\u50cf\u6765\u6e90\u3002

\u76f8\u4fe1\u4f60\u975e\u5e38\u6e05\u695a\u4e2d\u65b9\u5728\u6d1e\u6717\u95ee\u9898\u4e0a\u7684\u7acb\u573a\u3002\u6d1e\u6717\u5730\u533a\u5386\u6765\u5c5e\u4e8e\u4e2d\u56fd\uff0c\u4e00\u76f4\u5728\u4e2d\u56fd\u6709\u6548\u7ba1\u8f96\u4e4b\u4e0b\uff0c\u4e0d\u5b58\u5728\u4e89\u8bae\u3002\u4e3a\u4e86\u5b88\u8fb9\u9700\u8981\u548c\u6539\u5584\u5f53\u5730\u519b\u6c11\u7684\u751f\u4ea7\u751f\u6d3b\u6761\u4ef6\uff0c\u4e2d\u65b9\u957f\u671f\u4ee5\u6765\u4e00\u76f4\u5728\u6d1e\u6717\u5730\u533a\u8fdb\u884c\u5305\u62ec\u9053\u8def\u5728\u5185\u7684\u57fa\u7840\u8bbe\u65bd\u5efa\u8bbe\uff0c\u8fd9\u662f\u4e2d\u65b9\u5728\u81ea\u5df1\u9886\u571f\u4e0a\u7684\u4e3b\u6743\u884c\u4e3a\uff0c\u5b8c\u5168\u6b63\u5f53\u5408\u6cd5\u3002\u6b63\u5982\u4e2d\u65b9\u4e0d\u4f1a\u5bf9\u5370\u65b9\u5728\u5370\u5ea6\u9886\u571f\u4e0a\u7684\u5efa\u8bbe\u6d3b\u52a8\u54c1\u5934\u8bba\u8db3\u4e00\u6837\uff0c\u5176\u4ed6\u56fd\u5bb6\u5bf9\u4e2d\u56fd\u5728\u81ea\u5df1\u9886\u571f\u4e0a\u7684\u4e3b\u6743\u884c\u4e3a\u54c1\u5934\u8bba\u8db3\u4e5f\u662f\u4e0d\u5408\u9002\u7684\u3002

\n

\u95ee\uff1a\u8003\u8651\u5230\u8fd9\u4e2a\u62a5\u9053\u8868\u8fbe\u4e86\u5bf9\u6d1e\u6717\u5730\u533a\u518d\u6b21\u53d1\u751f\u5bf9\u5cd9\u7684\u62c5\u5fe7\u3002\u53bb\u5e74\u7684\u5bf9\u5cd9\u4e8b\u4ef6\u5e94\u8be5\u5df2\u7ecf\u5f97\u5230\u4e86\u89e3\u51b3\uff0c\u4f60\u8ba4\u4e3a\u4f1a\u518d\u6b21\u53d1\u751f\u7c7b\u4f3c\u4e8b\u4ef6\u5417\uff1f

\u7b54\uff1a\u6709\u5173\u5370\u5ea6\u8fb9\u9632\u90e8\u961f\u8d8a\u754c\u9020\u6210\u7684\u6d1e\u6717\u5bf9\u5cd9\u4e8b\u4ef6\uff0c\u524d\u4e24\u5929\u6211\u5df2\u7ecf\u8bf4\u8fc7\uff0c\u5370\u5ea6\u519b\u65b9\u7684\u9ad8\u5b98\u4e5f\u627f\u8ba4\u662f\u5370\u5ea6\u519b\u961f\u8d8a\u754c\u4e86\u3002\u8fd9\u4e00\u4e8b\u4ef6\u4f7f\u4e2d\u5370\u53cc\u8fb9\u5173\u7cfb\u7ecf\u53d7\u4e86\u4e25\u5cfb\u8003\u9a8c\u3002\u6211\u4eec\u5e0c\u671b\u5370\u5ea6\u519b\u65b9\u80fd\u591f\u6c72\u53d6\u6559\u8bad\uff0c\u907f\u514d\u7c7b\u4f3c\u4e8b\u60c5\u518d\u6b21\u53d1\u751f\u3002\u4e2d\u5370\u4e24\u56fd\u9886\u5bfc\u4eba\u5728\u53bb\u5e749\u6708\u91d1\u7816\u56fd\u5bb6\u9886\u5bfc\u4eba\u53a6\u95e8\u4f1a\u6664\u671f\u95f4\uff0c\u5df2\u7ecf\u5c31\u5982\u4f55\u5728\u65b0\u5f62\u52bf\u4e0b\u8fdb\u4e00\u6b65\u6539\u5584\u548c\u53d1\u5c55\u4e2d\u5370\u5173\u7cfb\u8fbe\u6210\u4e86\u91cd\u8981\u5171\u8bc6\u3002\u6211\u4eec\u5e0c\u671b\u5370\u5ea6\u6709\u5173\u65b9\u9762\u80fd\u5207\u5b9e\u9075\u7167\u4e24\u56fd\u9886\u5bfc\u4eba\u8fbe\u6210\u7684\u91cd\u8981\u5171\u8bc6\uff0c\u540c\u4e2d\u65b9\u76f8\u5411\u800c\u884c\uff0c\u5171\u540c\u7ef4\u62a4\u8fb9\u5883\u5730\u533a\u7684\u548c\u5e73\u7a33\u5b9a\uff0c\u5171\u540c\u81f4\u529b\u4e8e\u4e2d\u5370\u5173\u7cfb\u7684\u6539\u5584\u53d1\u5c55\u3002

", "link": "http://news.163.com/18/0119/18/D8HJ6VRF0001875O.html", "title": ["\u5370\u5a92\u79f0\u4e2d\u56fd\u5728\u6d1e\u6717\u9644\u8fd1\u4fee\u5efa\u5e9e\u5927\u519b\u4e8b\u8bbe\u65bd \u4e2d\u65b9\u56de\u5e94"]}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/009612.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "009612", "comments": {"link": "http://coral.qq.com/2369744788"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/009612.htm", "title": ["\u592e\u884c\u53d1\u5e03\u6539\u8fdb\u4e2a\u4eba\u94f6\u884c\u8d26\u6237\u5206\u7c7b\u7ba1\u7406\u901a\u77e5 \u2161\u3001\u2162\u7c7b\u6237\u5f00\u6237\u5c06\u66f4\u4fbf\u6377"], "passage": "\u592e\u5e7f\u7f51\u5317\u4eac1\u670820\u65e5\u6d88\u606f\uff08\u8bb0\u8005\u67f4\u534e\uff09\u636e\u4e2d\u56fd\u4e4b\u58f0\u300a\u592e\u5e7f\u65b0\u95fb\u300b\u62a5\u9053\uff0c\u6628\u5929\uff0819\u65e5\uff09\u665a\u95f4\uff0c\u592e\u884c\u5b98\u7f51\u53d1\u5e03\u300a\u5173\u4e8e\u6539\u8fdb\u4e2a\u4eba\u94f6\u884c\u8d26\u6237\u5206\u7c7b\u7ba1\u7406\u6709\u5173\u4e8b\u9879\u7684\u901a\u77e5\u300b\uff0c\u5ba3\u5e03\u8fdb\u4e00\u6b65\u53d1\u6325\u2162\u7c7b\u6237\u5728\u5c0f\u989d\u652f\u4ed8\u9886\u57df\u7684\u4f5c\u7528\uff0c\u63a8\u52a8\u2161\u3001\u2162\u7c7b\u6237\u6210\u4e3a\u4e2a\u4eba\u529e\u7406\u7f51\u4e0a\u652f\u4ed8\u3001\u79fb\u52a8\u652f\u4ed8\u7b49\u5c0f\u989d\u6d88\u8d39\u4e1a\u52a1\u7684\u4e3b\u8981\u6e20\u9053\u3002\u6839\u636e\u300a\u901a\u77e5\u300b\u548c\u7b54\u8bb0\u8005\u95ee\u7684\u89e3\u91ca\uff0c\u4e00\u662f\u5f00\u6237\u6e20\u9053\u591a\u6837\u3002\u300a\u901a\u77e5\u300b\u8981\u6c42\u56fd\u6709\u5546\u4e1a\u94f6\u884c\u3001\u80a1\u4efd\u5236\u5546\u4e1a\u94f6\u884c\u7b49\u5e94\u4e8e2018\u5e746\u6708\u5e95\u524d\u5b9e\u73b0\u672c\u94f6\u884c\u67dc\u9762\u548c\u7f51\u4e0a\u94f6\u884c\u3001\u624b\u673a\u94f6\u884c\u3001\u76f4\u9500\u94f6\u884c\u3001\u8fdc\u7a0b\u89c6\u9891\u67dc\u5458\u673a\u548c\u667a\u80fd\u67dc\u5458\u673a\u7b49\u7535\u5b50\u6e20\u9053\u529e\u7406\u4e2a\u4eba\u2161\u3001\u2162\u7c7b\u6237\u5f00\u7acb\u7b49\u4e1a\u52a1\uff0c\u5176\u4ed6\u94f6\u884c\u5219\u5e94\u57282018\u5e74\u5e95\u524d\u5b9e\u73b0\u3002\u4e8c\u662f\u5f00\u6237\u624b\u7eed\u7b80\u5316\u3002\u300a\u901a\u77e5\u300b\u660e\u786e\u4e00\u5b9a\u524d\u63d0\u4e0b\u5f00\u7acb\u2161\u3001\u2162\u7c7b\u6237\u65f6\u65e0\u9700\u4e2a\u4eba\u586b\u5199\u8eab\u4efd\u4fe1\u606f\u3001\u51fa\u793a\u8eab\u4efd\u8bc1\u4ef6\u7b49\uff0c\u5728\u6709\u6548\u843d\u5b9e\u8d26\u6237\u5b9e\u540d\u5236\u8981\u6c42\u7684\u540c\u65f6\uff0c\u5927\u5e45\u63d0\u5347\u5f00\u6237\u4f53\u9a8c\u3002\u5176\u6b21\uff0c\u5728\u8d26\u6237\u4f7f\u7528\u65b9\u9762\uff0c\u5728\u6ee1\u8db3\u53cd\u6d17\u94b1\u3001\u53cd\u8bc8\u9a97\u8981\u6c42\u7684\u524d\u63d0\u4e0b\uff0c\u653e\u5bbd\u2162\u7c7b\u6237\u7684\u4f7f\u7528\u9650\u5236\u3002\u4e00\u662f\u975e\u9762\u5bf9\u9762\u7ebf\u4e0a\u5f00\u7acb\u2162\u7c7b\u6237\u80fd\u591f\u63a5\u53d7\u975e\u7ed1\u5b9a\u8d26\u6237\u5165\u91d1\uff0c\u4ee5\u6ee1\u8db3\u4e2a\u4eba\u4e4b\u95f4\u5c0f\u989d\u6536\u4ed8\u6b3e\u3001\u53d1\u653e\u7ea2\u5305\u3001\u4e0e\u4e2a\u4eba\u652f\u4ed8\u8d26\u6237\u5bf9\u63a5\u3001\u94f6\u884c\u6216\u5546\u6237\u5c0f\u989d\u8fd4\u73b0\u5956\u52b1\u7b49\u573a\u666f\u9700\u6c42\u3002\u4e8c\u662f\u2162\u7c7b\u6237\u8d26\u6237\u4f59\u989d\u4ece1000\u5143\u63d0\u5347\u4e3a2000\u5143\u3002\u4e09\u662f\u5141\u8bb8\u94f6\u884c\u5411\u2162\u7c7b\u6237\u53d1\
u653e\u672c\u884c\u5c0f\u989d\u6d88\u8d39\u8d37\u6b3e\u5e76\u901a\u8fc7\u2162\u7c7b\u6237\u8fd8\u6b3e\uff0c\u9f13\u52b1\u94f6\u884c\u57fa\u4e8e\u2162\u7c7b\u6237\u63d0\u4f9b\u66f4\u591a\u5143\u5316\u7684\u4ea7\u54c1\u8bbe\u8ba1\u548c\u529f\u80fd\u7ec4\u5408\u3002\u592e\u884c\u8868\u793a\uff0c\u300a\u901a\u77e5\u300b\u91c7\u53d6\u4e86\u591a\u79cd\u5b89\u5168\u9632\u8303\u63aa\u65bd\u3002\u4e00\u662f\u5c06\u2162\u7c7b\u6237\u6d88\u8d39\u548c\u7f34\u8d39\u652f\u4ed8\u3001\u975e\u7ed1\u5b9a\u8d26\u6237\u8d44\u91d1\u8f6c\u51fa\u7b49\u51fa\u91d1\u7684\u65e5\u7d2f\u8ba1\u9650\u989d\u4ece\u539f5000\u5143\u4e0b\u8c03\u81f32000\u5143\uff0c\u5e74\u7d2f\u8ba1\u9650\u989d\u4ece\u539f10\u4e07\u5143\u4e0b\u8c03\u4e3a5\u4e07\u5143\uff0c\u901a\u8fc7\u63a7\u5236\u2162\u7c7b\u6237\u652f\u51fa\u989d\u5ea6\uff0c\u786e\u4fdd\u98ce\u9669\u76f8\u5bf9\u53ef\u63a7\u3002\u4e8c\u662f\u89c4\u5b9a\u975e\u9762\u5bf9\u9762\u7ebf\u4e0a\u5f00\u7acb\u7684\u2162\u7c7b\u6237\u901a\u8fc7\u7ed1\u5b9a\u8d26\u6237\u5165\u91d1\u540e\uff0c\u624d\u53ef\u63a5\u53d7\u975e\u7ed1\u5b9a\u8d26\u6237\u5165\u91d1\uff0c\u9632\u8303\u4e0d\u6cd5\u5206\u5b50\u901a\u8fc7\u83b7\u53d6\u4ed6\u4eba\u8eab\u4efd\u4fe1\u606f\u548c\u94f6\u884c\u8d26\u6237\u4fe1\u606f\u540e\u5192\u540d\u5f00\u7acb\u3002\u4e09\u662f\u89c4\u5b9a\u540c\u4e00\u5bb6\u94f6\u884c\u901a\u8fc7\u7ebf\u4e0a\u4e3a\u540c\u4e00\u4e2a\u4eba\u53ea\u80fd\u5f00\u7acb\u4e00\u4e2a\u5141\u8bb8\u975e\u7ed1\u5b9a\u8d26\u6237\u5165\u91d1\u7684\u2162\u7c7b\u6237\uff0c\u9632\u6b62\u4e0d\u6cd5\u5206\u5b50\u901a\u8fc7\u5f00\u7acb\u591a\u4e2a\u6b64\u7c7b\u8d26\u6237\u53d8\u76f8\u6269\u5927\u2162\u7c7b\u6237\u7684\u8f6c\u8d26\u9650\u989d\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20171009/039986.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "039986", "comments": {"link": "http://coral.qq.com/2166352744"}, "date": "20171009", "contents": {"link": "https://news.qq.com/a/20171009/039986.htm", "title": ["\u8bbe\u8ba1\u7f8e\u5b66\uff0c\u8ba9\u6b27\u7c73\u8304\u6d77\u9a6cAqua Terra\u8155\u8868\u7115\u7136\u4e00\u65b0"], "passage": "[]\u6b27\u7c73\u8304\u63a8\u51fa\u6d77\u9a6c\u7cfb\u5217Aqua Terra\u5168\u65b0\u8868\u6b3e\uff0c\u5728\u5907\u53d7\u6b22\u8fce\u7684\u8bbe\u8ba1\u57fa\u7840\u4e0a\uff0c\u878d\u5165\u5de7\u5999\u9769\u65b0\uff0c\u8d4b\u4e88\u65f6\u8ba1\u5168\u65b0\u5916\u89c2\u3002\u6b27\u7c73\u8304\u5168\u65b0\u53d1\u5e03\u7684\u6d77\u9a6c\u7cfb\u5217 Aqua Terra \u81f3\u81fb\u5929\u6587\u53f0\u8868\uff0c\u7b80\u7ea6\u3001\u5927\u6c14\uff0c\u5448\u73b0\u5e73\u8861\u4e4b\u7f8e\u3002\u8868\u6b3e\u5728\u5907\u53d7\u6b22\u8fce\u7684\u8bbe\u8ba1\u5143\u7d20\u4e2d\u878d\u5165\u8bf8\u591a\u5de7\u5999\u9769\u65b0\uff0c\u8d4b\u4e88\u65f6\u8ba1\u5168\u65b0\u5916\u89c2\uff0c\u901a\u8fc7\u81f3\u81fb\u5929\u6587\u53f0\u8ba4\u8bc1\uff0c\u521b\u9020\u7684\u4f18\u96c5\u5353\u8d8a\u9b45\u529b\u65f6\u8ba1\uff0c\u4ee4\u4eba\u96be\u4ee5\u6297\u62d2\u3002\u6b27\u7c73\u8304\u6d77\u9a6c\u7cfb\u5217 Aqua Terra 
\u8155\u8868\u914d\u5907\u4e09\u89d2\u5f62\u5c0f\u65f6\u523b\u5ea6\uff0c\u98ce\u683c\u5927\u6c14\u7eaf\u7cb9\uff0c\u540c\u65f6\u62e5\u6709\u5f88\u9ad8\u7684\u6613\u8bfb\u6027\u3002\u8fd9\u6b21\uff0c\u6b27\u7c73\u8304\u5c06\u8868\u76d8\u8bbe\u8ba1\u518d\u7b80\u5316\uff0c\u5728\u4fdd\u7559\u7ecf\u5178\u7684\u5f27\u5f62\u8868\u8033\u7684\u57fa\u7840\u4e0a\u5bf9\u8868\u58f3\u8fdb\u884c\u4e86\u91cd\u65b0\u8bbe\u8ba1\uff0c\u4e3a\u8868\u80cc\u589e\u6dfb\u6ce2\u7eb9\u8fb9\u7f18\uff0c\u4ee4\u6574\u679a\u8155\u8868\u5c55\u73b0\u5bf9\u79f0\u4e4b\u7f8e\u3002\u539f\u672c\u8868\u76d8\u4e0a\u7684\u9632\u6c34\u7cfb\u6570\u5b57\u6837\u88ab\u8f6c\u79fb\u81f3\u8868\u80cc\uff0c\u65e5\u671f\u7a97\u53e3\u4e5f\u75313\u70b9\u4f4d\u7f6e\u8c03\u6574\u81f36\u70b9\u4f4d\u7f6e\uff0c\u7528\u4ee5\u81f4\u656c1952\u5e74\u63a8\u51fa\u7684\u9996\u6b3e\u5e26\u6709\u65e5\u671f\u7a97\u663e\u793a\u7684\u6b27\u7c73\u8304\u8155\u8868\uff0c\u8d2f\u5f7b\u5bf9\u79f0\u7b80\u7ea6\u7684\u8bbe\u8ba1\u7f8e\u5b66\u3002\u67da\u6728\u7eb9\u7406\u8868\u76d8\u582a\u79f0\u6b27\u7c73\u8304\u6d77\u9a6c\u7cfb\u5217 Aqua Terra \u8155\u8868\u6700\u4e3a\u663e\u8457\u7684\u7279\u5f81\uff0c\u5176\u8bbe\u8ba1\u7075\u611f\u6765\u6e90\u4e8e\u6e38\u8247\u4e0a\u7684\u67da\u6728\u7532\u677f\u30022017\u5e74\uff0c\u6b27\u7c73\u8304\u5c06\u6807\u5fd7\u6027\u7684\u5782\u76f4\u7eb9\u7406\u53d8\u4e3a\u6c34\u5e73\u7eb9\u7406\uff0c\u4ee4\u6574\u679a\u8155\u8868\u66f4\u663e\u7cbe\u81f4\uff0c\u7115\u53d1\u5d2d\u65b0\u9b45\u529b\u3002\u8bbe\u8ba1\u7b80\u6d01\u53c8\u4e0d\u5931\u7cbe\u81f4\uff0c\u5448\u73b0\u4e86\u4e0e\u6d77\u6d0b\u76f8\u5951\u5408\u7684\u4f11\u95f2\u751f\u6d3b\u65b9\u5f0f\u3002\u6b27\u7c73\u8304\u5728\u6b64\u6b3e\u8868\u5e26\u8bbe\u8ba1\u4e0a\u4e5f\u5320\u5fc3\u72ec\u8fd0\uff0c\u90e8\u5206\u8868\u6b3e\u642d\u914d\u6a61\u80f6\u8868\u5e26\uff0c\u521b\u9020\u6027\u5730\u901a\u8fc7\u7cbe\u94a2\u6216Sedna\u00ae 18K\u91d1\u94fe\u8282\u5c06\u8868\u5e26\u4e0e\u8868\u58f3\u76f8\u8fde\uff0c\u4ee4\u8155\u8868\u62e5\u6709\u8212\u9002\u79f0\u624b\u7684\u4f69\u5e26\u611f\u53d7\uff0c\u66f4\u52a0\u5bcc\u6709\u8fd0\u52a8\u6c14\u606f\u3002\u91d1\u5c5e\u8868\u94fe\u5219\u8fd0\u7528\u4e86\u6b27\u7c73\u8304\u4e13\u5229\u7684\u94fe\u9488\u8868\u94fe\u4e0e\u66f4\u4e3a\u575a\u56fa\u7684\u94fe\u8282\uff0c\u6574\u4f53\u7f8e\u611f\u500d\u589e\u3002\u6b27\u7c73\u8304\u6d77\u9a6c\u7cfb\u5217 Aqua Terra \u81f3\u81fb\u5929\u6587\u53f0\u8868\u7cfb\u5217\u62e5\u670941mm\u548c38mm\u4e24\u79cd\u8868\u58f3\u5c3a\u5bf8\u3002\u8155\u8868\u8868\u58f3\u91c7\u7528\u7cbe\u94a2\u3001Sedna\u00ae 18K\u91d1\u6216\u7cbe\u94a2\u4e0eSedna\u00ae 18K\u91d1\u6df7\u5408\u6253\u9020\u800c\u6210\uff0c\u5177\u6709\u4e30\u5bcc\u7684\u8868\u6b3e\u53ef\u4f9b\u9009\u62e9\u3002\u540c\u65f6\u8155\u8868\u8fd8\u53ef\u642d\u914d\u7cbe\u94a2\u8868\u94fe\u3001\u76ae\u9769\u8868\u5e26\u6216\u9020\u578b\u7cbe\u81f4\u7684\u6a61\u80f6\u8868\u5e26\u3002\u591a\u79cd\u4e0d\u540c\u8868\u6b3e\uff0c\u642d\u914d\u7537\u58eb\u72ec\u4e00\u65e0\u4e8c\u7684\u98ce\u683c\u3002\u9646\u5730\u4e0e\u6d77\u6d0b\u3001\u4f20\u627f\u4e0e\u521b\u65b0\u3001\u5de5\u4f5c\u4e0e\u4f11\u95f2\uff0c\u8fd9\u5c31\u662f\u6b27\u7c73\u8304\u6d77\u9a6c\u7cfb\u5217 Aqua Terra 
\u81f3\u81fb\u5929\u6587\u53f0\u8868\u6240\u878d\u5408\u7684\u72ec\u7279\u9b45\u529b\uff0c\u4ee4\u5176\u6210\u4e3a\u5de5\u4f5c\u751f\u6d3b\uff0c\u65f6\u5c1a\u642d\u914d\u4e2d\u7684\u81f3\u81fb\u4e4b\u9009\u3002\u8bf7\u70b9\u51fb\u94fe\u63a5\uff0c\u4e86\u89e3\u66f4\u591a\u4ea7\u54c1\u4fe1\u606f\u3002\uff08\u6ce8\uff1a\u6b64\u6587\u5c5e\u4e8e\u767b\u8f7d\u7684\u5546\u4e1a\u4fe1\u606f\uff0c\u6587\u7ae0\u5185\u5bb9\u4e0d\u4ee3\u8868\u672c\u7f51\u89c2\u70b9\uff0c\u4ec5\u4f9b\u53c2\u8003\uff09"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/004124.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "004124", "comments": {"link": "http://coral.qq.com/2369229201"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/004124.htm", "title": ["\u5927\u96fe\u9ec4\u8272\u9884\u8b66\uff018\u7701\u6709\u6d53\u96fe \u5c40\u5730\u80fd\u89c1\u5ea6\u4e0d\u8db350\u7c73"], "passage": "\u4e2d\u65b0\u7f511\u670820\u65e5\u7535 \u636e\u4e2d\u592e\u6c14\u8c61\u53f0\u7f51\u7ad9\u6d88\u606f\uff0c\u4e2d\u592e\u6c14\u8c61\u53f01\u670820\u65e506\u65f6\u7ee7\u7eed\u53d1\u5e03\u5927\u96fe\u9ec4\u8272\u9884\u8b66\uff0c\u9884\u8ba1\uff0c20\u65e5\u65e9\u6668\u81f3\u4e0a\u5348\uff0c\u5c71\u4e1c\u5317\u90e8\u548c\u5357\u90e8\u3001\u6cb3\u5357\u4e1c\u90e8\u3001\u6e56\u5317\u4e2d\u4e1c\u90e8\u3001\u6c5f\u82cf\u5927\u90e8\u3001\u5b89\u5fbd\u5927\u90e8\u3001\u6d59\u6c5f\u5317\u90e8\u3001\u91cd\u5e86\u4e2d\u90e8\u3001\u8d35\u5dde\u5317\u90e8\u548c\u4e2d\u90e8\u7b49\u5730\u6709\u5927\u96fe\uff0c\u5176\u4e2d\u6c5f\u82cf\u4e2d\u5317\u90e8\u3001\u5b89\u5fbd\u4e2d\u5317\u90e8\u3001\u6cb3\u5357\u4e1c\u90e8\u3001\u6e56\u5317\u4e2d\u90e8\u7b49\u5730\u7684\u90e8\u5206\u5730\u533a\u6709\u80fd\u89c1\u5ea6\u4f4e\u4e8e500\u7c73\u7684\u6d53\u96fe\uff0c\u5c40\u5730\u6709\u80fd\u89c1\u5ea6\u4e0d\u8db350\u7c73\u7684\u7279\u5f3a\u6d53\u96fe\u3002\u6b64\u5916\uff0c20\u81f321\u65e5\u6cb3\u5317\u5357\u90e8\u3001\u6cb3\u5357\u3001\u5c71\u4e1c\u4e2d\u897f\u90e8\u3001\u6c5f\u82cf\u3001\u5b89\u5fbd\u7b49\u5730\u91cd\u6c61\u67d3\u5929\u6c14\u7ef4\u6301\uff0c\u5176\u4e2d21\u65e5\u53d7\u504f\u4e1c\u8def\u51b7\u7a7a\u6c14\u5f71\u54cd\uff0c\u6cb3\u5317\u4e1c\u90e8\u7b49\u5730\u91cd\u6c61\u67d3\u5929\u6c14\u7565\u6709\u51cf\u5f31\u300222\u65e5\u591c\u95f4\u8d77\uff0c\u53d7\u8f83\u5f3a\u51b7\u7a7a\u6c14\u5f71\u54cd\uff0c\u533a\u57df\u91cd\u6c61\u67d3\u5929\u6c14\u81ea\u5317\u5411\u5357\u9010\u6e10\u51cf\u5f31\u6d88\u6563\u300220\u65e5\uff0c\u53d7\u51b7\u7a7a\u6c14\u5f71\u54cd\uff0c\u5185\u8499\u53e4\u4e1c\u5317\u90e8\u3001\u4e1c\u5317\u5730\u533a\u7b49\u5730\u6709\u5927\u98ce\u964d\u6e29\u5929\u6c14\uff0c\u964d\u6e29\u5e45\u5ea6\u57284~6\u2103\uff0c\u5c40\u5730\u53ef\u8fbe8\u2103\u4ee5\u4e0a\uff0c\u5e76\u4f34\u67094~6\u7ea7\u98ce\u300222\u65e5\u8d77\uff0c\u65b0\u4e00\u80a1\u51b7\u7a7a\u6c14\u5c06\u5f71\u54cd\u6211\u56fd\u4e2d\u4e1c\u90e8\u5730\u533a\uff0c\u957f\u6c5f\u4e2d\u4e0b\u6e38\u53ca\u5176\u4ee5\u5317\u5730\u533a\u67094~6\u7ea7\u504f\u5317\u98ce\uff0c\u4e2d\u4e1c\u90e8\u5927\u90e8\u5730\u533a\u6c14\u6e29\u5c06\u4e0b\u964d4~8\u2103\uff0c\u5185\u8499\u53e4\u3001\u4e1c\u5317\u5730\u533a\u4e1c\u90e8\u5c40\u5730\u964d\u6e2910\u2103\u4ee5\u4e0a\u3002\u672a\u6765\u4e09\u5929\u9884\u62a5\u65b9\u9762\uff0c20\u65e508\u65f6\u81f321\u65e508\u65f6\uff0c\u65b0\u7586\u4f0a\u7281\u6cb3\u8c37\u548c\u5929\u5c71\u5730\u533a\u3001\u7518\u8083\u897f\u90e8\u3001\u6cb3\u5317\u5317\u90e8\u3001\u5185\u8499\u53e4\
u4e2d\u90e8\u504f\u5357\u5730\u533a\u7b49\u5730\u6709\u5c0f\u5230\u4e2d\u96ea\u6216\u9635\u96ea\uff1b\u897f\u5357\u5730\u533a\u4e1c\u90e8\u3001\u6c5f\u6c49\u5357\u90e8\u3001\u6c5f\u5357\u897f\u90e8\u548c\u5317\u90e8\u3001\u534e\u5357\u897f\u90e8\u7b49\u5730\u6709\u5c0f\u5230\u4e2d\u96e8\u3002\u5185\u8499\u53e4\u897f\u90e8\u3001\u9ed1\u9f99\u6c5f\u5317\u90e8\u7b49\u5730\u67094~6\u7ea7\u98ce\u300221\u65e508\u65f6\u81f322\u65e508\u65f6\uff0c\u5357\u65b9\u964d\u6c34\u8303\u56f4\u6269\u5927\u3002\u534e\u5317\u5317\u90e8\u3001\u5185\u8499\u53e4\u4e2d\u90e8\u504f\u5357\u3001\u5c71\u4e1c\u4e1c\u5317\u90e8\u7b49\u5730\u6709\u5c0f\u96ea\u6216\u96e8\u5939\u96ea\uff0c\u5176\u4e2d\uff0c\u6cb3\u5317\u5317\u90e8\u3001\u5c71\u4e1c\u534a\u5c9b\u5c40\u5730\u6709\u4e2d\u96ea\uff1b\u6c5f\u6dee\u4e1c\u90e8\u548c\u5357\u90e8\u3001\u6e56\u5317\u5357\u90e8\u3001\u6c5f\u5357\u3001\u897f\u5357\u5730\u533a\u4e1c\u5357\u90e8\u3001\u534e\u5357\u897f\u90e8\u548c\u5317\u90e8\u7b49\u5730\u6709\u5c0f\u5230\u4e2d\u96e8\u3002\u5185\u8499\u53e4\u4e2d\u897f\u90e8\u3001\u8fbd\u4e1c\u534a\u5c9b\u3001\u5c71\u4e1c\u534a\u5c9b\u7b49\u5730\u67094~6\u7ea7\u98ce\u300222\u65e508\u65f6\u81f323\u65e508\u65f6\uff0c\u6cb3\u5317\u4e2d\u90e8\u3001\u5c71\u4e1c\u5317\u90e8\u3001\u4e1c\u5317\u5730\u533a\u4e1c\u5357\u90e8\u7b49\u5730\u6709\u5c0f\u96ea\u6216\u96e8\u5939\u96ea\uff1b\u897f\u5357\u5730\u533a\u4e1c\u90e8\u3001\u6e56\u5317\u897f\u90e8\u3001\u6e56\u5357\u897f\u90e8\u548c\u5357\u90e8\u3001\u5e7f\u897f\u7b49\u5730\u6709\u5c0f\u96e8\u3002\u5185\u8499\u53e4\u5927\u90e8\u3001\u534e\u5317\u3001\u8fbd\u5b81\u3001\u9ec4\u6dee\u4e1c\u90e8\u7b49\u5730\u67094~6\u7ea7\u98ce\u3002\u4e1c\u6d77\u5927\u90e8\u3001\u5357\u6d77\u4e1c\u5317\u90e8\u5c06\u67096~8\u7ea7\u3001\u9635\u98ce9\u7ea7\u5927\u98ce\u3002\u9632\u5fa1\u6307\u5357\uff1a1\u3001\u7531\u4e8e\u80fd\u89c1\u5ea6\u8f83\u4f4e\uff0c\u9a7e\u9a76\u4eba\u5458\u5e94\u63a7\u5236\u901f\u5ea6\uff0c\u786e\u4fdd\u5b89\u5168\uff1b2\u3001\u673a\u573a\u3001\u9ad8\u901f\u516c\u8def\u3001\u8f6e\u6e21\u7801\u5934\u91c7\u53d6\u63aa\u65bd\uff0c\u4fdd\u4ea4\u901a\u5b89\u5168\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8GOCKJU0001899N.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D8GOCKJU0001899N", "date": "20180119", "source": "netease", "comments": {"link": "http://comment.news.163.com/news2_bbs/D8GOCKJU0001899N.html"}, "contents": {"title": ["\u5168\u9762\u4e24\u5b69\u7b2c\u4e8c\u5e74\uff1a\u51fa\u751f\u4eba\u53e3\u603b\u91cf\u548c\u51fa\u751f\u7387\u53cc\u53cc\u4e0b\u964d"], "link": "http://news.163.com/18/0119/10/D8GOCKJU0001899N.html", "passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u5168\u9762\u4e24\u5b69\u7b2c\u4e8c\u5e74\uff1a\u51fa\u751f\u4eba\u53e3\u603b\u91cf\u548c\u51fa\u751f\u7387\u53cc\u53cc\u4e0b\u964d\uff09\n

2017\u5e74\u5168\u56fd\u51fa\u751f\u4eba\u53e3\u6570\u91cf\u548c\u4eba\u53e3\u51fa\u751f\u7387\u53cc\u53cc\u4e0b\u964d\u3002\u56fd\u5bb6\u7edf\u8ba1\u5c4018\u65e5\u516c\u5e03\u6570\u636e\u663e\u793a\uff0c2017\u5e74\u5168\u5e74\u5171\u51fa\u751f\u4eba\u53e31723\u4e07\u4eba\uff0c\u6bd42016\u5e74\u51cf\u5c1163\u4e07\u4eba\u3002\u540c\u65f6\u8001\u9f84\u5316\u7a0b\u5ea6\u7ee7\u7eed\u52a0\u5927\uff0c60\u5c81\u4ee5\u4e0a\u53ca65\u5c81\u4ee5\u4e0a\u8001\u4eba\u5360\u603b\u4eba\u53e3\u7684\u6bd4\u91cd\u90fd\u6709\u660e\u663e\u4e0a\u5347\u3002

\u51fa\u751f\u4eba\u6570\u51cf\u5c11

\u53bb\u5e74\u662f\u5168\u9762\u4e24\u5b69\u653f\u7b56\u5b9e\u65bd\u7684\u7b2c\u4e8c\u5e74\u3002\u6839\u636e\u6b64\u524d\u6709\u5173\u65b9\u9762\u7684\u5224\u65ad\uff0c\u5168\u9762\u4e24\u5b69\u7684\u653f\u7b56\u6548\u679c\u4f53\u73b0\u6709\u6ede\u540e\u6027\uff0c\u5e94\u8be5\u57282017\u5e74\u4e4b\u540e\u9010\u6b65\u663e\u73b0\uff0c\u56e0\u6b642017\u5e74\u51fa\u751f\u4eba\u53e3\u6570\u91cf\u4f1a\u660e\u663e\u9ad8\u4e8e2016\u5e74\u3002\u4f46\u4ece\u56fd\u5bb6\u7edf\u8ba1\u5c40\u516c\u5e03\u7684\u6570\u636e\u6765\u770b\uff0c2017\u5e74\u51fa\u751f\u4eba\u53e3\u6bd42016\u5e74\u76841786\u4e07\u4eba\u51cf\u5c11\u4e8663\u4e07\u4eba\u3002

\u4eba\u53e3\u51fa\u751f\u7387\u4e5f\u540c\u6837\u51fa\u73b0\u4e86\u4e0b\u964d\u3002\u53bb\u5e74\u5168\u56fd\u4eba\u53e3\u51fa\u751f\u7387\u4e3a12.43\u2030\uff0c2016\u5e74\u8fd9\u4e00\u6570\u636e\u4e3a12.95\u2030\u3002

\u4e2d\u56fd\u793e\u4f1a\u79d1\u5b66\u9662\u4eba\u53e3\u4e0e\u52b3\u52a8\u7ecf\u6d4e\u7814\u7a76\u6240\u4eba\u53e3\u7edf\u8ba1\u5ba4\u4e3b\u4efb\u738b\u5e7f\u5dde\u8868\u793a\uff0c2017\u5e74\u51fa\u751f\u4eba\u53e3\u6570\u91cf\u6bd42016\u5e74\u8fd8\u8981\u5c11\uff0c\u8fd9\u4e3b\u8981\u662f\u56e0\u4e3a\u4e00\u5b69\u51fa\u751f\u6570\u91cf\u4e0b\u964d\u5e45\u5ea6\u5f88\u5927\uff0c\u5982\u679c\u4e0d\u662f\u5168\u9762\u4e24\u5b69\u653f\u7b56\uff0c\u51fa\u751f\u89c4\u6a21\u4e0b\u964d\u5e45\u5ea6\u4f1a\u66f4\u5927\u3002

\u957f\u671f\u5173\u6ce8\u751f\u80b2\u610f\u613f\u4e0e\u751f\u80b2\u884c\u4e3a\u7814\u7a76\u7684\u793e\u79d1\u9662\u4eba\u53e3\u4e0e\u52b3\u52a8\u7ecf\u6d4e\u7814\u7a76\u6240\u7814\u7a76\u5458\u90d1\u771f\u771f\u8868\u793a\uff0c2017\u5e74\u51fa\u73b0\u51fa\u751f\u4eba\u53e3\u7684\u4e0b\u964d\u8bf4\u660e\uff0c\u4e2a\u4eba\u751f\u80b2\u610f\u613f\u548c\u751f\u80b2\u884c\u4e3a\u53d7\u5230\u5f88\u591a\u590d\u6742\u56e0\u7d20\u7684\u5f71\u54cd\uff0c\u5305\u62ec\u7ecf\u6d4e\u80fd\u529b\u3001\u5e74\u9f84\u3001\u751f\u80b2\u504f\u597d\u7b49\u7b49\uff0c\u653f\u7b56\u5bf9\u751f\u80b2\u884c\u4e3a\u7684\u5f71\u54cd\u5e76\u6ca1\u6709\u539f\u6765\u9884\u60f3\u5f97\u5927\u3002

\u8001\u9f84\u5316\u52a0\u901f

\u6839\u636e\u56fd\u5bb6\u7edf\u8ba1\u5c40\u6570\u636e\u663e\u793a\uff0c\u4e2d\u56fd\u4eba\u53e3\u7684\u8001\u9f84\u5316\u7a0b\u5ea6\u6b63\u5728\u52a0\u901f\u52a0\u6df1\u30022017\u5e74\uff0c\u5168\u56fd\u4eba\u53e3\u4e2d60\u5468\u5c81\u53ca\u4ee5\u4e0a\u4eba\u53e324090\u4e07\u4eba\uff0c\u5360\u603b\u4eba\u53e3\u768417.3%\uff0c\u5176\u4e2d65\u5468\u5c81\u53ca\u4ee5\u4e0a\u4eba\u53e315831\u4e07\u4eba\uff0c\u5360\u603b\u4eba\u53e3\u768411.4%\u300260\u5468\u5c81\u4ee5\u4e0a\u4eba\u53e3\u548c65\u5468\u5c81\u4ee5\u4e0a\u4eba\u53e3\u90fd\u6bd4\u4e0a\u5e74\u589e\u52a0\u4e860.6\u4e2a\u767e\u5206\u70b9\u3002

\u52b3\u52a8\u5e74\u9f84\u4eba\u53e3\u5360\u603b\u4eba\u53e3\u6bd4\u91cd\u6301\u7eed\u964d\u4f4e\uff0c\u53bb\u5e7416\u81f359\u5468\u5c81\u7684\u52b3\u52a8\u5e74\u9f84\u4eba\u53e3\u4e3a90199\u4e07\u4eba\uff0c\u5360\u603b\u4eba\u53e3\u7684\u6bd4\u91cd\u4e3a64.9%\u30022016\u5e74\uff0c\u5168\u56fd\u52b3\u52a8\u5e74\u9f84\u4eba\u53e3\u6570\u91cf\u4e3a90747\u4e07\u4eba\uff0c\u5360\u603b\u4eba\u53e3\u7684\u6bd4\u91cd\u4e3a65.6%\u3002

\u7edf\u8ba1\u663e\u793a\uff0c\u53bb\u5e74\u4e2d\u56fd\u7684\u57ce\u9547\u5316\u901f\u5ea6\u5728\u6301\u7eed\u63d0\u9ad8\u3002\u57ce\u9547\u5e38\u4f4f\u4eba\u53e381347\u4e07\u4eba\uff0c\u6bd4\u4e0a\u5e74\u672b\u589e\u52a02049\u4e07\u4eba;\u4e61\u6751\u5e38\u4f4f\u4eba\u53e357661\u4e07\u4eba\uff0c\u51cf\u5c111312\u4e07\u4eba;\u57ce\u9547\u4eba\u53e3\u5360\u603b\u4eba\u53e3\u6bd4\u91cd(\u57ce\u9547\u5316\u7387)\u4e3a58.52%\uff0c\u6bd4\u4e0a\u5e74\u672b\u63d0\u9ad81.17\u4e2a\u767e\u5206\u70b9\u3002

"}, "cmtId": "D8GOCKJU0001899N"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/spiders/newsspider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import scrapy 5 | import re 6 | from scrapy.selector import Selector 7 | from crawl.items import NeteaseItem,TencentItem,SinaItem 8 | from scrapy.http import Request 9 | from urllib.request import urlopen 10 | from crawl.maziclib.news_fun import ListCombiner 11 | 12 | 13 | class NeteaseNewsSpider(scrapy.Spider): 14 | name = 'netease_news_spider' #最后要调用的名字 15 | start_urls = ['http://news.163.com'] 16 | allowed_domains = ['news.163.com'] 17 | 18 | url_pattern = r'(http://news\.163\.com)/(\d{2})/(\d{4})/(\d+)/(\w+)\.html' 19 | 20 | def parse(self, response): # response即网页数据 21 | pat = re.compile(self.url_pattern) 22 | next_urls = re.findall(pat, str(response.body)) 23 | 24 | ###debug 25 | #article = next_urls[0][0]+'/'+next_urls[0][1]+'/'+next_urls[0][2]+'/'+next_urls[0][3]+'/'+next_urls[0][4]+'.html' 26 | #yield Request(article, callback=self.parse_news) 27 | ###debug 28 | 29 | for next_url in next_urls: 30 | article = next_url[0]+'/'+next_url[1]+'/'+next_url[2]+'/'+next_url[3]+'/'+next_url[4]+'.html' 31 | yield Request(article,callback=self.parse_news) 32 | 33 | def parse_news(self, response): 34 | item = NeteaseItem() 35 | selector = Selector(response) 36 | pattern = re.match(self.url_pattern, response.url) 37 | 38 | 39 | source = 'netease' 40 | date = '20'+pattern.group(2)+pattern.group(3) 41 | newsId = pattern.group(5) 42 | cmtId = pattern.group(5) 43 | 44 | productKey = re.findall(re.compile(r'"productKey" : "(\w+)"'), str(response.body))[0] 45 | comments_api = 'http://comment.news.163.com/api/v1/products/' + productKey + '/threads/' + newsId 46 | boardId = re.findall(r'"boardId":"(\w+)"',str(urlopen(comments_api).read()))[0] 47 | comments = ('http://comment.news.163.com/'+boardId+'/'+newsId+'.html') 48 | 49 | item['source'] = 'netease' 50 | item['date'] = date 51 | item['newsId'] = newsId 52 | item['cmtId'] = cmtId 53 | #item['boardId'] = boardId 54 | item['comments'] = {'link' : comments} 55 | item['contents'] = {'link' : str(response.url), 'title' : u'', 'passage' : u''} 56 | item['contents']['title'] = selector.xpath('//*[@id="epContentLeft"]/h1/text()').extract() 57 | item['contents']['passage'] = ListCombiner(selector.xpath('//*[@id="endText"]/p').extract()) 58 | yield item 59 | 60 | 61 | 62 | 63 | class TencentNewsSpider(scrapy.Spider): 64 | name = 'tencent_news_spider' #最后要调用的名字 65 | start_urls = ['http://news.qq.com'] 66 | allowed_domains = ['news.qq.com'] 67 | 68 | #https://news.qq.com/a/20180120/000738.htm 69 | url_pattern = r'http://(\w+)\.qq\.com/a/(\d{8})/(\d+)\.htm' 70 | 71 | def parse(self, response): # response即网页数据 72 | pat = re.compile(self.url_pattern) 73 | next_urls = re.findall(pat, str(response.body)) 74 | 75 | ### debug 76 | #article = 'http://'+next_urls[0][0]+'.qq.com/a/'+next_urls[0][1]+'/'+next_urls[0][2]+'.htm' 77 | #print(article) 78 | #yield Request(article,callback=self.parse_news) 79 | ### debug 80 | 81 | for next_url in next_urls: 82 | article = 'http://'+next_url[0]+'.qq.com/a/'+next_url[1]+'/'+next_url[2]+'.htm' 83 | yield Request(article,callback=self.parse_news) 84 | 85 | 86 | def parse_news(self, response): 87 | item = TencentItem() 88 | selector = Selector(response) 89 | url_pattern2 = r'(\w+)://(\w+)\.qq\.com/a/(\d{8})/(\d+)\.htm' 90 | pattern = 
re.match(url_pattern2, str(response.url)) 91 | 92 | source = 'tencent' 93 | date = pattern.group(3) 94 | newsId = pattern.group(4) 95 | cmtId = re.findall(re.compile(r'cmt_id = (\d+);'), str(response.body))[0] 96 | comments = 'http://coral.qq.com/' + cmtId 97 | 98 | 99 | item['source'] = source 100 | item['date'] = date 101 | item['newsId'] = newsId 102 | item['comments'] = {'link' : comments} 103 | item['contents'] = {'link' : str(response.url), 'title' : u'', 'passage' : u''} 104 | item['contents']['title'] = selector.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()').extract() 105 | item['contents']['passage'] = ListCombiner(selector.xpath('//*[@id="Cnt-Main-Article-QQ"]/p/text()').extract()) # TODO: decide whether to keep the markup here (i.e. whether /text() is needed) 106 | print("-------------------------------") 107 | print(date) 108 | print(newsId) 109 | print("-------------------------------") 110 | yield item 111 | 112 | 113 | 114 | 115 | 116 | 117 | --------------------------------------------------------------------------------
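The two news spiders above only define the crawl logic; the repository's main.py (not reproduced in this excerpt) is what actually launches them. As a minimal, illustrative sketch only, not the author's main.py, the snippet below shows the standard Scrapy way to run both named spiders from one script, assuming it is executed from the project root next to scrapy.cfg (the file name run_spiders.py is arbitrary):

    # run_spiders.py (hypothetical name) -- generic Scrapy launcher sketch
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    if __name__ == '__main__':
        # Picks up settings.py via scrapy.cfg in the current working directory.
        process = CrawlerProcess(get_project_settings())
        # These names match the `name` attributes defined in newsspider.py.
        process.crawl('netease_news_spider')
        process.crawl('tencent_news_spider')
        process.start()  # blocks until both spiders have finished

With a launcher like this, scraped items would flow through the pipeline configured in settings.py, which is presumably how the JSON documents under docs/netease and docs/tencent shown earlier were produced.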