├── 2048 └── 2048.py ├── .gitignore ├── crawl ├── 暨南大学新闻爬虫 │ ├── jnuxshc │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── items.cpython-35.pyc │ │ │ ├── __init__.cpython-35.pyc │ │ │ └── settings.cpython-35.pyc │ │ ├── spiders │ │ │ ├── __pycache__ │ │ │ │ ├── xzhc.cpython-35.pyc │ │ │ │ ├── __init__.cpython-35.pyc │ │ │ │ └── csv_item_exporter.cpython-35.pyc │ │ │ ├── __init__.py │ │ │ ├── csv_item_exporter.py │ │ │ └── xzhc.py │ │ ├── pipelines.py │ │ ├── items.py │ │ ├── settings.py │ │ └── middlewares.py │ ├── main.py │ ├── scrapy.cfg │ └── readme.md ├── news │ └── news_crawl │ │ ├── crawl │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── items.cpython-35.pyc │ │ │ ├── __init__.cpython-35.pyc │ │ │ ├── pipelines.cpython-35.pyc │ │ │ └── settings.cpython-35.pyc │ │ ├── spiders │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-35.pyc │ │ │ │ └── newsspider.cpython-35.pyc │ │ │ ├── __init__.py │ │ │ └── newsspider.py │ │ ├── maziclib │ │ │ ├── __pycache__ │ │ │ │ └── news_fun.cpython-35.pyc │ │ │ └── news_fun.py │ │ ├── items.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── middlewares.py │ │ ├── readme.md │ │ ├── main.py │ │ ├── scrapy.cfg │ │ └── docs │ │ ├── netease │ │ ├── 20160602 │ │ │ └── BOIMS8PF00014JB5.json │ │ ├── 20160721 │ │ │ └── BSH7V8QF00014JB6.json │ │ ├── 20180116 │ │ │ └── D897H80K0001899O.json │ │ ├── 20180119 │ │ │ ├── D8HD3PFD0001875P.json │ │ │ ├── D8HLN6QA0001875P.json │ │ │ ├── D8H1O67B0001899N.json │ │ │ ├── D8HBI8IF0001875P.json │ │ │ ├── D8HJ2GAK000187VE.json │ │ │ ├── D8HAH1VS0001875P.json │ │ │ ├── D8HIR5JP0001875P.json │ │ │ ├── D8HJ6VRF0001875O.json │ │ │ └── D8GOCKJU0001899N.json │ │ └── 20180120 │ │ │ ├── D8J1VDAJ0001875P.json │ │ │ └── D8IUD7L60001899O.json │ │ └── tencent │ │ ├── 20160418 │ │ └── 023091.json │ │ ├── 20161227 │ │ ├── 012771.json │ │ ├── 014055.json │ │ ├── 007056.json │ │ ├── 012170.json │ │ └── 011065.json │ │ ├── 20171009 │ │ └── 039986.json │ │ ├── 20171129 │ │ └── 013590.json │ │ └── 20180120 │ │ ├── 006763.json │ │ ├── 002903.json │ │ ├── 004328.json │ │ ├── 003365.json │ │ ├── 010551.json │ │ ├── 006769.json │ │ ├── 010301.json │ │ ├── 009612.json │ │ └── 004124.json ├── 简书首页爬虫 │ ├── tutotial │ │ ├── tutotial │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-35.pyc │ │ │ │ └── settings.cpython-35.pyc │ │ │ ├── spiders │ │ │ │ ├── __pycache__ │ │ │ │ │ └── __init__.cpython-35.pyc │ │ │ │ ├── __init__.py │ │ │ │ └── exampleSpider.py │ │ │ ├── pipelines.py │ │ │ ├── items.py │ │ │ ├── settings.py │ │ │ └── middlewares.py │ │ ├── scrapy.cfg │ │ └── readme.md │ └── jian.csv ├── 百度提交关键词.py ├── baidu_search.py ├── search.py └── getImage.py ├── 机器学习入门 ├── 无监督 │ ├── readme.md │ ├── cluster │ │ ├── readme.md │ │ ├── kmeans.py │ │ └── city.txt │ └── decomposition │ │ ├── readme.md │ │ └── PCA.py ├── keras │ ├── load_exist_model.py │ ├── my_model.h5 │ └── mnist.py ├── 强化学习 │ ├── readme.md │ └── Flappy Bird.py ├── readme.md ├── tensorflow │ ├── prac2.py │ └── prac1.py ├── 监督 │ ├── readme.md │ ├── 分类 │ │ ├── Bayes.py │ │ ├── KNN.py │ │ ├── DecisionTree.py │ │ └── 人体运动状态信息评级.py │ └── 回归 │ │ ├── prices.txt │ │ └── 房价预测.py ├── matplotlib使用.py ├── Numpy.py ├── label_propagation.py └── 标签传播算法(LP).py ├── python网络编程学习 ├── chapter1.py ├── chapter2.py ├── chapter3.py ├── chapter4.py ├── chapter3-2.py ├── chapter3-3.py └── chapter2 find.py ├── .idea ├── dictionaries │ └── mazic.xml ├── vcs.xml ├── misc.xml ├── modules.xml └── PycharmStudy.iml ├── grammar ├── readme.md ├── list.py ├── dictionary.py ├── set.py ├── Classes.py ├── 
Numpy │ └── Arrays.py └── liaoxuefeng.py ├── README.md ├── ACM └── cf │ ├── 672A 字符串第n个数.py │ ├── 1A 简单数学.py │ ├── 675A.py │ ├── 227A 叉积.py │ ├── 227B.py │ ├── 208A 字符串.py │ ├── 675B 填格子.py │ └── 675E DP+greedy.py ├── OS平台编程 ├── 遍历文件夹目录.py ├── 修改所有文件名字.py └── 自动调用程序.py ├── 泰迪杯尝试 ├── readability....py ├── re过滤html标签.py ├── 去除换行+空格.py ├── bbs.py ├── 1.py ├── 数据爬取(未处理).py ├── 爬取相似URL │ ├── 3.所有小URL初步信息去标签.py │ ├── 2.从相似URL中下载内容.py │ └── 从主页获得相似URL初步可执行代码.py ├── README.md ├── pyquery取全体文本.py └── 数据爬取(去标签).py ├── data structure ├── quickSort.py └── bubble sort.py ├── xslt提取网页数据.py ├── 验证码处理 ├── crack.py └── ascii.py └── Try cocos └── HelloWorld.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.h5 -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/readme.md: -------------------------------------------------------------------------------- 1 | 主目录在这里 2 | 运行请在该目录调用`python3 main.py` 3 | -------------------------------------------------------------------------------- /机器学习入门/无监督/readme.md: -------------------------------------------------------------------------------- 1 | 无监督两大主要任务 2 | - 聚类 cluster 3 | - 降维 decomposition 4 | -------------------------------------------------------------------------------- /机器学习入门/keras/load_exist_model.py: -------------------------------------------------------------------------------- 1 | from keras.models import load_model 2 | model = load_model('my_model.h5') -------------------------------------------------------------------------------- /python网络编程学习/chapter1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter1.py -------------------------------------------------------------------------------- /python网络编程学习/chapter2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter2.py -------------------------------------------------------------------------------- /python网络编程学习/chapter3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter3.py -------------------------------------------------------------------------------- /python网络编程学习/chapter4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter4.py -------------------------------------------------------------------------------- /机器学习入门/keras/my_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/机器学习入门/keras/my_model.h5 
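A hedged illustration related to the two `keras` entries above: `load_exist_model.py` can only run if `my_model.h5` already exists on disk. The sketch below shows one way such a file could be produced and then reloaded; the layer sizes and toy data are hypothetical and are not taken from `mnist.py`.

```python
# Minimal sketch (assumption: standalone Keras with HDF5/h5py available).
# The toy data below stands in for a real dataset such as MNIST.
import numpy as np
from keras.models import Sequential, load_model
from keras.layers import Dense

x_train = np.random.rand(100, 784)                 # fake flattened images
y_train = np.random.randint(0, 10, size=(100,))    # fake integer labels

model = Sequential([
    Dense(64, activation='relu', input_shape=(784,)),
    Dense(10, activation='softmax'),
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train, epochs=1, verbose=0)

model.save('my_model.h5')           # writes the HDF5 file kept in this folder
model = load_model('my_model.h5')   # the same call used by load_exist_model.py
```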
-------------------------------------------------------------------------------- /python网络编程学习/chapter3-2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter3-2.py -------------------------------------------------------------------------------- /python网络编程学习/chapter3-3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter3-3.py -------------------------------------------------------------------------------- /.idea/dictionaries/mazic.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from scrapy import cmdline 5 | cmdline.execute("scrapy crawl xzhc".split()) 6 | -------------------------------------------------------------------------------- /grammar/readme.md: -------------------------------------------------------------------------------- 1 | ### Some tutorial 2 | #### Stanford 3 | http://cs231n.github.io/python-numpy-tutorial/#python 4 | #### liao 5 | https://www.liaoxuefeng.com/ -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/__pycache__/items.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/暨南大学新闻爬虫/jnuxshc/__pycache__/items.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/暨南大学新闻爬虫/jnuxshc/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/__pycache__/settings.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/暨南大学新闻爬虫/jnuxshc/__pycache__/settings.cpython-35.pyc -------------------------------------------------------------------------------- /机器学习入门/强化学习/readme.md: -------------------------------------------------------------------------------- 1 | # 强化学习(Reinforcement) 2 | 根据环境学习不断调整,例如迷宫 3 | 4 | ### MDP(马尔科夫过程) 5 | - model-base 6 | 7 | 8 | ### 蒙特卡洛强化学习 9 | - model-free 10 | - 多次采样,取平均作为期望累计奖赏 -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/__pycache__/items.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/news/news_crawl/crawl/__pycache__/items.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/news/news_crawl/crawl/__pycache__/__init__.cpython-35.pyc 
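A hedged aside on `机器学习入门/强化学习/readme.md` above, which describes model-free Monte Carlo reinforcement learning as sampling many episodes and averaging the results to estimate the expected cumulative reward. The sketch below illustrates only that averaging step; the episode generator is a made-up stand-in, not Flappy Bird or any real environment.

```python
# Monte Carlo estimate of an expected discounted return (illustrative only).
import random

GAMMA = 0.9  # discount factor

def sample_episode_rewards():
    """Stand-in environment: one episode is just a list of random rewards."""
    return [random.random() for _ in range(10)]

def discounted_return(rewards, gamma=GAMMA):
    g = 0.0
    for r in reversed(rewards):   # accumulate from the last step backwards
        g = r + gamma * g
    return g

# Average the returns of many sampled episodes -> Monte Carlo value estimate.
returns = [discounted_return(sample_episode_rewards()) for _ in range(1000)]
print(sum(returns) / len(returns))
```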
-------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/__pycache__/pipelines.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/news/news_crawl/crawl/__pycache__/pipelines.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/__pycache__/settings.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/news/news_crawl/crawl/__pycache__/settings.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/spiders/__pycache__/xzhc.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/暨南大学新闻爬虫/jnuxshc/spiders/__pycache__/xzhc.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/spiders/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/暨南大学新闻爬虫/jnuxshc/spiders/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/简书首页爬虫/tutotial/tutotial/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/__pycache__/settings.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/简书首页爬虫/tutotial/tutotial/__pycache__/settings.cpython-35.pyc -------------------------------------------------------------------------------- /机器学习入门/无监督/cluster/readme.md: -------------------------------------------------------------------------------- 1 | 2 | ### cluster 3 | - K-means 4 | - DBSCAN 5 | - Gaussian Mixtures 6 | - Birch 7 | 8 | 9 | ```python 10 | from sklearn.cluster import KMeans 11 | ``` 12 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/spiders/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/news/news_crawl/crawl/spiders/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /grammar/list.py: -------------------------------------------------------------------------------- 1 | ## list (the same as array) 2 | nums = list(range(5)) 3 | squares = [x**2 for x in nums] 4 | even_squares = [x**2 for x in nums if x%2==0] 5 | print(squares) 6 | print(even_squares) 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 
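A short usage sketch for the `KMeans` import shown in `机器学习入门/无监督/cluster/readme.md` above; the array here is toy data, not the `city.txt` file that `kmeans.py` later in this dump loads.

```python
# Assumed: scikit-learn is installed; this mirrors the fit_predict pattern
# used elsewhere in this repository, applied to a tiny hand-made dataset.
import numpy as np
from sklearn.cluster import KMeans

X = np.array([[1, 2], [1, 4], [1, 0],
              [10, 2], [10, 4], [10, 0]])

km = KMeans(n_clusters=2)
labels = km.fit_predict(X)       # cluster index assigned to each row of X
print(labels)
print(km.cluster_centers_)       # one centroid per cluster
```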
-------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/maziclib/__pycache__/news_fun.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/news/news_crawl/crawl/maziclib/__pycache__/news_fun.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/spiders/__pycache__/newsspider.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/news/news_crawl/crawl/spiders/__pycache__/newsspider.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/spiders/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/简书首页爬虫/tutotial/tutotial/spiders/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/spiders/__pycache__/csv_item_exporter.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/crawl/暨南大学新闻爬虫/jnuxshc/spiders/__pycache__/csv_item_exporter.cpython-35.pyc -------------------------------------------------------------------------------- /机器学习入门/readme.md: -------------------------------------------------------------------------------- 1 | ### 学习视频 2 | >http://www.icourse163.org/course/BIT-1001872001 3 | >https://www.bilibili.com/video/av17204303 4 | 5 | ### numpy 6 | >http://cs231n.github.io/python-numpy-tutorial/#python 7 | 8 | ### -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/maziclib/news_fun.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | def ListCombiner(content): 5 | string = '' 6 | for e in content: 7 | string += e 8 | return string 9 | -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /机器学习入门/tensorflow/prac2.py: -------------------------------------------------------------------------------- 1 | # 创建交互式会话 2 | import tensorflow as tf 3 | 4 | sess = tf.InteractiveSession() 5 | a = tf.Variable([1.0,2.0]) # 变量数组 6 | b = tf.constant([3.0,4.0]) # 常量数组 7 | sess.run(tf.global_variables_initializer()) 8 | ans = tf.add(a,b) 9 | print(ans.eval()) -------------------------------------------------------------------------------- /机器学习入门/无监督/decomposition/readme.md: -------------------------------------------------------------------------------- 1 | 2 | ### decomposition 3 | - PCA (主成分,用鸢尾花的数据集) 4 | - FastICA 5 | - NMF (非负矩阵分解) 6 | - LDA 7 | 8 | ```python 9 | from sklearn.decomposition import PCA 10 | from sklearn.datasets import load_iris 11 | ``` 12 | 13 | ### 应用 14 | - 例如,给一个人脸图片,然后提取特征 15 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from scrapy import cmdline 5 | #cmdline.execute("scrapy crawl netease_news_spider".split()) 6 | #cmdline.execute("scrapy crawl tencent_news_spider".split()) 7 | cmdline.execute("scrapy crawl sina_news_spider".split()) 8 | -------------------------------------------------------------------------------- /crawl/百度提交关键词.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/16 19:35 4 | # @Author : mazicwong 5 | # @File : 百度提交关键词.py 6 | 7 | import requests 8 | kv = {'wd': 'python'} 9 | r = requests.get("http://www.baidu.com/s", params=kv) 10 | print(len(r.text)) 11 | -------------------------------------------------------------------------------- /机器学习入门/监督/readme.md: -------------------------------------------------------------------------------- 1 | # 监督两大主要任务 2 | 3 | ### 分类 (训练集,测试集) 4 | ##### 指标 5 | * 正确率: 针对预测结果, R=T/(T+F) 6 | * 召回率: 针对原来样本, R=T/(T+F) 7 | ##### 相关算法函数 8 | * knn 9 | * naivebayes 10 | * svm 11 | * decision tree 12 | * neural networks 13 | ##### 类别 14 | * 线性分类器 15 | * 非线性分类器 16 | 17 | 18 | ### 回归 19 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # It is some code about my Python Study 2 | ### in Python 3.5 3 | - **Grammar** 4 | - **Code about ACM** 5 | - cf (py3.5) 6 | - zoj (py2.7) 7 | - **data stucture** 8 | - bubble sort 9 | - KMP 10 | - **Spider** 11 | - **Algorithm about ML** 12 | - **Data mining** 13 | - **Machine Learning** 14 | -------------------------------------------------------------------------------- /grammar/dictionary.py: -------------------------------------------------------------------------------- 1 | 
# dictionary (the same as map) 2 | 3 | d = {'cat':'cute', 'dog':'furry'} 4 | print(d['cat']) 5 | for animal, type in d.items(): 6 | print('A %s is %s' % (animal,type)) 7 | 8 | 9 | nums = list(range(5)) 10 | even_num_to_square = {x:x**2 for x in nums if x%2==0} 11 | print(even_num_to_square) -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = jnuxshc.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = jnuxshc 12 | -------------------------------------------------------------------------------- /ACM/cf/672A 字符串第n个数.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/22 15:11 4 | # @Author : mazicwong 5 | # @File : 672A 字符串第n个数.py 6 | 7 | ''' 8 | 字符串1234.... 9 | 打印字符串的第n个数 10 | ''' 11 | k=int(input()) 12 | n='' 13 | x=1 14 | while len(n)<1000: 15 | n+=str(x) 16 | x+=1 17 | print(n[k-1]) -------------------------------------------------------------------------------- /OS平台编程/遍历文件夹目录.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding:utf-8 _*_ 3 | import os 4 | 5 | #打印所有文件的目录 6 | path = input("输入一个需要打印的路径") 7 | #os.walk 很常用,用来遍历一个目录,返回三元组 (路径,目录名,文件名) 8 | for root, dirs, files in os.walk(path): 9 | for name in files: 10 | print(os.path.join(root, name)) #os.path.join可以将路径和名字结合起来形成绝对路径 11 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = crawl.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = crawl 12 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tutotial.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tutotial 12 | -------------------------------------------------------------------------------- /grammar/set.py: -------------------------------------------------------------------------------- 1 | # set (the same as set in cpp) 2 | 3 | from math import sqrt 4 | 5 | animals = {'cat', 'dog'} 6 | print('cat' in animals) 7 | animals.add('fish') # not append in list(array) 8 | 9 | for idx,animal in enumerate(animals): 10 | print('#%d %s' % (idx,animal)) 11 | 12 | nums = {int(sqrt(x)) for x in range(30)} 13 | print(nums) -------------------------------------------------------------------------------- /机器学习入门/tensorflow/prac1.py: -------------------------------------------------------------------------------- 1 | # 做矩阵乘法 2 | import tensorflow as tf 3 | 4 | mat1 = tf.constant([[3.,3.]]) # 1*2矩阵 5 | 
mat2 = tf.constant([[2.],[2.]]) # 2*1矩阵 6 | product = tf.matmul(mat1,mat2) # 创建op执行两个矩阵的乘法 7 | 8 | sess = tf.Session() # 在Session中执行图 9 | ans = sess.run(product) # 在图中执行op操作 10 | 11 | print(ans) 12 | sess.close() -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class JnuxshcPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /OS平台编程/修改所有文件名字.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding:utf-8 _*_ 3 | import os 4 | 5 | path = input("输入一个路径") 6 | for root, dirs, files in os.walk(path): 7 | for name in files: 8 | fname, fext = os.path.splitext(name) # 用splitext分割文件名和扩展名 9 | os.rename(os.path.join(root, name), \ 10 | os.path.join(root, 'hdu ' + fname + fext)) 11 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class TutotialPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /泰迪杯尝试/readability....py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/24 23:27 4 | # @Author : mazicwong 5 | # @File : readability....py 6 | 7 | import requests 8 | from readability import Document 9 | response = requests.get('http://www.bbsmax.com/A/kmzLB4DX5G/') 10 | doc = Document(response.text) 11 | print (doc.title()) 12 | print (doc.summary()) -------------------------------------------------------------------------------- /crawl/baidu_search.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | try: 4 | kv = {'wd': 'Python'} 5 | r = requests.get("http://www.baidu.com/s", params=kv) 6 | print(r.encoding) 7 | r.raise_for_status() 8 | r.enconding = r.apparent_encoding 9 | print(r.enconding) 10 | print("length of the whole source code : %s " %len(r.text)) 11 | except: 12 | print( "there must be a wrong") 13 | -------------------------------------------------------------------------------- /data structure/quickSort.py: -------------------------------------------------------------------------------- 1 | def quicksort(arr): 2 | if len(arr)<=1: 3 | return arr 4 | pivot = arr[len(arr)//2] 5 | left = [x for x in arr if x < pivot] 6 | right = [x for x in arr if x > pivot] 7 | middle = [x for x in arr if x == pivot] 8 | return quicksort(left) + middle + quicksort(right) 9 | 10 | if __name__ == '__main__': 11 | print(quicksort([3,6,7,9,1,3,1])) -------------------------------------------------------------------------------- /机器学习入门/监督/分类/Bayes.py: 
-------------------------------------------------------------------------------- 1 | # 朴素贝叶斯: 生成学习方法 2 | # 学习联合概率分布,求后验概率分布 3 | # 参数 4 | # priors: 先验概率 5 | 6 | import numpy as np 7 | from sklearn.naive_bayes import GaussianNB #朴素bayes 8 | X = np.array([[-1,-1], [-1,-1], [-3,-2], [1,1], [2,1], [3,2]]) 9 | y = np.array([1,1,1,2,2,2]) 10 | 11 | #训练 12 | clf = GaussianNB(priors=None) #默认参数,创建分类器 13 | clf.fit(X,y) 14 | 15 | #预测 16 | print(clf.predict([[-0.8,-1]])) 17 | -------------------------------------------------------------------------------- /data structure/bubble sort.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding:utf-8 _*_ 3 | 4 | # this is a simple Python bubble sort 5 | 6 | array = [1, 2, 3, 5, 4, 6, 9, 8, 7] 7 | for i in range(len(array) - 1, 0, -1): 8 | for j in range(0, i): 9 | if array[j] > array[j + 1]: 10 | array[j], array[j + 1] = array[j + 1], array[j] 11 | # so cool compared to CPP... 12 | 13 | print(array) -------------------------------------------------------------------------------- /grammar/Classes.py: -------------------------------------------------------------------------------- 1 | class Greeter(object): 2 | # Constructor 3 | def __init__(self, name): 4 | self.name = name 5 | # Instance method (实例方法) 6 | def greet(self, loud = False): 7 | if loud: 8 | print('Hello, %s !' % self.name.upper()) 9 | else: 10 | print('Hello, %s' % self.name) 11 | 12 | g = Greeter('Fred') 13 | g.greet() 14 | g.greet(loud=True) 15 | -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy.item import Item,Field 9 | 10 | 11 | class JnuxshcItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = Field() 15 | time = Field() 16 | intro = Field() 17 | 18 | -------------------------------------------------------------------------------- /机器学习入门/监督/分类/KNN.py: -------------------------------------------------------------------------------- 1 | # KNN,取与已知点最近的k个点,看占据哪个类别的比例多 2 | # 参数 3 | # n_neighbors: K(默认5) 4 | # weights: K个点对结果的影响权重(默认平均权重uniform) 5 | # algorithm: 计算临近点方法(ball_tree,kd_tree,brute) 6 | # 7 | 8 | from sklearn.neighbors import KNeighborsClassifier 9 | from sklearn.datasets import load_iris 10 | 11 | # 训练 12 | X = [[0],[1],[2],[3]] 13 | y = [0,0,1,1] 14 | clf = KNeighborsClassifier(n_neighbors=3) # k=3 15 | clf.fit(X,y) # 学习 16 | 17 | # 使用 18 | print(clf.predict([[1.1]])) 19 | -------------------------------------------------------------------------------- /ACM/cf/1A 简单数学.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/22 0:53 4 | # @Author : mazicwong 5 | # @File : 1A 简单数学.py 6 | 7 | ''' 8 | give : n,m,a a retangle with n*m and use how many square with a*a to patch up with it 9 | (can be overlap) 10 | http://blog.csdn.net/chenguolinblog/article/details/12190689 11 | ''' 12 | 13 | myList = input().split() 14 | n=int(myList[0]) 15 | m=int(myList[1]) 16 | a=int(myList[2]) 17 | 18 | print((n//a+(n%a>0))*(m//a+(m%a>0))) -------------------------------------------------------------------------------- /grammar/Numpy/Arrays.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | a = np.array([1,2,3]) 4 | print(type(a)) # all is same type 5 | print(a.shape) # the shape of an array is a tuple of integers giving the size of the array along each dimension. 6 | print(a[0],a[1]) 7 | 8 | b = np.array([[1,2,3],[4,5,6]]) 9 | print(b.shape) 10 | 11 | # functions to create array 12 | c = np.zeros((2,2)) 13 | d = np.ones((1,2)) 14 | e = np.full((2,2), 7) 15 | f = np.eye(2) # identity matrix 16 | g = np.random.rand((2,2)) #random value -------------------------------------------------------------------------------- /ACM/cf/675A.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/22 15:16 4 | # @Author : mazicwong 5 | # @File : 675A.py 6 | ''' 7 | 给出a,b,c,求是否a加若干个c能得到b,是就输出YES,否就输出NO 8 | 解答: (b-a)%c==0 9 | ''' 10 | 11 | a, b, c = map(int, input().split(' ')) 12 | if ((a != b and c == 0) or (b > a and c < 0)): 13 | print("NO") 14 | elif ((a == b) or (b > a and c > 0 and ((b - a) % c == 0)) or (a > b and c < 0 and ((a - b) % c == 0))): 15 | print("YES") 16 | else: 17 | print("NO") 18 | -------------------------------------------------------------------------------- /xslt提取网页数据.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/12 16:39 4 | # @Author : mazicwong 5 | # @File : xslt提取网页数据.py 6 | # https://zhuanlan.zhihu.com/p/20869884 7 | 8 | ''' 9 | https://zhuanlan.zhihu.com/p/20869884 10 | lxml是python的一个库,可以迅速、灵活地处理 XML。 11 | 提取集搜客官网旧版论坛的帖子标题和回复数,把整个列表提取出来,存成xml格式 12 | ''' 13 | from urllib.request import urlopen 14 | from lxml import etree 15 | url="http://www.gooseeker.com/cn/forum/7" 16 | html = urlopen(url) 17 | doc=etree.HTML(html.read()) -------------------------------------------------------------------------------- /ACM/cf/227A 叉积.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/22 12:59 4 | # @Author : mazicwong 5 | # @File : 227A 叉积.py 6 | 7 | ''' 8 | 本题输入三个点坐标,考察叉积,若大于0则right,小于0则left,等于0则towards 9 | ''' 10 | 11 | ax,ay = map(int,input().split(' ')) 12 | bx,by = map(int,input().split(' ')) 13 | cx,cy = map(int,input().split(' ')) 14 | x1=ax-bx 15 | y1=cx-bx 16 | x2=ay-by 17 | y2=cy-by 18 | ans=x1*y2-x2*y1 19 | if ans>0: 20 | print("RIGHT") 21 | elif ans<0: 22 | print("LEFT") 23 | else: 24 | print("TOWARDS") -------------------------------------------------------------------------------- /ACM/cf/227B.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/22 15:00 4 | # @Author : mazicwong 5 | # @File : 227B.py 6 | 7 | ''' 8 | n 9 | a1,a2...an 10 | q 11 | b1,b2...bq 12 | ''' 13 | n = int(input()) 14 | mylist = input().split(' ') 15 | i = 0 16 | zid = {} 17 | for x in mylist: 18 | zid[x] = i 19 | i += 1 20 | q = int(input()) 21 | m = input().split(' ') 22 | ans1 = 0 23 | ans2 = 0 24 | for y in m: 25 | tmp = zid[y] 26 | ans1 += tmp + 1 27 | ans2 += n - tmp 28 | print(ans1, ans2) 29 | -------------------------------------------------------------------------------- /OS平台编程/自动调用程序.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding:utf-8 _*_ 3 | import sched, time 4 | 
5 | 6 | # sched的函数不超过10个,但都很好用 7 | def print_time(msg='default'): 8 | print("当前时间", time.time(), msg) 9 | 10 | 11 | # sched.scheduler() 用来创建一个调度任务 12 | s = sched.scheduler(time.time, time.sleep) 13 | print(time.time()) 14 | s.enter(5, 1, print_time, argument=('延迟5秒,优先级1',)) # 时间间隔,执行优先级,调用的函数,函数参数 15 | s.enter(3, 2, print_time, argument=('延迟3秒,优先级2',)) 16 | s.enter(3, 1, print_time, argument=('延迟3秒,优先级1',)) 17 | s.run() # 执行调度事件 18 | print(time.time()) 19 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/readme.md: -------------------------------------------------------------------------------- 1 | 无聊复习下爬虫,用scrapy爬取了简书的热门文章,后面可以继续添加内容 2 | 3 | `scrapy startproject tutotial`之后,要用到的就是进sina里面修改了 4 | 其中文件: 5 | - items.py: 修改为需要获得的数据 6 | - pipelines.py: 不管 7 | - settings.py: 设置了获取数据储存的地方,修改`robots`,`user_agent`等 8 | - middlewares.py: 9 | - spiders/: 真正爬虫代码,可以用xpath,selector等处理,记得放入item中 10 | 11 | 12 | `scrapy crawl example.py`,spiders文件夹中爬虫代码 13 | 14 | Some Problem: 15 | 1. 一开始运行完空白,看到debug中返回403,然后到settings.py里修改`user_agent`就好了 16 | 2. 然后运行完还是爬不到,在settings把robots.txt修改为False就好了 17 | 3. 第三个错误就是xpath写错的原因了,以后注意就行 18 | -------------------------------------------------------------------------------- /.idea/PycharmStudy.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /机器学习入门/监督/分类/DecisionTree.py: -------------------------------------------------------------------------------- 1 | # 决策树 2 | # 参数 3 | # criterion: gini(基尼系数)/entropy(信息增益) 4 | # max_features: 节点处分裂时,从多少个特征选择最优特征,默认使用所有特征个数 5 | 6 | from sklearn.datasets import load_iris 7 | from sklearn.tree import DecisionTreeClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | clf = DecisionTreeClassifier() #默认gini 11 | iris = load_iris() 12 | data = iris.data # 数据 13 | target = iris.target # 标签作为目标结构 14 | 15 | #训练 16 | # 10则交叉验证 17 | cross_val_score(clf, iris.data, iris.target, cv=10) 18 | clf.fit(X,y) 19 | 20 | #预测 21 | print(clf.predict(X)) -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy.item import Item,Field 9 | 10 | 11 | class TutotialItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = Field() 15 | author = Field() 16 | time = Field() 17 | ''' 18 | url = Field() 19 | readNum = Field() 20 | commentNum = Field() 21 | likeNum = Field() 22 | ''' 23 | -------------------------------------------------------------------------------- /python网络编程学习/chapter2 find.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/12 16:13 4 | # @Author : mazicwong 5 | # @File : chapter2 find.py 6 | 7 | from urllib.request import urlopen 8 | from bs4 import BeautifulSoup 9 | 10 | html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html") 11 | bs0bj = BeautifulSoup(html, "html.parser") 12 | # use findall to get a 'list' containing those only appeared in 13 | nameList = bs0bj.findAll("span", {"class": "green"}) 14 | for name 
in nameList: 15 | print(name.get_text()) 16 | -------------------------------------------------------------------------------- /ACM/cf/208A 字符串.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/21 22:44 4 | # @Author : mazicwong 5 | # @File : 208A 字符串.py 6 | # 将原字符串中的“WUB”子串去掉 7 | ''' 8 | input()输入string,如果要读一个数字的话,要用int()转为数字 int(input()) 9 | a = str.split(sss) 将原串按sss进行分割,然后存到的到子串存到一个集合当中 10 | eg: str ="a$b$c" a = str.split('$') a=[a,'',b,'',c] 11 | ''' 12 | 13 | 14 | print (input().replace('WUB', ' ')) 15 | 16 | ''' 17 | str = input() 18 | str.encode('UTF-8') 19 | a = [] 20 | a = str.split('WUB') 21 | for t in a: 22 | if t != '': 23 | print(t, end=' ')#print默认\n结尾,给换成空格就好 24 | ''' 25 | 26 | -------------------------------------------------------------------------------- /ACM/cf/675B 填格子.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/22 15:29 4 | # @Author : mazicwong 5 | # @File : 675B 填格子.py 6 | 7 | ''' 8 | 有个3*3的九宫格,每个格子能填1~n中任意的数(n由输入给出)。要求其中任意2*2的格子中4个数的和与其他各个2*2格子都相等 9 | 解法: 对中间的数进行枚举 10 | ''' 11 | 12 | 13 | def solve(): 14 | n, a, b, c, d = map(int, input().split()) 15 | ans = 0 16 | for i in range(1, n + 1): 17 | t = i + a + b 18 | if t - a - c > 0 and t - a - c <= n and t - c - d > 0 and t - c - d <= n and t - b - d > 0 and t - b - d <= n: 19 | ans += 1 20 | return ans * n 21 | 22 | 23 | print(solve()) 24 | -------------------------------------------------------------------------------- /机器学习入门/监督/回归/prices.txt: -------------------------------------------------------------------------------- 1 | 1000,168 2 | 792,184 3 | 1260,197 4 | 1262,220 5 | 1240,228 6 | 1170,248 7 | 1230,305 8 | 1255,256 9 | 1194,240 10 | 1450,230 11 | 1481,202 12 | 1475,220 13 | 1482,232 14 | 1484,460 15 | 1512,320 16 | 1680,340 17 | 1620,240 18 | 1720,368 19 | 1800,280 20 | 4400,710 21 | 4212,552 22 | 3920,580 23 | 3212,585 24 | 3151,590 25 | 3100,560 26 | 2700,285 27 | 2612,292 28 | 2705,482 29 | 2570,462 30 | 2442,352 31 | 2387,440 32 | 2292,462 33 | 2308,325 34 | 2252,298 35 | 2202,352 36 | 2157,403 37 | 2140,308 38 | 4000,795 39 | 4200,765 40 | 3900,705 41 | 3544,420 42 | 2980,402 43 | 4355,762 44 | 3150,392 45 | -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/spiders/csv_item_exporter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | #用来规定输出到csv顺序的 5 | from scrapy.conf import settings 6 | from scrapy.contrib.exporter import CsvItemExporter 7 | 8 | class MyProjectCsvItemExporter(CsvItemExporter): 9 | 10 | def __init__(self, *args, **kwargs): 11 | delimiter = settings.get('CSV_DELIMITER', ',') 12 | kwargs['delimiter'] = delimiter 13 | 14 | fields_to_export = settings.get('FIELDS_TO_EXPORT', []) 15 | if fields_to_export : 16 | kwargs['fields_to_export'] = fields_to_export 17 | 18 | super(MyProjectCsvItemExporter, self).__init__(*args, **kwargs) 19 | -------------------------------------------------------------------------------- /crawl/search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding:utf-8 _*_ 3 | import requests 4 | kv = {'wd':'Python'} 5 | q = requests.get("http://www.baidu.com/s",params = kv) 6 | 7 | q.status_code 8 | def get_URL(url): 9 | try: 10 | 
r=requests.get(url,timeout=30) 11 | print(r.encoding) 12 | r.raise_for_status() 13 | r.enconding=r.apparent_encoding 14 | return r.text[:1000] 15 | except: 16 | return "there must be a wrong" 17 | 18 | if __name__=="__main__": 19 | url="https://detail.tmall.com/item.htm?spm=a223c.8145724.1110321729.1.Qz7Kic&acm=lb-zebra-175981-1643283.1003.4.1365015&id=537259409492&scm=1003.4.lb-zebra-175981-1643283.ITEM_537259409492_1365015" 20 | print(get_URL(url)) 21 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy.item import Item,Field 9 | 10 | class NewsItem(Item): 11 | source = Field() 12 | date = Field() 13 | newsId = Field() 14 | cmtId = Field() 15 | contents = Field() 16 | comments = Field() 17 | 18 | class CrawlItem(Item): 19 | # define the fields for your item here like: 20 | # name = scrapy.Field() 21 | pass 22 | 23 | class NeteaseItem(NewsItem): 24 | boardId = Field() 25 | 26 | class TencentItem(NewsItem): 27 | pass 28 | 29 | class SinaItem(NewsItem): 30 | channelId = Field() 31 | -------------------------------------------------------------------------------- /crawl/getImage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding:utf-8 _*_ 3 | import requests 4 | import os 5 | 6 | root = "D://pics//" 7 | url = "http://imgsize.ph.126.net/?enlarge=true&imgurl=http://edu-image.nosdn.127.net/73946898DEFC4EEE8B934F5DA131B905.jpg?imageView&thumbnail=426y240&quality=100_230x130x1x95.png" 8 | path = root + url.split('/')[-1] 9 | try: 10 | if not os.path.exists(root): 11 | os.mkdir(root) 12 | if not os.path.exists(path): 13 | r = requests.get(url) 14 | # 图片是二进制格式,把图片保存为文件 15 | with open(path, 'wb') as f: 16 | f.write(r.content) 17 | f.close() 18 | print("successfully saving") 19 | else: 20 | print ("The file is already existing") 21 | except: 22 | print("a faulty operation ") -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/readme.md: -------------------------------------------------------------------------------- 1 | 2 | ### JNU学生荟萃板块爬虫 3 | *采用scrapy* 4 | URL : https://news.jnu.edu.cn/xshc/ll 5 | 6 | #### 使用方式 7 | `scrapy startproject jnuxshc`之后,要用到的就是进sina里面修改了 8 | `scrapy crawl mazic.py`,spiders中爬虫代码,这里用`main.py`来执行了 9 | *最终接口*,调用`python3 main.py`,会得到一个`jnu.csv`的文件 10 | 11 | #### 需要修改的文件: 12 | - items.py: 修改为需要获得的数据 13 | - pipelines.py: 暂时不管 14 | - settings.py: 数据存储的地方和格式,修改`robots`,`user_agent` 15 | - middlewares.py: 暂时不管 16 | - spiders/***.py: 真正爬虫代码,可以用xpath,selector等处理,记得放入item中 17 | 18 | 19 | #### Some Problem: 20 | 1. 一开始运行完空白,看到debug中返回403,然后到settings.py里修改`user_agent`就好了 21 | 2. 然后运行完还是爬不到,在settings把robots.txt修改为False就好了 22 | 3. 第三个错误就是xpath写错的原因了,以后注意就行 23 | 4. 
由于输出到csv的列是无序的,所以在spiders/中加了`csv_item_exporter.py`,在`settings.py`中添加了`FEED_EXPORTERS`和`FIELDS_TO_EXPORT` 24 | 25 | -------------------------------------------------------------------------------- /泰迪杯尝试/re过滤html标签.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/13 0:22 4 | # @Author : mazicwong 5 | # @File : re过滤html标签.py 6 | 7 | 8 | 9 | from html.parser import HTMLParser 10 | from bs4 import BeautifulSoup 11 | from urllib import request 12 | class MLStripper(HTMLParser): 13 | def __init__(self): 14 | self.reset() 15 | self.strict = False 16 | self.convert_charrefs= True 17 | self.fed = [] 18 | def handle_data(self, d): 19 | self.fed.append(d) 20 | def get_data(self): 21 | return ''.join(self.fed) 22 | 23 | def strip_tags(html): 24 | s = MLStripper() 25 | s.feed(html) 26 | return s.get_data() 27 | 28 | url = "http://x.heshuicun.com/forum.php?mod=viewthread&tid=80" 29 | html = request.urlopen(url) 30 | bsObj = BeautifulSoup(html) 31 | strip_tags(bsObj) -------------------------------------------------------------------------------- /机器学习入门/强化学习/Flappy Bird.py: -------------------------------------------------------------------------------- 1 | # Deep Q-Network 2 | # 深度强化学习进行Flappy Bird游戏的训练 3 | # tensorflow + pygame +cv2 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | from collections import deque 8 | import random 9 | import sys 10 | sys.path.append('/home/mazic/Downloads/FlappyBirdClone') 11 | # import wrapped_flappy_bird as game 12 | import cv2 13 | import pygame 14 | 15 | GAME = 'bird' 16 | ACTIONS = 2 17 | GAMMA = 0.99 18 | OBSERVE = 10000. 19 | EXPLORE = 3000000. 20 | FINAL_EPSILON = 0.0001 21 | INITIAL_EPSILON = 0.0001 22 | REPLAY_MEMORY = 50000 23 | BATCH = 32 24 | FRAME_PER_ACTION = 1 25 | 26 | mat1 = tf.constant([[3.,3.]]) # 1*2矩阵 27 | mat2 = tf.constant([[2.],[2.]]) # 2*1矩阵 28 | product = tf.matmul(mat1,mat2) # 创建op执行两个矩阵的乘法 29 | sess = tf.Session() # 在Session中执行图 30 | res = sess.run(product) # 在图中执行op操作 31 | 32 | print(res) 33 | sess.close() 34 | -------------------------------------------------------------------------------- /grammar/liaoxuefeng.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding:utf-8 _*_ 3 | 4 | print('this is', 'a learning', 'process') 5 | name = input("input your name : ") 6 | a = input("input your age : ") 7 | age = int(a) 8 | print('hello: %s , %d ' % (name, age)) # 用%中间不用逗号...it is not C 9 | 10 | # ****常用数据类型****# 11 | # list [] 数组 append(),insert(1,'ma'),pop(), 12 | classmates = ['mazic'] 13 | classmates.append('cpp') 14 | print(classmates[-1]) 15 | classmates.pop() 16 | L = list(range(100)) #共0~99 17 | L = L[3:10:2] #第三到第十个数,每两个取一次(这种切片操作可用于list,tuple,str) 18 | # tuple () 定长数组 =>就是比较安全而已 19 | 20 | for name in classmates: # for name in range(101) 21 | print(name) 22 | for i,value in enumerate(['A','B','C']): 23 | print(i,value) 24 | 25 | # dict 字典(即map),一组key+value 26 | d = {'mazic': 100, 'java': 6, 'cpp': 99} 27 | # set 一组key,但是没有重复的key #add(3),remove(4) 28 | s = set([1, 2, 3]) 29 | ss 30 | 31 | #from 库 import 函数 -------------------------------------------------------------------------------- /泰迪杯尝试/去除换行+空格.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/28 0:15 4 | # @Author : mazicwong 5 | # @File : 去除换行+空格.py 6 | 7 | ''' 8 | 
源码编码判断用chardet,取出换行和空格用strip 9 | ''' 10 | 11 | import re 12 | 13 | # s = "as, \n asdas \n \n \n \n\nasda" 14 | # print(s) 15 | # print(".............") 16 | # s = ''.join(re.split(' +', s)) 17 | # s = '\n'.join(re.split('\n+', s)) 18 | # print(s) 19 | # print ('\n'.join(re.split(' +',s))) 20 | 21 | 22 | with open(r'C:\Users\ASUS\Desktop\66out-1.txt', 'r') as file: 23 | str = file.read() 24 | str = '\n'.join(re.split(' +', str)) 25 | str = '\n'.join(re.split('\t+', str)) 26 | str = '\n'.join(re.split('\r+', str)) 27 | str = '\n'.join(re.split(' ', str)) 28 | str = '\n'.join(re.split('\n+', str)) 29 | print(str) 30 | 31 | file1 = open(r'C:\Users\ASUS\Desktop\666-1.txt', 'w') 32 | file1.write(str) 33 | file1.close() 34 | -------------------------------------------------------------------------------- /验证码处理/crack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/29 16:21 4 | # @Author : mazicwong 5 | # @File : crack.py 6 | 7 | #https://www.shiyanlou.com/courses/364 8 | 9 | from PIL import Image 10 | im = Image.open("Code.png") 11 | im = im.convert("P") #converting an “RGB” image to an 8-bit palette image 12 | print (im.histogram()) #打印颜色直方图 13 | #发现很多白点,每个点是256色,最后一个显示920,说明有920个白色像素 14 | his = im.histogram() 15 | values={} 16 | for i in range(255): 17 | values[i] = his[i] 18 | 19 | #排序得到有用的颜色,发现 211,741 这个就是我们要的验证码的红色部分了 20 | for j,k in sorted(values.items(),key=lambda x:x[1],reverse = True)[:10]: 21 | print(j,k) 22 | 23 | 24 | #构造黑白二值图片 25 | im2 = Image.new("P",im.size,255) 26 | 27 | for x in range(im.size[1]): 28 | for y in range(im.size[0]): 29 | pix = im.getpixel((y,x)) 30 | if pix == 1 or pix ==2: 31 | im2.putpixel((y,x),0) 32 | 33 | im2.show() 34 | 35 | -------------------------------------------------------------------------------- /机器学习入门/无监督/decomposition/PCA.py: -------------------------------------------------------------------------------- 1 | # 主成分分析 2 | # 矩阵的主成分即协方差矩阵对应的特征向量 3 | # 对鸢尾花数据降维(4->2) 4 | 5 | import matplotlib.pyplot as plt #可视化 6 | import numpy as np 7 | from sklearn.decomposition import PCA 8 | from sklearn.datasets import load_iris #数据集 9 | 10 | data = load_iris() 11 | y = data.target 12 | X = data.data 13 | pca = PCA(n_components=2) 14 | reduced_X = pca.fit_transform(X) 15 | 16 | red_x, red_y = [], [] 17 | blue_x, blue_y = [], [] 18 | green_x, green_y = [], [] 19 | 20 | 21 | for i in range(len(reduced_X)): 22 | if y[i] == 0: 23 | red_x.append(reduced_X[i][0]) 24 | red_y.append(reduced_X[i][1]) 25 | elif y[i] == 1: 26 | blue_x.append(reduced_X[i][0]) 27 | blue_y.append(reduced_X[i][1]) 28 | else: 29 | green_x.append(reduced_X[i][0]) 30 | green_y.append(reduced_X[i][1]) 31 | 32 | plt.scatter(red_x, red_y, c='r', marker='x') 33 | plt.scatter(blue_x, blue_y, c='b', marker='D') 34 | plt.scatter(green_x, green_y, c='g', marker='.') 35 | plt.show() 36 | -------------------------------------------------------------------------------- /机器学习入门/无监督/cluster/kmeans.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.cluster import KMeans 3 | 4 | def loadData(filepath): 5 | f = open(filepath,'r+') 6 | lines = f.readlines() 7 | retData = [] 8 | retCityName = [] 9 | for line in lines: 10 | items = line.strip().split() 11 | retCityName.append(items[0]) 12 | # retData.append([float(items[i])] for i in range(1,len(items))) 13 | retData.append([float(items[i]) for i in range(1, len(items))]) 14 | return 
retData,retCityName 15 | 16 | if __name__ == '__main__': 17 | data,cityName = loadData('city.txt') 18 | km = KMeans(n_clusters=4) # 聚类中心 19 | label = km.fit_predict(data) # 获取每一条数据的聚类标签 20 | expenses = np.sum(km.cluster_centers_, axis=1) 21 | CityCluster = [[], [], [], []] # 城市按label分成簇 22 | for i in range(len(cityName)): 23 | CityCluster[label[i]].append(cityName[i]) 24 | for i in range(len(CityCluster)): 25 | print("Expenses:%.2f" % expenses[i]) 26 | print(CityCluster[i]) -------------------------------------------------------------------------------- /Try cocos/HelloWorld.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/29 15:04 4 | # @Author : mazicwong 5 | # @File : HelloWorld.py 6 | 7 | import cocos 8 | 9 | class HelloWorld(cocos.layer.Layer): 10 | def __init__(self): 11 | super(HelloWorld,self).__init__() 12 | 13 | #新建文字标签用于显示helloworld 14 | label = cocos.text.Label('Hello,world', 15 | font_name = 'Times New Roman', 16 | font_size = 32, 17 | anchor_x='center', 18 | anchor_y='center' 19 | ) 20 | label.position = 320,240 21 | self.add(label) 22 | 23 | cocos.director.director.init() #新建一个窗口 24 | main_scene = cocos.scene.Scene(HelloWorld())#新建场景,场景里只有一个层hello_layer 25 | cocos.director.director.run(main_scene) #开始工作 26 | 27 | # class PPX(cocos.sprite.Sprite): 28 | # def __init__(self): 29 | # super(PPX,self).__init__('ppx.png') 30 | -------------------------------------------------------------------------------- /泰迪杯尝试/bbs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/12 16:55 4 | # @Author : mazicwong 5 | # @File : bbs.py 6 | # datas = file('result_sample.txt').readlines() 7 | 8 | from urllib import request 9 | from bs4 import BeautifulSoup 10 | import re 11 | 12 | url = "http://x.heshuicun.com/forum.php?mod=viewthread&tid=80" 13 | headers = { 14 | 'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36', 15 | 'Referer': r'http://x.heshuicun.com/forum.php?mod=viewthread&tid=80', 16 | } 17 | req = request.Request(url, headers=headers) 18 | page = request.urlopen(req).read() 19 | # page = page.decode('utf-8') 20 | 21 | 22 | # html = urlopen(url) 23 | # page = html.read() 24 | # bs0bj = BeautifulSoup(html, "html.parser") 25 | # print(html) 26 | # pattern = re.compile(r"^\d{4}(-\d\d){2} \d\d(:\d\d){2}") 27 | # match = pattern.match('2015-05-22 17:43:50') 28 | # mmm = re.match(r"^\d{4}(-\d\d){2} \d\d(:\d\d){2}",page) 29 | # print (match.group()) 30 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/jian.csv: -------------------------------------------------------------------------------- 1 | title,time,author 2 | 【陌生人】枫叶,2017-12-22T06:44:01+08:00,原始生命 3 | 如何有效阅读一本书?(附思维导图),2018-01-14T20:03:35+08:00,平白书 4 | 周杰伦:等你下课,勇敢追梦,2018-01-18T07:45:10+08:00,潘城王小古 5 | 请停止无效社交——移动互联网时代,如何建立自己的人脉关系,2018-01-18T08:02:33+08:00,萌薇 6 | 一张画彻底改变了我的后半生,也可能改变你的,2018-01-18T12:08:20+08:00,心蓝丫头 7 | 那些懂得有效学习的人,永远不会被社会淘汰,2018-01-18T08:19:23+08:00,Nicole林小白 8 | 社会如此不公平,教你几种面对竞争的博弈方法,2018-01-18T10:21:57+08:00,道长是名思维贩子 9 | 作为背包客,我是一个像空气一样自由的人,2018-01-18T11:09:31+08:00,有备而来的路人甲 10 | 《十二夜》:爱情所有的样子,这里都有,2018-01-11T16:33:22+08:00,南有南风 11 | 新年“剧”场|琅琊风起,吸海垂虹,2018-01-13T00:51:36+08:00,覃浠 12 | 成长不是站在起点去选择,而是在过程中去把握,2018-01-18T15:48:09+08:00,韩大爷的杂货铺 13 | 僧人与屠夫,2018-01-12T08:49:55+08:00,从心活过 
14 | 诗‖和平下的战争,2018-01-11T21:59:31+08:00,半岛雪 15 | 周杰伦《等你下课》了,能不能把青春还给我?,2018-01-18T08:51:37+08:00,衷曲无闻 16 | 初恋这件小事,2018-01-12T23:56:12+08:00,尊敬的王二 17 | 【古风】帝王的妻姐(47),2018-01-13T07:00:28+08:00,无疾不伤 18 | 1、鸡场奇迹,2018-01-18T11:20:28+08:00,修道院羔羊 19 | 小程序学习笔记2-使用weui开发小程序,2018-01-12T15:50:17+08:00,Doris_Lee 20 | 岁生之初,且听我闲扯,2018-01-04T20:20:46+08:00,一浅疏影 21 | 二十多岁的我们,拥有多少存款?,2018-01-18T07:49:35+08:00,羊达令 22 | -------------------------------------------------------------------------------- /验证码处理/ascii.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/29 17:13 4 | # @Author : mazicwong 5 | # @File : ascii.py 6 | 7 | 8 | from PIL import Image 9 | 10 | # 图片路径/名称 11 | path = "char1.png" 12 | # 字符集 13 | ascii_char = list("$@B%8&WM#*oahkbdpqwmZO0QLCJUYXzcvunxrjft/\|()1{}[]?-_+~<>i!lI;:,\"^`'. ") 14 | 15 | 16 | # RGB值转字符的函数 17 | def get_char(r, g, b, alpha=256): 18 | if alpha == 0: 19 | return ' ' 20 | length = len(ascii_char) 21 | gray = int(0.2126 * r + 0.7152 * g + 0.0722 * b) 22 | unit = (256.0 + 1) / length 23 | return ascii_char[int(gray / unit)] 24 | 25 | 26 | if __name__ == '__main__': 27 | im = Image.open(path) 28 | #WIDTH, HEIGHT = im.size 29 | WIDTH, HEIGHT = 80,80 30 | print(WIDTH, HEIGHT) 31 | im = im.resize((HEIGHT, WIDTH), Image.NEAREST) # 使用最近滤波 32 | txt = "" 33 | for h in range(HEIGHT): 34 | for w in range(WIDTH): 35 | txt += get_char(*im.getpixel((w, h))) 36 | txt += '\n' 37 | print(txt) 38 | 39 | with open("output.txt", "w") as f: 40 | f.write(txt) 41 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/spiders/exampleSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.selector import Selector 4 | from tutotial.items import TutotialItem 5 | 6 | 7 | class Example(scrapy.Spider): 8 | name = 'example' 9 | start_urls=['http://www.jianshu.com'] 10 | url = 'http://www.jianshu.com' 11 | 12 | def parse(self, response): # response即网页数据 13 | item = TutotialItem() 14 | selector = Selector(response) 15 | articles = selector.xpath('//*[@id="list-container"]/ul/li') 16 | print("huangzhiqihuangzhiqi-----") 17 | 18 | for article in articles: 19 | title = article.xpath('div/a/text()').extract() 20 | author = article.xpath('div/div[1]/div/a[1]/text()').extract() 21 | time = article.xpath('div/div[1]/div/span/@data-shared-at').extract() 22 | print('--------------------------------------------------------') 23 | print(author) 24 | 25 | item['title'] = title 26 | item['author'] = author 27 | item['time'] = time 28 | 29 | yield item 30 | 31 | -------------------------------------------------------------------------------- /泰迪杯尝试/1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 13:12 4 | # @Author : mazicwong 5 | # @File : 1.py 6 | 7 | import urllib.request 8 | 9 | 10 | def saveFile(data, cnt): 11 | path = r'E:\泰迪杯\C题样例数据\All_html\out%s.txt' % cnt 12 | f = open(path, 'wb') 13 | f.write(data) 14 | f.close() 15 | 16 | 17 | def getHtml(url, cnt): 18 | headers = { 19 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 20 | } 21 | # opener = urllib.request.build_opener() 22 | # opener.addheaders = [headers] 23 | # html = opener.open(url).read() 24 | 25 | req = 
urllib.request.Request(url=url, headers=headers) 26 | response = urllib.request.urlopen(req, timeout=2) 27 | html = response.read() 28 | # print(html) 29 | saveFile(html, cnt) 30 | 31 | 32 | def getUrl(): 33 | file = open(r"E:\泰迪杯\C题样例数据\All_html\url.txt", "r") 34 | urlList = file.readlines() 35 | cnt = 1 36 | for url in urlList: 37 | getHtml(url, cnt) 38 | cnt += 1 39 | 40 | 41 | def main(): 42 | getUrl() 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /机器学习入门/监督/回归/房价预测.py: -------------------------------------------------------------------------------- 1 | # 数据集: 面积,价格; 进行回归 2 | # 已知面积,预测房屋价格 3 | 4 | import matplotlib.pyplot as plt 5 | from sklearn import linear_model 6 | import numpy as np 7 | 8 | 9 | # 数据预处理 10 | data_x = [] 11 | data_y = [] 12 | f = open('prices.txt','r') 13 | lines = f.readlines() 14 | for line in lines: 15 | items = line.strip().split(',') 16 | # print ("%d %d " % (int(items[0]),int(items[1]))) 17 | data_x.append(int(items[0])) 18 | data_y.append(int(items[1])) 19 | # plt.scatter(data_x,data_y,c='r') 20 | # plt.plot(x, linear.predict(x), c='b') 21 | # plt.xlabel('Area') 22 | # plt.ylabel('Price') 23 | # plt.show() 24 | 25 | length = len(data_x) 26 | data_x = np.array(data_x).reshape([length,1]) # 转化为二维数组(回归函数参数需要) 27 | data_y = np.array(data_y) 28 | minx = min(data_x) 29 | maxx = max(data_x) 30 | print(minx , ' ', maxx) 31 | x = np.arange(minx,maxx).reshape([-1,1]) # 等差数列 32 | 33 | # 训练 34 | linear = linear_model.LinearRegression() 35 | linear.fit(data_x, data_y) 36 | 37 | # 回归方程系数,截距 38 | print('Coefficient:', linear.coef_, '; intercept:', linear.intercept_) 39 | 40 | plt.scatter(data_x,data_y,c='r') 41 | plt.plot(x, linear.predict(x), c='b') 42 | plt.xlabel('Area') 43 | plt.ylabel('Price') 44 | plt.show() -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import os 9 | import json 10 | import codecs 11 | 12 | class CrawlPipeline(object): 13 | def __init__(self): 14 | self.current_dir = os.getcwd() 15 | 16 | def process_item(self, item, spider): 17 | dir_path = self.current_dir + '/docs/' + item['source'] + '/' + item['date'] 18 | print(dir_path) 19 | if not os.path.exists(dir_path): 20 | os.makedirs(dir_path) 21 | 22 | news_file_path = dir_path + '/' + item['newsId'] + '.json' 23 | if os.path.exists(news_file_path) and os.path.isfile(news_file_path): 24 | print("*****************************") 25 | print(item['newsId'] + '.json exists, just skip') 26 | print("*****************************") 27 | 28 | news_file = codecs.open(news_file_path, 'w', 'utf-8') 29 | line = json.dumps(dict(item)) 30 | news_file.write(line) 31 | news_file.close() 32 | return item 33 | -------------------------------------------------------------------------------- /机器学习入门/matplotlib使用.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 10:55 4 | # @Author : mazicwong 5 | # @File : matplotlib使用.py 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | N = 5 11 | menMeans = (20, 35, 30, 35, 27) 12 | 
menStd = (2, 3, 4, 1, 2) 13 | 14 | ind = np.arange(N) # the x locations for the groups 15 | width = 0.35 # the width of the bars 16 | 17 | fig, ax = plt.subplots() 18 | rects1 = ax.bar(ind, menMeans, width, color='r', yerr=menStd) 19 | 20 | womenMeans = (25, 32, 34, 20, 25) 21 | womenStd = (3, 5, 2, 3, 3) 22 | rects2 = ax.bar(ind + width, womenMeans, width, color='y', yerr=womenStd) 23 | 24 | # add some 25 | ax.set_ylabel('Scores') 26 | ax.set_title('Scores by group and gender') 27 | ax.set_xticks(ind + width) 28 | ax.set_xticklabels(('G1', 'G2', 'G3', 'G4', 'G5')) 29 | 30 | ax.legend((rects1[0], rects2[0]), ('Men', 'Women')) 31 | 32 | 33 | def autolabel(rects): 34 | # attach some text labels 35 | for rect in rects: 36 | height = rect.get_height() 37 | ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height, '%d' % int(height), 38 | ha='center', va='bottom') 39 | 40 | 41 | autolabel(rects1) 42 | autolabel(rects2) 43 | 44 | plt.show() -------------------------------------------------------------------------------- /泰迪杯尝试/数据爬取(未处理).py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 12:44 4 | # @Author : mazicwong 5 | # @File : 数据爬取(未处理).py 6 | 7 | import urllib.request 8 | 9 | 10 | # 按顺序放入txt 11 | def saveFile(data, cnt): 12 | path = r'E:\泰迪杯\C题样例数据\All_html\out%s.txt' % cnt 13 | f = open(path, 'wb') 14 | f.write(data) 15 | f.close() 16 | 17 | 18 | # 保存爬取不了的网页下来分析 19 | def saveFail(url, cnt): 20 | path = r'E:\泰迪杯\C题样例数据\All_html\fail.txt' 21 | f = open(path, 'ab+') 22 | f.write(cnt + ' ' + url) 23 | f.close() 24 | 25 | 26 | def getHtml(url, cnt): 27 | headers = { 28 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 29 | } 30 | req = urllib.request.Request(url=url, headers=headers) 31 | try: 32 | response = urllib.request.urlopen(req, timeout=2) 33 | html = response.read() 34 | print('第%s个论坛爬取成功' % cnt) 35 | saveFile(html, cnt) 36 | except: 37 | print('sorry! 
第%s个论坛爬取失败' % cnt) 38 | saveFail(url, cnt) 39 | 40 | 41 | def getUrl(): 42 | file = open(r"E:\泰迪杯\C题样例数据\All_html\url.txt", "r") 43 | urlList = file.readlines() 44 | cnt = 1 45 | for url in urlList: 46 | getHtml(url, cnt) 47 | cnt += 1 48 | 49 | 50 | def main(): 51 | getUrl() 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /2048/2048.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding:utf-8 _*_ 3 | #教程https://www.shiyanlou.com/courses/running 4 | import curses 5 | from random import randrange,choice 6 | from collections import defaultdict 7 | 8 | 9 | ##用户行为 10 | actions = ['Up','Left','Down','Right','Restart','Exit'] 11 | #考虑到大写开启,要获得有效键值列表 12 | letter_codes=[ord(ch) for ch in 'WASDRQwasdrq'] 13 | #输入与行为进行关联 14 | actions_dict=dict(zip(letter_codes,actions*2)) 15 | 16 | ##状态机 17 | 18 | 19 | 20 | def main(stdscr): 21 | def init(): 22 | #init the game 23 | return 'Game' 24 | def not_game(state): 25 | #write down the interface of GAMEOVER/WIN 26 | #get the user's input, judge whether to restart the game or close it 27 | responses=defaultdict(lambda:state) 28 | responses['Restart'],responses['Exit']='Init','Exit' 29 | return responses[action] # 'action' is the user's key press (obtained in a later step of the tutorial) 30 | def game(): 31 | #write down the board status 32 | #get the user's input about 'action' 33 | if action=='Restart': 34 | return 'Init' 35 | if action=='Exit': 36 | return 'Exit' 37 | #if 成功移动一步 ('ying'/'shibai' below are win/lose placeholders from the tutorial) 38 | if ying: 39 | return 'Win' 40 | if shibai: 41 | return 'Gameover' 42 | return 'Game' 43 | state_actions={ 44 | 'Init':init, 45 | 'Win':lambda:not_game('Win'), 46 | 'Gameover':lambda:not_game('Gameover'), 47 | 'Game':game 48 | } 49 | state='Init' 50 | while state != 'Exit': 51 | state=state_actions[state]() 52 | -------------------------------------------------------------------------------- /机器学习入门/Numpy.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import numpy as np 3 | 4 | ### 引用mnist数据 5 | from keras.datasets import mnist 6 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 7 | X_train = X_train.reshape(X_train.shape[0], X_train.shape[1] * X_train.shape[2]) 8 | X_test = X_test.reshape(X_test.shape[0], X_test.shape[1] * X_test.shape[2]) 9 | Y_train = (numpy.arange(10) == y_train[:, None]).astype(int) # 把index转换为一个one hot的矩阵 10 | Y_test = (numpy.arange(10) == y_test[:, None]).astype(int) # Y_test.shape 11 | 12 | ### reshape函数 13 | a = np.array([1,2,3]) 14 | print(a.shape) #(3,) 15 | a = a.reshape((1,-1)) # (1,3) 16 | print(a.shape) #(1,3) 1*3矩阵 17 | 18 | a = np.array([1,2,3,4,5,6]) 19 | print(a.shape) 20 | a = a.reshape((2,-1)) 21 | print(a.shape) #(2,3) 2*3矩阵(二维数组) 22 | 23 | ### full 24 | a = np.full((3,3),0) 25 | 26 | ### eye 27 | a = np.eye(3) #单位矩阵 28 | 29 | ### random.random 30 | a = np.random.random((3,4)) 31 | 32 | ### indexing 33 | a = np.array([[1,2,3,4], 34 | [5,6,7,8], 35 | [9,10,11,12]]) 36 | a[-2:, 1:3] #array[[6,7][10,11]] 37 | 38 | ### arange 39 | np.arange(3,7) 40 | 41 | # 数学运算 42 | a = np.array([[1,2], 43 | [3,4]]) 44 | b = np.array([[5,6], 45 | [7,8]]) 46 | a+b # np.add(a,b) 47 | a*b #对应元素相乘 48 | a.dot(b) # 真正的矩阵乘法 49 | np.dot(a,b) 50 | 51 | # 常用函数 52 | np.sum(a) # 所有元素求和 53 | np.sum(a,axis=0) # 每一列求和 54 | np.sum(a,axis=1) # 每一行求和 55 | 56 | np.mean(a) # 元素和的均值 57 | np.mean(a,axis=0) # 每一列的均值 58 | 59 | np.random.uniform(3,4) # 产生[3,4]随机小数 60 | 61 | a.T #矩阵转置 
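62 | 63 | ### quick sanity check of the operations above (illustrative example only; it assumes nothing beyond the numpy import at the top of this file) 64 | m = np.array([[1,2],[3,4]]) 65 | n = np.array([[5,6],[7,8]]) 66 | print(m * n) # [[ 5 12] [21 32]] -> element-wise product 67 | print(m.dot(n)) # [[19 22] [43 50]] -> true matrix product 68 | print(np.sum(m, axis=0), np.sum(m, axis=1)) # [4 6] column sums, [3 7] row sums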
-------------------------------------------------------------------------------- /泰迪杯尝试/爬取相似URL/3.所有小URL初步信息去标签.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/4/2 0:56 4 | # @Author : mazicwong 5 | # @File : 3.所有小URL初步信息去标签.py 6 | 7 | import re 8 | import os 9 | 10 | for i in range(0, 180): #180个大URL 11 | if os.path.exists(r"E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\%s" % i): #已经有爬取结果的 12 | for cnt in range(0, 30): #对爬取好的相似URL选取不大于30个html代码来去标签 13 | if os.path.isfile(r"E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\%s\%s.txt" % (i, cnt)): 14 | if not os.path.exists("E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\去标签后的\%s" % i): # 创建一个文件夹 (use i here so it matches the output path below) 15 | os.makedirs("E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\去标签后的\%s" % i) 16 | with open(r'E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\%s\%s.txt' % (i, cnt), 'r') as file: 17 | str = file.read() 18 | str = '\n'.join(re.split(' +', str)) 19 | str = '\n'.join(re.split('\t+', str)) 20 | str = '\n'.join(re.split('\r+', str)) 21 | str = '\n'.join(re.split(' ', str)) 22 | str = '\n'.join(re.split('\n+', str)) 23 | print(str) 24 | with open(r"E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\去标签后的\%s\%s去标签.txt" % (i, cnt),"wb") as file1: # 一般用双引号,单引号会出问题 25 | file1.write(str.encode('utf-8')) # the file is opened in "wb" mode, so encode the text to bytes before writing 26 | # with open(r"E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\去标签后的\%s\%s_去标签.txt" % (i, cnt), "w") as file1: 27 | # file1.write(str) 28 | 29 | -------------------------------------------------------------------------------- /泰迪杯尝试/爬取相似URL/2.从相似URL中下载内容.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/4/2 0:09 4 | # @Author : mazicwong 5 | # @File : 2.从相似URL中下载内容.py 6 | 7 | import os 8 | import urllib.request 9 | 10 | 11 | def getHtml(url): 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 14 | } 15 | try: 16 | req = urllib.request.Request(url=url, headers=headers) 17 | response = urllib.request.urlopen(req, timeout=2) 18 | html = response.read() 19 | return html 20 | except: 21 | print("there must be something wrong when crawling") 22 | return b'' # return empty bytes so the caller's f.write() never receives None 23 | 24 | def main(): 25 | for cnt in range(1, 171): 26 | with open("E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\out%s.txt" % cnt, "r") as file: 27 | List = file.readlines() 28 | if len(List) != 0: 29 | if not os.path.exists("E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\%s" % cnt): # 创建一个文件夹 30 | os.makedirs("E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\%s" % cnt) 31 | for i in range(0, len(List)): 32 | if i > 20: 33 | break 34 | with open(r"E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\%s\%s.txt" % (cnt, i), "wb") as f: 35 | f.write(getHtml(List[i].strip())) # strip the trailing newline left by readlines() before requesting the URL 36 | print("第%s个小的url处理成功" % i) 37 | print("第%s个URL处理成功" % cnt) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/spiders/xzhc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import scrapy 5 | from scrapy.selector import Selector 6 | from jnuxshc.items import JnuxshcItem 7 | from scrapy.http import Request 8 | 9 | 10 | class xzhc(scrapy.Spider): 11 | name = 'xzhc' #最后要调用的名字 12 | start_urls=['https://news.jnu.edu.cn/xshc/ll/'] 13 | url = 'https://jnu.edu.cn' 14 | 15 | def parse(self, response): # response即网页数据 16 | item = JnuxshcItem() 17 | selector = Selector(response) 18 | articles = 
selector.xpath('//*[@id="content"]/div[1]/ul/li') 19 | print("huangzhiqihuangzhiqi-----") 20 | 21 | for article in articles: 22 | #if article.xpath('@class/text()').extract() 23 | title = article.xpath('div[2]/div[1]/a/text()').extract() 24 | time = article.xpath('div[2]/div[3]/text()').extract() 25 | intro = article.xpath('div[2]/div[2]/text()').extract() 26 | print('--------------------------------------------------------') 27 | print(title) 28 | 29 | item['title'] = title 30 | item['time'] = time 31 | item['intro'] = intro 32 | 33 | yield item 34 | 35 | #因为有很多页,所以要递归调用 36 | tmp_url = 'https://news.jnu.edu.cn/' 37 | next_link = selector.xpath('//*[@class="pager"]/a[@class="next"]/@href').extract() 38 | if next_link[0] != '/xshc/ll/List_1.html': 39 | next_link = tmp_url+next_link[0] 40 | yield Request(next_link,callback=self.parse) #回调函数为self.parse 41 | 42 | -------------------------------------------------------------------------------- /泰迪杯尝试/README.md: -------------------------------------------------------------------------------- 1 | # explanation about 2 | ### in Python 3.5 3 | - **数据爬取 未去标签** 4 | - saveFile 5 | - saveFail 6 | - getHtml 7 | - getUrl 8 | 9 | - **数据爬取 去除标签** 10 | - replaceCharEntity 11 | - repalce 12 | - saveFile 13 | - get_localfile 14 | 15 | - **爬取相邻url用于去重** 16 | - 考虑添加功能=>判断html总长与原来文本进行对比,避免爬到死链 17 | - 长度相差大于70%? 18 | - getHtml 19 | - RETURN True/False AND url_data 20 | - getSimilarHtml 21 | - FIND the root_url AND get other url among it AND compare it with the previous one 22 | 23 | 24 | Get the similar URL 25 | - 如何主页爬取到相似URL? 26 | 1. 爬取主页所有url,然后进行遍历,用随机数(may be it can accelerate the proceed..who knows..) 27 | 2. DFS遍历,但是最多深入到三层 28 | 3. 判断方法:在当前url对html进行匹配,看看有没有最初的url, 29 | 有的话就找到标签,然后用bs4的find("",xx.next_siblings)找到兄弟标签, 30 | 接着获取url进行判断,就用正则匹配下是否两个url只有数字不同 31 | 32 | - A new method? 33 | 1. try guessing the regular expression of the existing URL, 34 | and then get the root_html from the root_url,so that I can match what I want, 35 | which means its format are familiar with the exist one, 36 | from the html source I have already had. 37 | - 最终实现方式 38 | 1. 根据已有的URL获得主页的html 39 | 2. 然后由URL推导出相同格式的正则表达式 40 | 3. 在主页的html中匹配我的正则表达式,获得相似URL 41 | - 几个坑 42 | 1. 反向推导正则的时候,因为最终是得到string类型的pattern, 43 | 所以要用p1 = p1.encode(encoding="utf-8")转换为bytes类型, 44 | 2. 在推导正则时,如果用p1=r'http://www.baidu.com/\d\d[a-z]', 45 | 接下来在做编码的时候,\d会变成\\d,且由于加了r取消掉转义字符, 46 | 会导致匹配结果错误,还有一点就是最后有一个换行,用str=str[:-1]删掉,以后应该注意 47 | 3. 判断字符串的每个字符,不能用isalpha和isnum,因为全都是字符 48 | 4. 
添加功能:已经存在且不为0的文档就不重复爬取 -------------------------------------------------------------------------------- /泰迪杯尝试/pyquery取全体文本.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 16:38 4 | # @Author : mazicwong 5 | # @File : pyquery取全体文本.py 6 | 7 | from pyquery import PyQuery 8 | import urllib.request 9 | 10 | 11 | # 按顺序放入txt 12 | def saveFile(data, cnt): 13 | path = r'E:\泰迪杯\C题样例数据\All_html\out%s.txt' % cnt 14 | f = open(path, 'wb') 15 | f.write(data) 16 | f.close() 17 | # 上面三句也可以写成 18 | # with open(path,'wb') as f: 19 | # f.write(data) 20 | 21 | 22 | # 保存爬取不了的网页下来分析 23 | def saveFail(url, cnt): 24 | path = r'E:\泰迪杯\C题样例数据\All_html 去标签\fail.txt' 25 | f = open(path, 'ab+') 26 | f.write('%s %s' % cnt % url) 27 | f.close() 28 | 29 | 30 | def getHtml(url, cnt): 31 | headers = { 32 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 33 | } 34 | req = urllib.request.Request(url=url, headers=headers) 35 | try: 36 | response = urllib.request.urlopen(req, timeout=2) 37 | html = response.read() 38 | doc = PyQuery('
tototata
') # 去标签 39 | print(doc.text()) 40 | print('第%s个论坛爬取成功' % cnt) 41 | saveFile(doc, cnt) 42 | except: 43 | print('sorry! 第%s个论坛爬取失败' % cnt) 44 | saveFail(url, cnt) 45 | 46 | 47 | def getUrl(): 48 | file = open(r"E:\泰迪杯\C题样例数据\All_html 去标签\url.txt", "r") 49 | urlList = file.readlines() 50 | cnt = 1 51 | for url in urlList: 52 | getHtml(url, cnt) 53 | cnt += 1 54 | 55 | 56 | def main(): 57 | getUrl() 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20160602/BOIMS8PF00014JB5.json: -------------------------------------------------------------------------------- 1 | {"newsId": "BOIMS8PF00014JB5", "date": "20160602", "source": "netease", "comments": {"link": "http://comment.news.163.com/news3_bbs/BOIMS8PF00014JB5.html"}, "contents": {"title": ["\u7f51\u6613\u65b0\u95fb\u9891\u9053\u9996\u9875\u6539\u7248\u610f\u89c1\u53cd\u9988"], "link": "http://news.163.com/16/0602/16/BOIMS8PF00014JB5.html", "passage": "

\u5404\u4f4d\u4eb2\u7231\u7684\u8bfb\u8005\u76c6\u53cb\u4eec\uff0c\u7f51\u6613\u65b0\u95fb\u9996\u9875\u65b0\u7248\u4e8e7\u67081\u65e5\u4e0a\u7ebf\u3002\u8fd9\u6b21\u6539\u7248\u5168\u9762\u5bf9\u63a5\u79fb\u52a8\u7aef\uff0c\u4e3a\u6ee1\u8db3\u7f51\u53cb\u7684\u9605\u8bfb\u4e60\u60ef\u548c\u9700\u6c42\uff0c\u65b0\u7248\u9875\u9762\u4e0e\u79fb\u52a8\u7aef\u4fdd\u6301\u4e00\u81f4\uff0c\u4f7f\u7528\u6237\r\n\u5728\u6d4f\u89c8PC\u7aef\u9875\u9762\u65f6\uff0c\u4e5f\u80fd\u50cf\u9605\u8bfb\u79fb\u52a8\u7aef\u65b0\u95fb\u4e00\u822c\u4fbf\u6377\u9ad8\u6548\u3002\u540c\u65f6\uff0c\u6211\u4eec\u6269\u5927\u9605\u8bfb\u754c\u9762\uff0c\u4f7f\u5f97\u5927\u5c4f\u5e55\u7684\u7535\u8111\u6709\u66f4\u5bbd\u5e7f\u7684\u53ef\u89c6\u7a7a\u95f4\uff0c\u65b9\u4fbf\u5927\u5bb6\u63a5\u6536\u66f4\u591a\u7684\u4fe1\u606f\u3002\u5404\u7c7b\u7b56\r\n\u5212\u90fd\u5f52\u4e8e\u5de6\u8fb9\u680f\uff0c\u65b9\u4fbf\u5927\u5bb6\u9605\u8bfb\u7f51\u6613\u72ec\u5bb6\u539f\u521b\u3002

\n

\u5f53\u7136\uff0c\u8fd9\u53ea\u662f\u5c0f\u7f16\u4eec\u7684\u60f3\u6cd5\uff0c\u9886\u5bfc\u8bf4\u4e86\u8fd8\u8981\u95ee\u95ee\u4f60\u4eec\u600e\u4e48\u770b\u3002\u6240\u4ee5\u6211\u4eec\u5c31\u51fa\u4e86\u4e00\u4e9b\u95ee\u9898\u8ba9\u4f60\u4eec\u56de\u7b54\u3002\u8fd8\u6709\u5176\u4ed6\u60f3\u6cd5\uff0c\u6b22\u8fce\u5728\u8ddf\u8d34\u91cc\u63d0\u51fa\u54e6\uff01

"}, "cmtId": "BOIMS8PF00014JB5"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20161227/012771.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "012771", "comments": {"link": "http://coral.qq.com/1687685805"}, "date": "20161227", "contents": {"link": "https://news.qq.com/a/20161227/012771.htm", "title": ["\u7f8e\u56fd\u597d\u5fc3\u4eba\u533f\u540d\u4e3a\u65e0\u5bb6\u53ef\u5f52\u8005\u63501.5\u5428\u725b\u6392"], "passage": "\u4e2d\u65b0\u7f5112\u670827\u65e5\u7535 \u636e\u5916\u5a92\u62a5\u9053\uff0c\u7f8e\u56fd\u897f\u96c5\u56fe\u4e00\u4e2a\u6148\u5584\u56e2\u4f53\u8054\u4eca\u5e74\u5723\u8bde\u8282\u4e3a\u65e0\u5bb6\u53ef\u5f52\u8005\u51c6\u5907\u7684\u83dc\u8272\u683c\u5916\u4e30\u5bcc\uff0c\u56e0\u4e3a\u4e00\u4f4d\u533f\u540d\u5584\u5fc3\u4eba\u58eb\u6350\u4e863500\u78c5(\u7ea61589\u516c\u65a4)\u7684\u808b\u773c\u725b\u6392\uff0c\u4e3a\u65e0\u5bb6\u53ef\u5f52\u7684\u6c11\u4f17\u8d34\u5fc3\u52a0\u83dc\u3002\u636e\u62a5\u9053\uff0c\u8be5\u56e2\u4f53\u4e3b\u53a8\u8d39\u96ea(Jordan Fisher)\u63a5\u53d7\u5a92\u4f53\u8bbf\u95ee\u65f6\u8868\u793a\uff0c\u4eca\u5e74\u5723\u8bde\u8282\u6536\u5230\u6709\u4eba\u6350\u8d60\u4e86\u9ad8\u8fbe3500\u78c5\u7684\u808b\u773c\u725b\u6392(rib-eye steak)\uff0c\u201c\u6211\u77e5\u9053\u7684\u65f6\u5019\uff0c\u5413\u4e86\u4e00\u5927\u8df3\u3002\u201d\u8d39\u96ea\u8bf4\uff1a\u201c\u8fd9\u662f\u5f88\u96be\u5f97\u7684\u4e8b\u3002\u50cf\u6211\u4eec\u8fd9\u6837\u7684\u673a\u6784\uff0c\u5e76\u4e0d\u4f1a\u5e38\u5e38\u78b0\u5230\u8fd9\u6837\u7684\u72b6\u51b5\u3002\u201d\u5728\u4eca\u5e74\u5723\u8bde\u8282\u5f53\u5929\uff0c\u524d\u5f80\u897f\u96c5\u56fe\u8be5\u6148\u5584\u56e2\u4f53\u6240\u5c5e\u6551\u6d4e\u7ad9\u5403\u996d\u7684\u6e38\u6c11\uff0c\u4e0d\u7ba1\u5927\u4eba\u6216\u5c0f\u5b69\uff0c\u6bcf\u4e2a\u4eba\u90fd\u5403\u5230\u4e86\u4e00\u4efd\u808b\u773c\u725b\u6392\u3002\u76f8\u5173\u4eba\u58eb\u8868\u793a\uff0c\u6350\u8d60\u725b\u6392\u7684\u597d\u5fc3\u4eba\u8981\u6c42\u533f\u540d\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HD3PFD0001875P.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HD3PFD0001875P", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8HD3PFD0001875P.html"}, "newsId": "D8HD3PFD0001875P", "contents": {"passage": "

\"\u9ad8\u6821\u4e00\u5bbf\u820d\u8d34\u5316\u5b66\u5143\u7d20\u65b0\u6625\u5bf9\u8054

\"\u9ad8\u6821\u4e00\u5bbf\u820d\u8d34\u5316\u5b66\u5143\u7d20\u65b0\u6625\u5bf9\u8054

\n

\u3010\u9ad8\u6821\u4e00\u5bbf\u820d\u8d34\u5316\u5b66\u5143\u7d20\u65b0\u6625\u5bf9\u8054 \u7f51\u53cb\uff1a\u903c\u6b7b\u6587\u79d1\u751f\u3011\u8fd1\u65e5\uff0c@\u5357\u4eac\u6797\u4e1a\u5927\u5b66 \u7684\u5b66\u751f\u5bbf\u820d\u95e8\u53e3\u8d34\u4e86\u4e00\u526f\u7279\u522b\u7684\u5bf9\u8054\uff0c\u5185\u5bb9\u7528\u5316\u5b66\u5143\u7d20\u5468\u671f\u8868\u91cc\u7684\u5143\u7d20\u7b26\u53f7\u62fc\u6210\uff0c\u7f51\u53cb\u76f4\u547c\u770b\u4e0d\u61c2\uff01\u636e\u6089\uff0c\u521b\u4f5c\u5bf9\u8054\u7684\u662f\u8be5\u6821\u751f\u7269\u4e0e\u73af\u5883\u5b66\u9662\u7684\u5927\u4e00\u5b66\u751f\u535e\u6b63\uff0c\u5bf9\u8054\u521b\u610f\u662f\u4ed6\u548c\u9ad8\u4e2d\u540c\u5b66\u4eec\u60f3\u51fa\u6765\u7684\u3002\u53ea\u770b\u56fe1\uff0c\u4f60\u80fd\u731c\u51fa\u662f\u4ec0\u4e48\u5417\uff1f

", "link": "http://news.163.com/18/0119/16/D8HD3PFD0001875P.html", "title": ["\u9ad8\u6821\u4e00\u5bbf\u820d\u8d34\u5316\u5b66\u5143\u7d20\u65b0\u6625\u5bf9\u8054 \u7f51\u53cb:\u903c\u6b7b\u6587\u79d1\u751f"]}} -------------------------------------------------------------------------------- /机器学习入门/无监督/cluster/city.txt: -------------------------------------------------------------------------------- 1 | 北京 2959.19 730.79 749.41 513.34 467.87 1141.82 478.42 457.64 2 | 天津 2459.77 495.47 697.33 302.87 284.19 735.97 570.84 305.08 3 | 河北 1495.63 515.90 362.37 285.32 272.95 540.58 364.91 188.63 4 | 山西 1406.33 477.77 290.15 208.57 201.50 414.72 281.84 212.10 5 | 内蒙古 1303.97 524.29 254.83 192.17 249.81 463.09 287.87 192.96 6 | 辽宁 1730.84 553.90 246.91 279.81 239.18 445.20 330.24 163.86 7 | 吉林 1561.86 492.42 200.49 218.36 220.69 459.62 360.48 147.76 8 | 黑龙江 1410.11 510.71 211.88 277.11 224.65 376.82 317.61 152.85 9 | 上海 3712.31 550.74 893.37 346.93 527.00 1034.98 720.33 462.03 10 | 江苏 2207.58 449.37 572.40 211.92 302.09 585.23 429.77 252.54 11 | 浙江 2629.16 557.32 689.73 435.69 514.66 795.87 575.76 323.36 12 | 安徽 1844.78 430.29 271.28 126.33 250.56 513.18 314.00 151.39 13 | 福建 2709.46 428.11 334.12 160.77 405.14 461.67 535.13 232.29 14 | 江西 1563.78 303.65 233.81 107.90 209.70 393.99 509.39 160.12 15 | 山东 1675.75 613.32 550.71 219.79 272.59 599.43 371.62 211.84 16 | 河南 1427.65 431.79 288.55 208.14 217.00 337.76 421.31 165.32 17 | 湖北 1783.43 511.88 282.84 201.01 237.60 617.74 523.52 182.52 18 | 湖南 1942.23 512.27 401.39 206.06 321.29 697.22 492.60 226.45 19 | 广东 3055.17 353.23 564.56 356.27 811.88 873.06 1082.82 420.81 20 | 广西 2033.87 300.82 338.65 157.78 329.06 621.74 587.02 218.27 21 | 海南 2057.86 186.44 202.72 171.79 329.65 477.17 312.93 279.19 22 | 重庆 2303.29 589.99 516.21 236.55 403.92 730.05 438.41 225.80 23 | 四川 1974.28 507.76 344.79 203.21 240.24 575.10 430.36 223.46 24 | 贵州 1673.82 437.75 461.61 153.32 254.66 445.59 346.11 191.48 25 | 云南 2194.25 537.01 369.07 249.54 290.84 561.91 407.70 330.95 26 | 西藏 2646.61 839.70 204.44 209.11 379.30 371.04 269.59 389.33 27 | 陕西 1472.95 390.89 447.95 259.51 230.61 490.90 469.10 191.34 28 | 甘肃 1525.57 472.98 328.90 219.86 206.65 449.69 249.66 228.19 29 | 青海 1654.69 437.77 258.78 303.00 244.93 479.53 288.56 236.51 30 | 宁夏 1375.46 480.89 273.84 317.32 251.08 424.75 228.73 195.93 31 | 新疆 1608.82 536.05 432.46 235.82 250.28 541.30 344.85 214.40 -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HLN6QA0001875P.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HLN6QA0001875P", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8HLN6QA0001875P.html"}, "newsId": "D8HLN6QA0001875P", "contents": {"passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u6d59\u6c5f\u91d1\u534e\u4e00\u5382\u623f\u5012\u584c8\u4eba\u88ab\u538b\uff0c\u5176\u4e2d2\u4eba\u5f53\u573a\u6b7b\u4ea1\uff09\n

\"\u6d59\u6c5f\u91d1\u534e\u4e00\u5382\u623f\u5012\u584c8\u4eba\u88ab\u538b\u4e8b\u6545\u73b0\u573a

\u592e\u89c6\u65b0\u95fb\u5ba2\u6237\u7aef1\u670819\u65e5\u6d88\u606f\uff0c\u4eca\u5929\u4e2d\u534812\u70b942\u5206\u5de6\u53f3\uff0c\u91d1\u534e\u5e02\u6d88\u9632\u652f\u961f\u6c5f\u5357\u6d88\u9632\u5927\u961f\u91d1\u78d0\u8def\u6d88\u9632\u4e2d\u961f\u63a5\u5230\u62a5\u8b66\u79f0\uff1a\u91d1\u534e\u5e02\u91d1\u4e1c\u533a\u591a\u6e56\u6c40\u6751\u6709\u623f\u5c4b\u53d1\u751f\u5012\u584c\u3002\u91d1\u534e\u5e02\u6d88\u9632\u652f\u961f\u7acb\u5373\u6307\u6d3e6\u8f6630\u4f4d\u6d88\u9632\u5b98\u5175\u8d76\u5f80\u73b0\u573a\u6551\u63f4\u3002\u521d\u6b65\u4f30\u8ba1\u516b\u4eba\u88ab\u538b\uff0c\u5176\u4e2d2\u4eba\u5f53\u573a\u6b7b\u4ea1\uff0c6\u4eba\u88ab\u9001\u5f80\u533b\u9662\u6551\u6cbb\u3002\u622a\u81f3\u76ee\u524d\uff0c\u6551\u63f4\u4ecd\u5728\u8fdb\u884c\u4e2d\u3002

\n

", "link": "http://news.163.com/18/0119/19/D8HLN6QA0001875P.html", "title": ["\u6d59\u6c5f\u91d1\u534e\u4e00\u5382\u623f\u5012\u584c8\u4eba\u88ab\u538b 2\u4eba\u5f53\u573a\u6b7b\u4ea1"]}} -------------------------------------------------------------------------------- /ACM/cf/675E DP+greedy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/22 15:35 4 | # @Author : mazicwong 5 | # @File : 675E DP+greedy.py 6 | 7 | ''' 8 | 英文: buy only tickets to stations from i+1 to ai inclusive (inclusive 表示包含在这个路段内的) 9 | 10 | 题意:有一个一条直线的地铁线路。给出a数组,在每个站点i只能买到去往[i+1, a[i]]内的票。 11 | 设p(i,j)为从i到j所需要的最少票数,求对所有ij的p(i,j)的和。(1== n): 57 | dp[i] = n - i 58 | else: 59 | x = argmax(que, a[i]) 60 | dp[i] = x - i + dp[x] + n - a[i] 61 | while (len(que) > 0 and que[-1]['a'] < a[i]): 62 | que.pop() 63 | que.append({'i': i, 'a': a[i]}) 64 | return sum(dp) 65 | 66 | 67 | n = int(input()) 68 | a = map(int, input().split(' ')) 69 | print(solve(n, a)) 70 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20161227/014055.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "014055", "comments": {"link": "http://coral.qq.com/1687716811"}, "date": "20161227", "contents": {"link": "https://news.qq.com/a/20161227/014055.htm", "title": ["\u6cb3\u5357\u5c0f\u4f19\u627f\u5305\u5343\u8f86\u51fa\u79df\u8f66\u9876\u706f \u6253\u51fa\u6211\u7231\u4f60\u8868\u767d"], "passage": "\u8fd9\u8f86\u51fa\u79df\u8f66\u9876\u706f\u5c4f\u4e0a\u51fa\u73b0\u201c\u4f60\u662f\u6211\u7684\u552f\u4e00\u201d12\u670825\u65e5\uff0c\u662f\u897f\u65b9\u7684\u5723\u8bde\u8282\uff0c\u8bb8\u591a\u60c5\u4fa3\uff0c\u9009\u62e9\u5728\u8fd9\u4e00\u5929\u8868\u767d\u3002\u5f53\u5929\uff0c\u8bb0\u8005\u7684\u670b\u53cb\u5708\u88ab\u8fd9\u6837\u7684\u7167\u7247\u5237\u5c4f\u4e86\uff0c\u5185\u5bb9\u4e3a\u201c\u90ed\u00d7\u00d7\u6211\u7231\u4f60\u201d\u201c\u4f60\u662f\u6211\u7684\u552f\u4e00\u201d\u7684\u8868\u767d\uff0c\u5728\u4fe1\u9633\u7684\u51fa\u79df\u8f66\u9876\u706f\u5c4f\u4e0a\u51fa\u73b0\u3002\u8fd9\u5219\u201c\u571f\u8c6a\u5f0f\u201d\u7684\u8868\u767d\uff0c\u5f15\u8d77\u4e0d\u5c11\u8fc7\u5f80\u8def\u4eba\u7684\u5173\u6ce8\uff0c\u4e0d\u5c11\u7f51\u53cb\u8868\u793a\u770b\u5230\u4e86\u8fd9\u5219\u8868\u767d\u3002\u7f51\u53cb\u7eb7\u7eb7\u8bc4\u8bba\uff1a\u201c\u8c01\u8fd9\u4e48\u571f\u8c6a\uff0c\u627f\u5305\u4e86\u51fa\u79df\u8f66\u9876\u706f\uff1f\u201d\u201c\u8fd9\u4f4d\u53eb\u90ed\u00d7\u00d7\u7684\u59b9\u5b50\u4e5f\u592a\u5e78\u798f\u4e86\u5427\u3002\u201d\u8fd9\u4e2a\u5c0f\u4f19\u7684\u8868\u767d\u4e5f\u5f97\u5230\u7f51\u53cb\u4e00\u81f4\u795d\u798f\u3002\u6628\u65e5\u4e0b\u5348\uff0c\u8bb0\u8005\u4e86\u89e3\u5230\uff0c\u4fe1\u9633\u5e02\u51fa\u79df\u8f66\u4e0a\u7684\u9876\u706f\u5c4f\u5e7f\u544a\u90fd\u662f\u7531\u4fe1\u9633\u67d0\u5bb6\u5e7f\u544a\u516c\u53f8\u7edf\u4e00\u8fd0\u8425\uff0c\u8fd9\u4f4d\u5c0f\u4f19\u4e00\u5171\u5305\u4e861000\u591a\u8f86\u51fa\u79df\u8f66\uff0c\u4ef7\u683c\u4e0a\u5343\u5143\u3002\u201c\u5728\u516c\u53f8\u5e72\u4e86\u8fd9\u4e48\u4e45\uff0c\u7b2c\u4e00\u6b21\u89c1\u8fd9\u6837\u7684\u4e8b\u60c5\uff0c\u8fd9\u5c0f\u4f19\u7684\u60f3\u6cd5\u592a\u65b0\u9896\u4e86\uff0c\u628a\u72d7\u7cae\u6492\u904d\u4e86\u5168\u57ce\u5440\u3002\u201d\u4e00\u540d\u7684\u54e5\u544a\u8bc9\u8bb0\u8005\u3002"}} -------------------------------------------------------------------------------- 
/crawl/news/news_crawl/docs/tencent/20180120/006763.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "006763", "comments": {"link": "http://coral.qq.com/2369396685"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/006763.htm", "title": ["\u6c55\u5934\u8b66\u65b9\uff1a\u731b\u72ee\u5de5\u4e1a\u96c6\u56e2\u603b\u7ecf\u7406\u521d\u67e5\u4e3a\u610f\u5916\u5760\u4ea1 \u5c06\u8ffd\u8d23\u9020\u8c23\u8005"], "passage": "\u6c55\u5934\u5e02\u516c\u5b89\u5c40\u6f84\u6d77\u5206\u5c40\u5fae\u4fe1\u516c\u53f7\u201c\u5e73\u5b89\u6f84\u6d77\u201d2018\u5e741\u670818\u65e5\u6d88\u606f\uff1a1\u670818\u65e5\u51cc\u6668\uff0c\u6211\u5c40\u5e7f\u76ca\u6d3e\u51fa\u6240\u63a5\u5e7f\u4e1c\u731b\u72ee\u5de5\u4e1a\u96c6\u56e2\u6709\u9650\u516c\u53f8\u7ba1\u7406\u4eba\u5458\u6797\u5fb7\u8d35\u62a5\u79f0\uff1a\u5176\u516c\u53f8\u603b\u7ecf\u7406\u9648\u4e50\u5f3a\u4e8e2018\u5e741\u67088\u65e5\u4e0d\u5e78\u901d\u4e16\uff0c\u8fd1\u671f\u7f51\u7edc\u4e0a\u51fa\u73b0\u5bf9\u9648\u4e50\u5f3a\u6b7b\u56e0\u6076\u610f\u4e2d\u4f24\u7684\u5fae\u535a\u548c\u89c6\u9891\u62a5\u9053\uff0c\u5bf9\u9648\u4e50\u5f3a\u7684\u58f0\u8a89\u548c\u516c\u53f8\u6b63\u5e38\u7ecf\u8425\u9020\u6210\u4e0d\u826f\u5f71\u54cd\uff0c\u5e76\u8981\u6c42\u4e25\u60e9\u9020\u8c23\u8005\u3002\u63a5\u62a5\u540e\uff0c\u6211\u5c40\u5e7f\u76ca\u6d3e\u51fa\u6240\u8fc5\u901f\u5f00\u5c55\u8c03\u67e5\u3002\u636e\u9648\u4e50\u5f3a\u5bb6\u5c5e\u53cd\u6620\uff0c\u6839\u636e\u65b0\u52a0\u5761\u8b66\u65b9\u544a\u77e5\u7684\u521d\u6b65\u8c03\u67e5\u7ed3\u679c\uff0c\uff0c\u6b63\u5f0f\u6b7b\u4ea1\u62a5\u544a\u8981\u7b49\u8b66\u65b9\u7ed3\u6848\u540e\uff0c\u62a5\u7ecf\u6cd5\u9662\u88c1\u51b3\u540e\u624d\u6b63\u5f0f\u901a\u77e5\u4e2d\u56fd\u9a7b\u65b0\u52a0\u5761\u5927\u4f7f\u9986\u3002\u9274\u4e8e\u8fd1\u671f\u7f51\u7edc\u5a92\u4f53\u4f20\u64ad\u9648\u4e50\u5f3a\u6b7b\u56e0\u53ca\u5176\u4ed6\u4fe1\u606f\u7684\u60c5\u51b5\uff0c\u8bf7\u5e7f\u5927\u7f51\u6c11\u4e0d\u8981\u4f20\u64ad\u672a\u7ecf\u6838\u5b9e\u7684\u4fe1\u606f\uff0c\u5bf9\u4e8e\u9020\u8c23\u3001\u4f20\u8c23\u6d89\u5acc\u8fdd\u6cd5\u7684\uff0c\u516c\u5b89\u673a\u5173\u5c06\u4f9d\u6cd5\u8ffd\u7a76\u76f8\u5173\u4eba\u5458\u7684\u6cd5\u5f8b\u8d23\u4efb\u3002\u76ee\u524d\uff0c\u6709\u5173\u60c5\u51b5\u6b63\u5728\u8fdb\u4e00\u6b65\u8c03\u67e5\u4e2d\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180120/D8J1VDAJ0001875P.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D8J1VDAJ0001875P", "date": "20180120", "source": "netease", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8J1VDAJ0001875P.html"}, "contents": {"title": ["\u5927\u96fe\u9ec4\u8272\u9884\u8b66\uff018\u7701\u6709\u6d53\u96fe \u5c40\u5730\u80fd\u89c1\u5ea6\u4e0d\u8db350\u7c73"], "link": "http://news.163.com/18/0120/07/D8J1VDAJ0001875P.html", "passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u5927\u96fe\u9ec4\u8272\u9884\u8b66 \u6c5f\u82cf\u5b89\u5fbd\u6cb3\u5357\u6e56\u5317\u7b49\u5730\u90e8\u5206\u5730\u533a\u6709\u6d53\u96fe\uff09\n

\u4e2d\u56fd\u5929\u6c14\u7f51\u8baf \u4e2d\u592e\u6c14\u8c61\u53f01\u670820\u65e506\u65f6\u7ee7\u7eed\u53d1\u5e03\u5927\u96fe\u9ec4\u8272\u9884\u8b66\uff1a

\u9884\u8ba1\uff0c20\u65e5\u65e9\u6668\u81f3\u4e0a\u5348\uff0c\u5c71\u4e1c\u5317\u90e8\u548c\u5357\u90e8\u3001\u6cb3\u5357\u4e1c\u90e8\u3001\u6e56\u5317\u4e2d\u4e1c\u90e8\u3001\u6c5f\u82cf\u5927\u90e8\u3001\u5b89\u5fbd\u5927\u90e8\u3001\u6d59\u6c5f\u5317\u90e8\u3001\u91cd\u5e86\u4e2d\u90e8\u3001\u8d35\u5dde\u5317\u90e8\u548c\u4e2d\u90e8\u7b49\u5730\u6709\u5927\u96fe\uff0c\u5176\u4e2d\u6c5f\u82cf\u4e2d\u5317\u90e8\u3001\u5b89\u5fbd\u4e2d\u5317\u90e8\u3001\u6cb3\u5357\u4e1c\u90e8\u3001\u6e56\u5317\u4e2d\u90e8\u7b49\u5730\u7684\u90e8\u5206\u5730\u533a\u6709\u80fd\u89c1\u5ea6\u4f4e\u4e8e500\u7c73\u7684\u6d53\u96fe\uff0c\u5c40\u5730\u6709\u80fd\u89c1\u5ea6\u4e0d\u8db350\u7c73\u7684\u7279\u5f3a\u6d53\u96fe\u3002

\"\u5927\u96fe\u9ec4\u8272\u9884\u8b66

\n

\u9632\u5fa1\u6307\u5357\uff1a

1\u3001\u7531\u4e8e\u80fd\u89c1\u5ea6\u8f83\u4f4e\uff0c\u9a7e\u9a76\u4eba\u5458\u5e94\u63a7\u5236\u901f\u5ea6\uff0c\u786e\u4fdd\u5b89\u5168;

2\u3001\u673a\u573a\u3001\u9ad8\u901f\u516c\u8def\u3001\u8f6e\u6e21\u7801\u5934\u91c7\u53d6\u63aa\u65bd\uff0c\u4fdd\u4ea4\u901a\u5b89\u5168\u3002

"}, "cmtId": "D8J1VDAJ0001875P"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8H1O67B0001899N.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D8H1O67B0001899N", "date": "20180119", "source": "netease", "comments": {"link": "http://comment.news.163.com/news2_bbs/D8H1O67B0001899N.html"}, "contents": {"title": ["\u7537\u5b50\u5230\u7a97\u53e3\u5904\u74064\u6761\u7f5a\u5355 \u529e\u4e8b\u5458\u5904\u7406\u4e00\u534a\u8bf4\"\u4e0b\u73ed\u4e86\""], "link": "http://news.163.com/18/0119/13/D8H1O67B0001899N.html", "passage": "

\n\t\n\t

\u3010\u56db\u5f20\u7f5a\u5355\u5904\u7406\u4e24\uff0c\u529e\u4e8b\u5458\uff1a\u201c\u6211\u4e0b\u73ed\u4e86\u201d\u3011\u8fd1\u65e5\uff0c\u8d35\u5dde\u8d35\u9633\u7684\u8bb8\u5e08\u5085\u5230\u8f66\u7ba1\u6240\u529e\u7406\u8fdd\u7ae0\uff0c2\u670d\u52a1\u7a97\u53e3\u53ea\u5f001\u4e2a\u30024\u5c0f\u65f6\u540e\u8f6e\u5230\u4ed6\uff0c4\u6761\u8fdd\u7ae0\u521a\u529e2\u6761\uff0c\u529e\u4e8b\u5458\u8bf4\u201c\u6211\u8981\u4e0b\u73ed\u4e86\u201d\u3002\u5176\u95f4\uff0c\u5173\u95ed\u7684\u53e61\u4e2a\u7a97\u53e3\u5374\u4e3a\u201c\u719f\u4eba\u201d\u529e\u4e1a\u52a1\u3002

\n

"}, "cmtId": "D8H1O67B0001899N"} -------------------------------------------------------------------------------- /机器学习入门/keras/mnist.py: -------------------------------------------------------------------------------- 1 | 2 | from keras.models import Sequential 3 | from keras.layers.core import Dense,Dropout,Activation 4 | from keras.optimizers import SGD 5 | from keras.datasets import mnist 6 | import numpy 7 | import h5py # save model 8 | 9 | ''' 10 | 第一步:选择模型 11 | ''' 12 | model = Sequential() 13 | 14 | ''' 15 | 第二步:构建网络层 16 | ''' 17 | model.add(Dense(500,input_shape=(784,))) # 输入层,28*28=784 (输入维度784,输出500个特征) 18 | model.add(Activation('tanh')) # 激活函数是tanh 19 | model.add(Dropout(0.5)) # 采用50%的dropout 20 | 21 | model.add(Dense(500)) # 隐藏层节点500个 22 | model.add(Activation('tanh')) 23 | model.add(Dropout(0.5)) 24 | 25 | model.add(Dense(10)) # 输出结果是10个类别,所以维度是10 26 | model.add(Activation('softmax')) # 最后一层用softmax作为激活函数 27 | 28 | ''' 29 | 第三步:编译 30 | ''' 31 | sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True) # 优化函数,设定学习率(lr)等参数 32 | model.compile(loss='categorical_crossentropy', optimizer=sgd) #, class_mode='categorical') # 使用交叉熵作为loss函数 33 | 34 | ''' 35 | 第四步:训练 36 | .fit的一些参数 37 | batch_size:对总的样本数进行分组,每组包含的样本数量 38 | epochs :训练次数 39 | shuffle:是否把数据随机打乱之后再进行训练 40 | validation_split:拿出百分之多少用来做交叉验证 41 | verbose:屏显模式 0:不输出 1:输出进度 2:输出每次的训练结果 42 | ''' 43 | (X_train, y_train), (X_test, y_test) = mnist.load_data() # 使用Keras自带的mnist工具读取数据(第一次需要联网) 44 | # 由于mist的输入数据维度是(num, 28, 28),这里需要把后面的维度直接拼起来变成784维 45 | X_train = X_train.reshape(X_train.shape[0], X_train.shape[1] * X_train.shape[2]) 46 | X_test = X_test.reshape(X_test.shape[0], X_test.shape[1] * X_test.shape[2]) 47 | Y_train = (numpy.arange(10) == y_train[:, None]).astype(int) # 把index转换为一个one hot的矩阵 48 | Y_test = (numpy.arange(10) == y_test[:, None]).astype(int) # Y_test.shape 49 | 50 | model.fit(X_train,Y_train,batch_size=200,epochs=1,shuffle=True,verbose=1,validation_split=0.3) # loss 0.54 -> 0.22 51 | model.evaluate(X_test, Y_test, batch_size=200, verbose=1) 52 | 53 | ''' 54 | 第五步:输出 55 | ''' 56 | print("test set") 57 | scores = model.evaluate(X_test,Y_test,batch_size=200,verbose=0) 58 | print("") 59 | print("The test loss is %f" % scores) 60 | result = model.predict(X_test,batch_size=200,verbose=0) 61 | 62 | result_max = numpy.argmax(result, axis = 1) 63 | test_max = numpy.argmax(Y_test, axis = 1) 64 | 65 | result_bool = numpy.equal(result_max, test_max) 66 | true_num = numpy.sum(result_bool) 67 | print("") 68 | print("The accuracy of the model is %f" % (true_num/len(result_bool))) 69 | 70 | 71 | ''' 72 | 第六步:保存模型(可选) 73 | ''' 74 | # model.save('my_model.h5') 75 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20161227/007056.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "007056", "comments": {"link": "http://coral.qq.com/1687570251"}, "date": "20161227", "contents": {"link": "https://news.qq.com/a/20161227/007056.htm", "title": ["\u6e56\u5357\u4e00\u5c0f\u5b66\u804c\u5de5\u7325\u4eb5\u5973\u751f \u5973\u5a7f\u7cfb\u8be5\u6821\u6559\u5bfc\u4e3b\u4efb"], "passage": 
"\u6e56\u5357\u90b5\u9633\u4e00\u5c0f\u5b66\u98df\u5802\u7537\u5b50\u7325\u4eb511\u5c81\u5973\u751f\uff0c\u90b5\u9633\u5e02\u516c\u5b89\u5c40\u53cc\u6e05\u5206\u5c4026\u65e522\u65f6\u8bb8\u53d1\u5e03\u901a\u62a5\u79f0\uff0c12\u670824\u65e5\uff0c\u90b5\u9633\u5e02\u516c\u5b89\u5c40\u53cc\u6e05\u5206\u5c40\u7834\u83b7\u4e00\u8d77\u7325\u4eb5\u513f\u7ae5\u6848\uff0c\u72af\u7f6a\u5acc\u7591\u4eba\u88ab\u4f9d\u6cd5\u91c7\u53d6\u5211\u4e8b\u5f3a\u5236\u63aa\u65bd\u3002\u901a\u62a5\u79f0\uff0c\u72af\u7f6a\u5acc\u7591\u4eba\u5218\u67d0\u4eca\u5e7464\u5c81\uff0c\u5c0f\u5b66\u6587\u5316\uff0c\u65b0\u90b5\u53bf\u576a\u4e0a\u9547\u4eba\uff0c\u79df\u4f4f\u5728\u90b5\u9633\u5e02\u53cc\u6e05\u533a\u67d0\u5c0f\u5b66\u5916\u67d0\u6c11\u623f\u5185\uff0c\u7cfb\u8be5\u5c0f\u5b66\u52e4\u6742\u5de5\u300212\u670824\u65e5\u4e0b\u5348\uff0c\u8be5\u5206\u5c40\u77f3\u6865\u6d3e\u51fa\u6240\u63a5\u5230\u62a5\u8b66\uff0c\u8f96\u533a\u67d0\u5c0f\u5b66\u5185\u6709\u4eba\u6253\u67b6\u3002\u6c11\u8b66\u8fc5\u901f\u8d76\u5230\u73b0\u573a\uff0c\u5c06\u53cc\u65b9\u5e26\u56de\u516c\u5b89\u673a\u5173\u8fdb\u884c\u8c03\u67e5\u3002\u7ecf\u67e5\uff0c\u5f53\u65e5\u5973\u751f\u5bb6\u957f\u5f97\u77e5\u5f53\u4e8b\u5973\u751f\u88ab\u5218\u67d0\u591a\u6b21\u7325\u4eb5\u540e\uff0c\u4fbf\u6765\u5230\u5b66\u6821\u627e\u5176\u7406\u8bba\uff0c\u53cc\u65b9\u53d1\u751f\u4e89\u6267\uff0c\u5218\u67d0\u906d\u5973\u751f\u5bb6\u5c5e\u6bb4\u6253\u3002\u7ecf\u5ba1\u8baf\uff0c\u72af\u7f6a\u5acc\u7591\u4eba\u5218\u67d0\u5bf9\u7325\u4eb5\u8be5\u5973\u751f\u7684\u72af\u7f6a\u4e8b\u5b9e\u4f9b\u8ba4\u4e0d\u8bb3\u3002\u6f8e\u6e43\u65b0\u95fb\u6b64\u524d\u62a5\u9053\uff0c\u6d89\u6848\u7537\u5b50\u5218\u67d0\u7cfb\u90b5\u9633\u5e02\u53cc\u6e05\u533a\u4f58\u6e56\u5c0f\u5b66\u98df\u5802\u5de5\u4f5c\u4eba\u5458\uff0c\u6d89\u5acc\u7325\u4eb5\u8be5\u6821\u4e00\u540d11\u5c81\u7684\u4e94\u5e74\u7ea7\u5973\u751f\u3002\u5218\u67d0\u5728\u5b66\u6821\u5de5\u4f5c\u4e00\u5e74\u591a\u65f6\u95f4\uff0c\u5176\u5973\u5a7f\u662f\u4f58\u6e56\u5c0f\u5b66\u6559\u5bfc\u5904\u4e3b\u4efb\u3002\u4f58\u6e56\u5c0f\u5b66\u6821\u957f\u5f20\u98de\u8dc312\u670826\u65e5\u4e0b\u5348\u544a\u8bc9\u6f8e\u6e43\u65b0\u95fb\uff0c\u901a\u62a5\u79f0\uff0c\u76ee\u524d\uff0c\u6848\u4ef6\u6b63\u5728\u8fdb\u4e00\u6b65\u4fa6\u67e5\u4e2d\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/002903.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "002903", "comments": {"link": "http://coral.qq.com/2369176633"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/002903.htm", "title": ["\u4eca\u5e74\u6625\u8fd0\u56de\u7a0b\u706b\u8f66\u7968\u9996\u6b21\u6253\u6298 \u90e8\u5206\u56de\u7a0b\u7968\u6700\u4f4e8\u6298"], "passage": 
"2018\u5e74\u6625\u8fd0\u81ea2\u67081\u53f7\u5f00\u59cb\uff0c3\u670812\u53f7\u7ed3\u675f\uff0c\u517140\u5929\u3002\u4eca\u5e74\u6625\u8fd0\u671f\u95f4\uff0c\u94c1\u8def\u90e8\u95e8\u9996\u6b21\u5bf9\u90e8\u5206\u589e\u5f00\u7684\u5217\u8f66\u56de\u7a0b\u7968\u4ef7\u8bd5\u70b9\u6298\u6263\uff0c\u5728\u73b0\u884c\u7968\u4ef7\u57fa\u7840\u4e0a\u5b9e\u884c8~9\u6298\u4f18\u60e0\u3002\u7531\u4e8e\u6625\u8fd0\u5177\u6709\u5355\u65b9\u5411\u5ba2\u6d41\u7279\u70b9\uff0c\u90e8\u5206\u5217\u8f66\u53bb\u7a0b\u5ba2\u6d41\u96c6\u4e2d\u4f46\u8fd4\u7a0b\u5ba2\u6d41\u8f83\u5c11\u3002\u6b64\u6b21\u56de\u7a0b\u65b9\u5411\u90e8\u5206\u5217\u8f66\u8bd5\u70b9\u7968\u4ef7\u6253\u6298\uff0c\u4e3b\u8981\u56f4\u7ed5\u4eac\u6d25\u3001\u6caa\u676d\u3001\u5e7f\u6df13\u4e2a\u5730\u533a\u59cb\u53d1\u7ec8\u5230\u7684\u5217\u8f66\uff0c\u8282\u524d\u8282\u540e\u5206\u522b\u9009\u53d6\u4e8632\u8d9f\u5217\u8f66\u5b9e\u884c\u6253\u6298\u4f18\u60e0\u30022\u67081\u65e5\u81f32\u670815\u65e5\uff0c\u8282\u524d\u56de\u7a0b\u65b9\u5411\u6253\u6298\u768432\u8d9f\u5217\u8f66\u5206\u522b\u4e3a\uff1a\u4e1c\u5317\u3001\u6210\u6e1d\u3001\u5408\u961c\u7b49\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u4eac\u6d25\u5730\u533a\u768412\u8d9f\uff1b\u6210\u6e1d\u3001\u6cb3\u5357\u3001\u897f\u5b89\u7b49\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u6caa\u676d\u5730\u533a\u768411\u8d9f\uff1b\u6210\u6e1d\u3001\u6e56\u5357\u7b49\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u5e7f\u6df1\u5730\u533a\u76849\u8d9f\u30022\u670816\u65e5\u81f33\u670812\u65e5\uff0c\u8282\u540e\u56de\u7a0b\u65b9\u5411\u6253\u6298\u768432\u8d9f\u5217\u8f66\u5206\u522b\u4e3a\uff1a\u4eac\u6d25\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u4e1c\u5317\u3001\u6210\u6e1d\u3001\u5408\u961c\u7b49\u65b9\u541111\u8d9f\uff1b\u6caa\u676d\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u6210\u6e1d\u3001\u6cb3\u5357\u3001\u897f\u5b89\u7b49\u65b9\u541110\u8d9f\uff1b\u5e7f\u6df1\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u6210\u6e1d\u3001\u6e56\u5357\u3001\u5357\u660c\u7b49\u65b9\u541111\u8d9f\u3002\u94c1\u8def\u90e8\u95e8\u63d0\u793a\uff0c\u65c5\u5ba2\u670b\u53cb\u53ef\u901a\u8fc7\u4e2d\u56fd\u94c1\u8def\u5ba2\u6237\u670d\u52a1\u4e2d\u5fc312306\u7f51\u7ad9\u67e5\u8be2\u5177\u4f53\u6298\u6263\u8f66\u6b21\u76f8\u5173\u4fe1\u606f\uff0c\u5408\u7406\u5b89\u6392\u51fa\u884c\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180120/D8IUD7L60001899O.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D8IUD7L60001899O", "date": "20180120", "source": "netease", "comments": {"link": "http://comment.news.163.com/news2_bbs/D8IUD7L60001899O.html"}, "contents": {"title": ["\u7f8e\u53f8\u6cd5\u90e8\u5c06\u4ee5\u6b7b\u5211\u8d77\u8bc9\u7ae0\u83b9\u9896\u6848\u5acc\u72af \u5bb6\u5c5e\u8868\u793a\u6b23\u6170"], "link": "http://news.163.com/18/0120/06/D8IUD7L60001899O.html", "passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u7f8e\u56fd\u53f8\u6cd5\u90e8\u90e8\u957f\u51b3\u5b9a\u4ee5\u6b7b\u5211\u8d77\u8bc9\u6d89\u5acc\u7ed1\u67b6\u81f4\u6b7b\u7ae0\u83b9\u9896\u7684\u5acc\u72af\uff09\n

\"\u7f8e\u53f8\u6cd5\u90e8\u5c06\u4ee5\u6b7b\u5211\u8d77\u8bc9\u7ae0\u83b9\u9896\u6848\u5acc\u72af

\u3010\u7f8e\u56fd\u53f8\u6cd5\u90e8\u90e8\u957f\u51b3\u5b9a\u4ee5\u6b7b\u5211\u8d77\u8bc9\u6d89\u5acc\u7ed1\u67b6\u81f4\u6b7b\u7ae0\u83b9\u9896\u7684\u5acc\u72af\u3011\u7f8e\u56fd\u8054\u90a6\u653f\u5e9c\u4e8e\u5f53\u5730\u65f6\u95f4\u5468\u4e94\u4e0b\u5348\u53d1\u8868\u7531\u7f8e\u56fd\u53f8\u6cd5\u90e8\u90e8\u957f\u6770\u592b\u00b7\u585e\u7533\u65af\uff08Jeff Sessions\uff09\u7b7e\u7f72\u7684\u6587\u4ef6\uff0c\u51b3\u5b9a\u5bf9\u6d89\u5acc\u7ed1\u67b6\u81f4\u6b7b\u4e2d\u56fd\u8bbf\u95ee\u5b66\u8005\u7ae0\u83b9\u9896\u7684\u5acc\u72af\u5e03\u5170\u767b\u7279\u514b\u91cc\u65af\u6ed5\u68ee\uff08Brendt Christensen\uff09\u5bfb\u6c42\u6b7b\u5211\u3002\u8fd9\u4efd\u6587\u4ef6\u6307\u51fa\u6839\u636e\u5927\u966a\u5ba1\u56e22017\u5e7410\u67083\u65e5\u5bf9\u514b\u91cc\u65af\u6ed5\u68ee\u63d0\u51fa\u7684\u8ffd\u52a0\u8d77\u8bc9\u4e66\u5185\u5bb9 \u2014 \u5acc\u72af\u6545\u610f\u975e\u6cd5\u631f\u6301\u3001\u7981\u9522\u3001\u8bf1\u9a97\u3001\u7ed1\u67b6\u3001\u52ab\u6301\u7ae0\u83b9\u9896\u5e76\u6700\u7ec8\u5bfc\u81f4\u5176\u6b7b\u4ea1\uff0c \u4ee5\u6b7b\u5211\u8d77\u8bc9\u5acc\u72af\u662f\u5408\u7406\u7684\u3002\u7ae0\u83b9\u9896\u5bb6\u4eba\u7684\u4ee3\u7406\u5f8b\u5e08\u738b\u5fd7\u4e1c\u8868\u793a\uff0c\u5bb6\u4eba\u5bf9\u53f8\u6cd5\u90e8\u957f\u7684\u51b3\u5b9a\u8868\u793a\u6b23\u6170\uff0c\u611f\u8c22\u4ed6\u548c\u5f53\u5730\u68c0\u5bdf\u5b98\u8003\u8651\u5e76\u5c0a\u91cd\u5bb6\u4eba\u7684\u8bf7\u6c42\uff0c\u505a\u51fa\u4e86\u4e0e\u5bb6\u4eba\u610f\u613f\u76f8\u7b26\u7684\u51b3\u5b9a\u3002\u76ee\u524d\uff0c\u539f\u5b9a\u4e8e2\u670827\u65e5\u5f00\u5ba1\u7684\u65f6\u95f4\u4e0d\u53d8\u3002

"}, "cmtId": "D8IUD7L60001899O"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/004328.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "004328", "comments": {"link": "http://coral.qq.com/2369236201"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/004328.htm", "title": ["2018\u6625\u8282\u9ec4\u91d1\u5468\u653e\u5047\u53bb\u54ea\u73a9\uff1f\u4e09\u4e9a\u3001\u54c8\u5c14\u6ee8\u4e3a\u70ed\u95e8\u76ee\u7684\u5730"], "passage": "2018\u5e74\u7684\u6625\u8282\u4e00\u5929\u5929\u4e34\u8fd1\uff0c\u867d\u7136\u8fc7\u5e74\u56de\u5bb6\u662f\u4e2d\u56fd\u4eba\u7684\u4f20\u7edf\uff0c\u800c\u636e\u4e2d\u56fd\u65c5\u6e38\u7814\u7a76\u9662\u8c03\u67e5\u663e\u793a\uff0c\u4eca\u5e74\u6625\u8282\uff0c\u5927\u5bb6\u7684\u51fa\u6e38\u610f\u613f\u4e5f\u5f88\u5f3a\u70c8\u3002\u6570\u636e\u663e\u793a\uff0c2018\u5e74\u7b2c\u4e00\u5b63\u5ea6\u5c45\u6c11\u51fa\u6e38\u610f\u613f\u4e3a83%\uff0c\u800c\u9009\u62e9\u5728\u6625\u8282\u671f\u95f4\u51fa\u6e38\u7684\u6e38\u5ba2\u5360\u4e00\u5b63\u5ea6\u6e38\u5ba2\u768448.9%\uff0c\u7814\u5b66\u3001\u6d77\u5c9b\u6e38\u3001\u6e38\u8f6e\u6e38\u3001\u51b0\u96ea\u6e38\u3001\u4eb2\u5b50\u5bb6\u5ead\u6e38\u3001\u4e3b\u9898\u6e38\u5e02\u573a\u70ed\u5ea6\u8f83\u9ad8\u3002\u60a8\u4eca\u5e74\u6709\u4ec0\u4e48\u51fa\u6e38\u8ba1\u5212\u5417\uff1f\u6211\u53ef\u80fd\u4f1a\u53bb\u897f\u5b89\u90a3\u8fb9\uff0c\u56e0\u4e3a\u90a3\u8fb9\u53ef\u80fd\u5e74\u5473\u4f1a\u6bd4\u8f83\u91cd\u3002\u6211\u4e00\u822c\u60f3\u53bb\u4e09\u4e9a\uff0c\u56e0\u4e3a\u5317\u65b9\u7279\u522b\u51b7\uff0c\u5357\u65b9\u6bd4\u8f83\u70ed\uff0c\u6bd4\u8f83\u8212\u670d\u4e00\u70b9\u3002\u6625\u8282\u671f\u95f4\uff0c\u9009\u62e9\u56fd\u5185\u8de8\u7701\u5e02\u65c5\u6e38\u7684\u6bd4\u4f8b\u4e3a65.9%\uff0c\u56fd\u5185\u70ed\u95e8\u57ce\u5e02\u5305\u62ec\u4e09\u4e9a\u3001\u54c8\u5c14\u6ee8\u3001\u676d\u5dde\u3001\u53a6\u95e8\u7b49\uff0c\u9009\u62e9\u8fd1\u90ca\u65c5\u6e38\u7684\u6bd4\u4f8b\u4e3a34.5%\u3002\u8c03\u67e5\u663e\u793a\uff0c\u5ea6\u5047\u4f11\u95f2\u3001\u89c2\u5149\u65c5\u6e38\u548c\u63a2\u9669\u662f\u5c45\u6c11\u6625\u8282\u51fa\u6e38\u7684\u4e3b\u8981\u52a8\u673a\u3002\u517b\u751f\u548c\u8fd0\u52a8\u4e3a\u4e3b\u7684\u5065\u5eb7\u6e38\u5c06\u6210\u4e3a\u4eca\u5e74\u7684\u65b0\u5ba0\uff0c\u65c5\u6e38\u53d1\u5c55\u6b63\u5728\u5411\u4e2d\u9ad8\u7ea7\u6f14\u5316\u3002\u5728\u5168\u57df\u65c5\u6e38\u65f6\u4ee3\uff0c\u90a3\u5b9e\u9645\u4e0a\u5e7f\u5927\u7684\u6e38\u5ba2\uff0c\u8d8a\u6765\u8d8a\u591a\u7684\u6e17\u900f\u5230\u65c5\u6e38\u76ee\u7684\u5730\u7684\u751f\u6d3b\u65b9\u5f0f\u548c\u4f11\u95f2\u7a7a\u95f4\u91cc\u53bb\u4e86\u3002\u5927\u5bb6\u8d8a\u6765\u8d8a\u5f3a\u8c03\u65c5\u6e38\u7684\u54c1\u8d28\u4e86\uff0c\u4e8b\u5b9e\u4e0a\u6211\u4eec2018\u5e74\u56fd\u5bb6\u65c5\u6e38\u5de5\u4f5c\u7684\u4e3b\u9898\u5c31\u662f\u4f18\u8d28\u65c5\u6e38\u5e74\u3002"}} -------------------------------------------------------------------------------- /机器学习入门/监督/分类/人体运动状态信息评级.py: -------------------------------------------------------------------------------- 1 | # SVM 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sklearn.preprocessing import Imputer #预处理模块 7 | from sklearn.model_selection import train_test_split #生成数据模块 8 | from sklearn.metrics import classification_report #评估模块 9 | # 导入分类器模块 10 | from sklearn.neighbors import KNeighborsClassifier 11 | from sklearn.tree import DecisionTreeClassifier 12 | from sklearn.naive_bayes import 
GaussianNB 13 | 14 | # 数据处理,传入特征列表,和标签列表 15 | def load_datasets(feature_paths, label_paths): 16 | feature = np.ndarray(shape=(0,41)) # 列41,特征维度41 (想象成一个41维的列向量) 17 | label = np.ndarray(shape=(0,1)) # 列1,标签维度1 18 | for file in feature_paths: 19 | file = '~/Downloads/mooc课程数据/课程数据/分类/dataset/'+file 20 | df = pd.read_table(file, delimiter=',', na_values='?', header=None) 21 | imp = Imputer(missing_values='NaN', strategy='mean', axis=0) 22 | imp.fit(df) 23 | df = imp.transform(df) 24 | feature = np.concatenate((feature, df)) 25 | 26 | for file in label_paths: 27 | file = '~/Downloads/mooc课程数据/课程数据/分类/dataset/' + file 28 | df = pd.read_table(file, header=None) 29 | label = np.concatenate((label, df)) 30 | 31 | label = np.ravel(label) 32 | return feature, label 33 | 34 | 35 | if __name__ == '__main__': 36 | ''' 数据路径 ''' 37 | featurePaths = ['A/A.feature', 'B/B.feature', 'C/C.feature', 'D/D.feature', 'E/E.feature'] 38 | labelPaths = ['A/A.label', 'B/B.label', 'C/C.label', 'D/D.label', 'E/E.label'] 39 | ''' 读入数据 ''' 40 | x_train, y_train = load_datasets(featurePaths[:4], labelPaths[:4]) 41 | x_test, y_test = load_datasets(featurePaths[4:], labelPaths[4:]) 42 | x_train, x_, y_train, y_ = train_test_split(x_train, y_train, test_size=0.0) 43 | 44 | print('Start training knn') 45 | knn = KNeighborsClassifier().fit(x_train, y_train) 46 | print('Training done') 47 | answer_knn = knn.predict(x_test) 48 | print('Prediction done') 49 | 50 | print('Start training DT') 51 | dt = DecisionTreeClassifier().fit(x_train, y_train) 52 | print('Training done') 53 | answer_dt = dt.predict(x_test) 54 | print('Prediction done') 55 | 56 | print('Start training Bayes') 57 | gnb = GaussianNB().fit(x_train, y_train) 58 | print('Training done') 59 | answer_gnb = gnb.predict(x_test) 60 | print('Prediction done') 61 | 62 | print('\n\nThe classification report for knn:') 63 | print(classification_report(y_test, answer_knn)) 64 | print('\n\nThe classification report for DT:') 65 | print(classification_report(y_test, answer_dt)) 66 | print('\n\nThe classification report for Bayes:') 67 | print(classification_report(y_test, answer_gnb)) -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20161227/012170.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "012170", "comments": {"link": "http://coral.qq.com/1687671711"}, "date": "20161227", "contents": {"link": "https://news.qq.com/a/20161227/012170.htm", "title": ["\u7f8e\u56fd\u7537\u5b50\u6b32\u9886\u517b\u732b\u54aa \u610f\u5916\u4e0e\u8d70\u5931\u6570\u6708\u7231\u732b\u91cd\u9022"], "passage": "\u8d44\u6599\u56fe\uff1a\u732b\u54aa\u3002\u4e2d\u65b0\u7f5112\u670827\u65e5\u7535 \u636e\u5916\u5a92\u62a5\u9053\uff0c\u7f8e\u56fd\u4f5b\u7f57\u91cc\u8fbe\u5dde\u4e00\u540d\u7537\u5b50\u7684\u7231\u732b\u8d70\u5931\u6570\u6708\uff0c\u65e5\u524d\u8fd9\u540d\u7537\u5b50\u5728\u53cb\u4eba\u966a\u540c\u4e0b\u5230\u6d41\u6d6a\u52a8\u7269\u6536\u5bb9\u4e2d\u5fc3\uff0c\u6253\u7b97\u9886\u517b\u732b\u54aa\uff0c\u7ed3\u679c\u7adf\u7136\u610f\u5916\u4e0e\u7231\u732b\u56e2\u5706\u3002\u6770\u514b\u68ee\u7ef4\u5c14\u7684\u6d41\u6d6a\u52a8\u7269\u6536\u5bb9\u4e2d\u5fc3\u65e5\u524d\u5728\u793e\u4ea4\u7f51\u7ad9\u4e0a\u5206\u4eab\u540d\u53eb\u201c\u90a6\u90a6\u201d(Bon 
Bon)\u7684\u732b\u54aa\u4e0e\u4e3b\u4eba\u4e45\u522b\u91cd\u9022\u7684\u6545\u4e8b\uff0c\u83b7\u5f97\u7f51\u53cb\u70ed\u70c8\u56de\u54cd\u3002\u6536\u5bb9\u4e2d\u5fc3\u7684\u52a8\u7269\u534f\u4f1a\u8868\u793a\uff1a\u201c\u90a6\u90a6\u4ece\u4eca\u5e7410\u6708\u521d\u5c31\u5230\u6211\u4eec\u8fd9\u8fb9\u4e86\uff0c\u6211\u4eec\u4e0d\u77e5\u9053\u4e3a\u4ec0\u4e48\uff0c\u5bf9\u5b83\u6765\u8bf4\u4e00\u76f4\u5f88\u96be\u627e\u5230\u9886\u517b\u5bb6\u5ead\u3002\u539f\u6765\uff0c\u8fd9\u5f53\u4e2d\u6709\u4e2a\u975e\u5e38\u7279\u6b8a\u7684\u7406\u7531\u3002\u201d\u4e00\u540d\u5e74\u8f7b\u7537\u5b5021\u65e5\u5728\u53cb\u4eba\u966a\u540c\u4e0b\u6765\u5230\u8be5\u534f\u4f1a\uff0c\u6253\u7b97\u9886\u517b\u4e00\u53ea\u732b\u54aa\uff0c\u56e0\u4e3a\u4ed6\u5fc3\u7231\u7684\u732b\u54aa\u51e0\u4e2a\u6708\u4e4b\u524d\u8d70\u4e22\u4e86\uff0c\u4ed6\u60f3\u8981\u518d\u627e\u4e00\u53ea\u732b\u54aa\u6765\u966a\u4f34\u3002\u7ed3\u679c\uff0c\u8fd9\u540d\u7537\u5b50\u5728\u6536\u5bb9\u4e2d\u5fc3\u7684\u6240\u6709\u732b\u54aa\u5f53\u4e2d\uff0c\u53d1\u73b0\u4e86\u4e00\u53ea\u5bb3\u7f9e\u7684\u6df1\u8272\u5c0f\u732b\uff0c\u770b\u8d77\u6765\u8ddf\u8d70\u4e22\u597d\u51e0\u4e2a\u6708\u7684\u7231\u732b\u957f\u5f97\u5f88\u50cf\uff0c\u7ed3\u679c\u67e5\u8bc1\u4e4b\u4e0b\uff0c\u53d1\u73b0\u88ab\u5de5\u4f5c\u4eba\u5458\u53d6\u540d\u4e3a\u201c\u90a6\u90a6\u201d\u7684\u8fd9\u53ea\u6bcd\u732b\uff0c\u539f\u6765\u5c31\u81ea\u5df1\u517b\u7684\u201c\u5bc6\u65af\u8482\u201d\u3002\u5de5\u4f5c\u4eba\u5458\u8868\u793a\uff0c\u201c\u90a6\u90a6\u201d\u8d70\u5931\u4e0d\u4e45\uff0c\u5c31\u88ab\u70ed\u5fc3\u6c11\u4f17\u6361\u5230\uff0c\u9001\u6765\u6536\u5bb9\u4e2d\u5fc3\uff0c\u201c\u73b0\u5728\u5b83\u7ec8\u4e8e\u53ef\u4ee5\u56de\u5bb6\uff0c\u56e2\u5706\u8fc7\u8282\uff0c\u56de\u5230\u771f\u6b63\u5c5e\u4e8e\u5b83\u7684\u5bb6\u3002\u201d"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HBI8IF0001875P.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HBI8IF0001875P", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8HBI8IF0001875P.html"}, "newsId": "D8HBI8IF0001875P", "contents": {"passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u4ea4\u901a\u8fd0\u8f93\u90e8\uff1a\u201c\u6851\u5409\u201d\u8f6e\u78b0\u649e\u71c3\u7206\u4e8b\u6545\u6551\u63f4\u5de5\u4f5c\u96be\u5ea6\u9ad8 \u6ca1\u6709\u5148\u4f8b\u53ef\u5faa\uff09\n

\"\u4ea4\u901a\u90e8:\u6851\u5409\u6cb9\u8f6e\u4e8b\u6545\u6551\u63f4\u96be\u5ea6\u9ad8

\n

\u4e2d\u9752\u5728\u7ebf\u5317\u4eac1\u670819\u65e5\u7535 \u4eca\u5929\u4e0b\u5348\uff0c\u4ea4\u901a\u8fd0\u8f93\u90e8\u53ec\u5f00\u201c\u6851\u5409\u201d\u8f6e\u78b0\u649e\u71c3\u7206\u4e8b\u6545\u65b0\u95fb\u53d1\u5e03\u4f1a\u3002

\u4e2d\u56fd\u6d77\u4e0a\u641c\u6551\u4e2d\u5fc3\u526f\u4e3b\u4efb\u3001\u4ea4\u901a\u8fd0\u8f93\u90e8\u5e94\u6025\u529e\u4e3b\u4efb\u667a\u5e7f\u8def\u8868\u793a\uff0c\u8fd9\u6b21\u5e94\u6025\u6551\u63f4\u5de5\u4f5c\u96be\u5ea6\u5f88\u9ad8\uff0c\u4e16\u754c\u822a\u8fd0\u53f2\u4e0a\u5c1a\u65e0\u6cb9\u8239\u8f7d\u8fd0\u201c\u51dd\u6790\u6cb9\u201d\u88ab\u649e\u5931\u706b\u7684\u4e8b\u6545\u53d1\u751f\uff0c\u201c\u5e94\u6025\u5904\u7f6e\u65e0\u5148\u4f8b\u53ef\u5faa\u3002\u201d

2018\u5e741\u67086\u65e5\u665a\uff0c\u5df4\u62ff\u9a6c\u7c4d\u6cb9\u8239\u201c\u6851\u5409\u201d\u8f6e\u4e0e\u4e2d\u56fd\u9999\u6e2f\u7c4d\u6563\u8d27\u8239\u201c\u957f\u5cf0\u6c34\u6676\u201d\u8f6e\u5728\u957f\u6c5f\u53e3\u4ee5\u4e1c\u7ea6160\u6d77\u91cc\u5904\u53d1\u751f\u78b0\u649e\u3002\u4e8b\u6545\u5bfc\u81f4\u201c\u6851\u5409\u201d\u8f6e\u8d27\u8239\u8d77\u706b\uff0c32\u540d\u8239\u5458\u5931\u8e2a\uff0c\u201c\u957f\u5cf0\u6c34\u6676\u201d\u8f6e\u53d7\u635f\u8d77\u706b\uff0c21\u540d\u8239\u5458\u5f03\u8239\u9003\u751f\u540e\u88ab\u9644\u8fd1\u6e14\u8239\u6551\u8d77\u3002

", "link": "http://news.163.com/18/0119/16/D8HBI8IF0001875P.html", "title": ["\u4ea4\u901a\u90e8:\"\u6851\u5409\"\u6cb9\u8f6e\u4e8b\u6545\u6551\u63f4\u96be\u5ea6\u9ad8 \u6ca1\u6709\u5148\u4f8b\u53ef\u5faa"]}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HJ2GAK000187VE.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HJ2GAK000187VE", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news2_bbs/D8HJ2GAK000187VE.html"}, "newsId": "D8HJ2GAK000187VE", "contents": {"passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u53a6\u822a\u5c31\u53f0\u6e7e\u9650\u5236\u4e24\u5cb8\u6625\u8282\u52a0\u73ed\u822a\u73ed\u53d1\u58f0\uff1a\u4e25\u91cd\u5f71\u54cd\u540c\u80de\u8fd4\u4e61\u8fc7\u5e74\uff09\n

@\u53a6\u95e8\u822a\u7a7a\u5b98\u65b9\u5fae\u535a1\u670819\u65e5\u6d88\u606f\uff0c\u6625\u8282\u662f\u4e2d\u534e\u6c11\u65cf\u6700\u91cd\u8981\u7684\u4f20\u7edf\u8282\u65e5\u3002\u4e3a\u4e86\u6ee1\u8db32018\u5e74\u6625\u8282\u671f\u95f4\u4f17\u591a\u53f0\u6e7e\u540c\u80de\u8fd4\u4e61\u8fc7\u5e74\u7684\u9700\u6c42\uff0c\u53a6\u95e8\u822a\u7a7a\u6309\u7167\u60ef\u4f8b\uff0c\u7279\u522b\u8c03\u6574\u8fd0\u529b\uff0c\u7533\u8bf7\u589e\u52a070\u73ed\u4e24\u5cb8\u6625\u8282\u52a0\u73ed\u673a\uff0c\u4e3b\u8981\u5305\u62ec\u4ece\u798f\u5dde\u3001\u53a6\u95e8\u3001\u676d\u5dde\u5f80\u8fd4\u53f0\u6e7e\u7684\u822a\u73ed\uff0c\u8ba9\u53f0\u6e7e\u540c\u80de\u53ef\u4ee5\u901a\u8fc7\u6700\u4fbf\u6377\u7684\u65b9\u5f0f\u5f80\u8fd4\u4e24\u5cb8\uff0c\u6b22\u5ea6\u65b0\u6625\u4f73\u8282\u3002\u76ee\u524d\u5df2\u6709\u8d85\u8fc71\u4e07\u540d\u65c5\u5ba2\u9884\u8ba2\u76f8\u5173\u822a\u73ed\u673a\u7968\uff0c\u9884\u8ba1\u6625\u8282\u671f\u95f4\u5c06\u6709\u8d85\u8fc72\u4e07\u540d\u65c5\u5ba2\u4e58\u5750\u53a6\u822a\u4e24\u5cb8\u52a0\u73ed\u822a\u73ed\u3002

\u76ee\u524d\uff0c\u53d7\u53f0\u6e7e\u65b9\u9762\u5e72\u9884\uff0c\u53a6\u822a\u6839\u636e\u4e24\u5cb8\u5e02\u573a\u9700\u6c42\u7533\u8bf7\u768470\u73ed\u4e24\u5cb8\u6625\u8282\u52a0\u73ed\u673a\u53ef\u80fd\u65e0\u6cd5\u6267\u884c\uff0c\u8fd9\u5c06\u4e25\u91cd\u5f71\u54cd\u53f0\u6e7e\u540c\u80de\u8fd4\u4e61\u8fc7\u5e74\u4e0e\u4eb2\u4eba\u56e2\u805a\u7684\u884c\u7a0b\u5b89\u6392\u3002\u6b64\u4e3e\u7ed9\u822a\u4f01\u9020\u6210\u7684\u7ecf\u6d4e\u635f\u5931\u4e8b\u5c0f\uff0c\u7ed9\u4e24\u5cb8\u6c11\u4f17\u5f80\u6765\u5e26\u6765\u7684\u6781\u5927\u4e0d\u4fbf\u4e8b\u5927\u3002

\u53a6\u822a\u81ea\u6210\u7acb\u4ee5\u6765\u4fbf\u4ee5\u201c\u670d\u52a1\u4e24\u5cb8\u201d\u4e3a\u4f7f\u547d\uff0c\u6210\u4e3a\u4e24\u5cb8\u76f4\u822a\u7684\u53c2\u4e0e\u8005\u3001\u89c1\u8bc1\u8005\u548c\u63a8\u8fdb\u8005\uff0c\u5728\u6d77\u5ce1\u4e24\u5cb8\u4e4b\u95f4\u67b6\u8d77\u4e86\u4fbf\u6377\u7684\u7a7a\u4e2d\u6865\u6881\u3002\u5728\u6b64\u5f3a\u70c8\u547c\u5401\u53f0\u6e7e\u6709\u5173\u90e8\u95e8\u80fd\u591f\u987a\u5e94\u6c11\u610f\uff0c\u6ee1\u8db3\u6c11\u4f17\u8feb\u5207\u9700\u6c42\uff0c\u4e3a\u4f17\u591a\u53f0\u6e7e\u540c\u80de\u8fd4\u4e61\u8fc7\u5e74\u63d0\u4f9b\u4fbf\u5229\u3002

\n

", "link": "http://news.163.com/18/0119/18/D8HJ2GAK000187VE.html", "title": ["\u53a6\u822a\u56de\u5e94\u53f0\u6e7e\u9650\u5236\u4e24\u5cb8\u6625\u8282\u52a0\u73ed\u822a\u73ed:\u4e25\u91cd\u5f71\u54cd\u8fd4\u4e61"]}} -------------------------------------------------------------------------------- /泰迪杯尝试/爬取相似URL/从主页获得相似URL初步可执行代码.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 22:50 4 | # @Author : mazicwong 5 | # @File : 1.爬取相似url(最终).py 6 | 7 | import urllib.request 8 | import re 9 | import os 10 | from bs4 import BeautifulSoup 11 | 12 | 13 | # 获得主页html 14 | def get_root_html(url): 15 | # 在主页下面get新的html 16 | headers = { 17 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 18 | } 19 | req = urllib.request.Request(url=url, headers=headers) 20 | response = urllib.request.urlopen(req, timeout=2) 21 | html = response.read() 22 | return html 23 | 24 | 25 | def get_re(url): 26 | url = url[7:] # 去除http:// 27 | Len = len(url) 28 | p = "http://" 29 | i = 0 30 | while i < Len: 31 | if url[i] == '.': 32 | p += '.' 33 | elif 'a' <= url[i] <= 'z': # 不能直接判isplpha,因为str[i]中全都是字符 34 | p += '[a-z]' 35 | elif '0' <= url[i] <= 'z': 36 | p += '\d' 37 | else: 38 | p += url[i] 39 | i += 1 40 | return p 41 | 42 | 43 | # 获取该url数据,分为获取本身和相似url 44 | def main(): 45 | with open(r"E:\泰迪杯\C题样例数据\All_html 相似url\bbs_urls.txt", "r") as file: 46 | urlList = file.readlines() 47 | cnt = 1 48 | # path = r'E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果' #用来判断文件是否已经存在 49 | for url in urlList: 50 | # if os.path.isfile('out%s.txt'%cnt): #存在且不为空就退出 51 | # if os. 52 | # cnt +=1 53 | # continue 54 | 55 | # 以下:get主页url http://www.baidu.com/abc/cc ==>> www.baidu.com 56 | m = url.split('//') 57 | if len(m) == 2: 58 | root_url = m[1] 59 | else: 60 | root_url = m[0] 61 | tt = root_url.split('/') 62 | root_url = tt[0] 63 | root_url = r'http://' + root_url 64 | # getHtml(url, cnt) 65 | # print(root_url) 66 | root_html = get_root_html(root_url) # 获得主页html 67 | p1 = get_re(url) # 获取正则表达式 68 | # print(p1) 69 | # print(type(p1)) 70 | p1 = p1.encode(encoding='utf-8') # it can help transfer the "string" to "bytes" 71 | p1 = p1[:-1] #去掉换行符 72 | # print(p1) 73 | # print(type(p1)) 74 | pat = re.compile(p1) # 编译正则表达式 75 | List = re.findall(pat, root_html) 76 | print(len(List)) 77 | # for i in List: 78 | # print(i) 79 | path = r"E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果\out%s.txt" % cnt 80 | with open(path, "w") as f: 81 | for i in List: 82 | i = i.decode() 83 | i = str(i) 84 | f.write(i) 85 | f.write('\n') 86 | cnt += 1 87 | 88 | 89 | if __name__ == "__main__": 90 | main() 91 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/003365.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "003365", "comments": {"link": "http://coral.qq.com/2369196525"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/003365.htm", "title": ["\u6e56\u5357\u4e00\u5973\u533b\u751f\u8fb9\u6253\u70b9\u6ef4\u8fb9\u770b\u75c5\u83b7\u8d5e\u201c\u6700\u7f8e\u201d\uff1a\u5c0f\u75c5\u90fd\u575a\u6301"], "passage": 
"\u6e56\u5357\u5b81\u4e61\u5e02\u4eba\u6c11\u533b\u9662\u6d41\u6c99\u5206\u9662\u4e00\u5973\u533b\u751f1\u670818\u65e5\u8fb9\u6253\u70b9\u6ef4\u8fb9\u770b\u75c5\uff0c\u83b7\u8d5e\u201c\u5b81\u4e61\u6700\u7f8e\u533b\u751f\u201d\u3002\u5f53\u4e8b\u533b\u751f\u7a0b\u52291\u670819\u65e5\u544a\u8bc9\u6f8e\u6e43\u65b0\u95fb\uff08www.thepaper.cn\uff09\uff0c\u201c\u50cf\u8fd9\u79cd\u5c0f\u75c5\uff0c\u53ea\u8981\u4e0d\u5f71\u54cd\u6211\u4eec\u5de5\u4f5c\uff0c\u6211\u4eec\u57fa\u672c\u90fd\u4f1a\u575a\u6301\u4e0a\u73ed\u3002\u201d\u636e\u4e86\u89e3\uff0c\u7a0b\u5229\u662f\u5b81\u4e61\u5e02\u4eba\u6c11\u533b\u9662\u6d41\u6c99\u5206\u9662\u5987\u4ea7\u79d1\u4e3b\u4efb\uff0c\u5728\u8fd9\u91cc\u5df2\u7ecf\u5de5\u4f5c\u4e86\u4e03\u5e74\u30021\u670818\u65e5\uff0c\u56e0\u8eab\u4f53\u6709\u70b9\u4e0d\u8212\u9002\uff0c\u4f46\u53c8\u8f6e\u5230\u503c\u73ed\uff0c\u4e8e\u662f\u5979\u8fb9\u6253\u70b9\u6ef4\u8fb9\u770b\u75c5\u3002\u636e\u6e56\u5357\u7ecf\u89c6\u6b64\u524d\u62a5\u9053\uff0c\u7a0b\u5229\u5de6\u624b\u6253\u7740\u70b9\u6ef4\uff0c\u53f3\u624b\u62ff\u7740\u7b14\u5199\u5b57\uff0c\u5750\u5728\u529e\u516c\u684c\u524d\u7ed9\u75c5\u4eba\u770b\u75c5\u3002\u6b64\u5916\uff0c\u7a0b\u5229\u4f1a\u7528\u53f3\u624b\u4e3e\u7740\u70b9\u6ef4\u74f6\uff0c\u7136\u540e\u5230\u75c5\u623f\u53bb\u67e5\u623f\uff0c\u8be2\u95ee\u60a3\u8005\u60c5\u51b5\u3002\u4e00\u4f4d\u60a3\u8005\u8bf4\uff1a\u201c\u5979\u4e00\u76f4\u575a\u6301\u5728\u8fd9\u8fb9\uff0c\u4e3a\u6211\u4eec\u75c5\u4eba\u7740\u60f3\uff0c\u6211\u89c9\u5f97\u5979\u662f\u5b81\u4e61\u6700\u7f8e\u7684\u533b\u751f\u3002\u201d\u201c\u56e0\u4e3a\u8eab\u4f53\u6709\u70b9\u4e0d\u8212\u9002\uff0c\u6211\u5df2\u7ecf\u6253\u4e86\u56db\u5929\u7684\u70b9\u6ef4\u3002\u521a\u597d\u8fd9\u51e0\u5929\uff0c\u6211\u4eec\u79d1\u5ba4\u6bd4\u8f83\u5fd9\uff0c\u6709\u4e00\u4f4d\u4ea7\u540e\u5927\u51fa\u8840\u7684\u90fd\u5728\u6211\u4eec\u8fd9\u91cc\u62a2\u6551\uff0c\u7d2f\u8fd8\u662f\u6bd4\u8f83\u7d2f\u3002\u6211\u4eec\u5728\u57fa\u5c42\u4e0a\u73ed\uff0c\u4eba\u5458\u90fd\u6bd4\u8f83\u7d27\u5f20\uff0c\u5206\u5de5\u4e5f\u4e0d\u90a3\u4e48\u7ec6\u5316\uff0c\u8981\u505a\u7684\u4e8b\u60c5\u5f88\u591a\uff0c\u50cf\u6211\u4eec\u8fd9\u79cd\u5c0f\u75c5\uff0c\u53ea\u8981\u4e0d\u5f71\u54cd\u5de5\u4f5c\uff0c\u6211\u4eec\u8fd8\u662f\u4f1a\u575a\u6301\u4e0a\u73ed\u7684\u3002\u201d\u7a0b\u5229\u8bf4\u3002\u5bf9\u4e8e\u83b7\u8d5e\u201c\u6700\u7f8e\u533b\u751f\u201d\u79f0\u53f7\uff0c\u7a0b\u5229\u8868\u793a\uff1a\u201c\u6700\u7f8e\u533b\u751f\u771f\u7684\u4e0d\u6562\u5f53\uff0c\u6bcf\u4e00\u4e2a\u804c\u4e1a\u90fd\u6709\u804c\u4e1a\u7684\u672c\u80fd\uff0c\u6211\u4eec\u4e34\u5e8a\u6709\u597d\u591a\u8fd9\u6837\u7684\u533b\u751f\uff0c\u575a\u6301\u4ee5\u75c5\u4eba\u4e3a\u672c\uff0c\u5162\u5162\u4e1a\u4e1a\uff0c\u606a\u5b88\u5c97\u4f4d\u3002\u201d"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/010551.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "010551", "comments": {"link": "http://coral.qq.com/2369832377"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/010551.htm", "title": ["\u5e7f\u897f\u5317\u6d77\u8054\u5408\u884c\u52a8\u961f\u6293\u83b747\u540dA\u7ea7\u4f20\u9500\u5934\u76ee \u51bb\u7ed3\u8d854\u5343\u4e07\u5143"], "passage": 
"\u6628\u5929\uff0819\u65e5\uff09\u51cc\u66683\u70b9\uff0c\u5728\u5e7f\u897f\u5317\u6d77\u5e02\uff0c\u7531\u516c\u5b89\u3001\u5de5\u5546\u3001\u57ce\u7ba1\u7b49\u90e8\u95e8680\u4f59\u540d\u6267\u6cd5\u4eba\u5458\u7ec4\u6210\u8054\u5408\u884c\u52a8\u961f\uff0c\u91cd\u70b9\u6e05\u67e5\u6d89\u5acc\u7ec4\u7ec7\u4f20\u9500\u6d3b\u52a8\u7684\u5934\u76ee\u548c\u4f20\u9500\u9aa8\u5e72\u5206\u5b50\u3001\u53c2\u52a0\u201c\u8d44\u672c\u8fd0\u4f5c\u201d\u3001\u201c\u4e00\u65e5\u6e38\u201d\u7b49\u4f20\u9500\u6d3b\u52a8\u7684\u6d89\u4f20\u4eba\u5458\uff0c\u6b64\u6b21\u4e13\u9879\u884c\u52a8\u5171\u6293\u83b7A\u7ea7\u53ca\u4ee5\u4e0a\u4f20\u9500\u5934\u76ee47\u540d\uff0c\u51bb\u7ed3\u8d44\u91d1\u7ea64200\u4e07\u5143\u3002\u5f53\u5929\u51cc\u6668\uff0c\u6267\u6cd5\u4eba\u5458\u8fdb\u5165\u5317\u6d77\u5e02\u590f\u65e5\u6d77\u6e7e\u5c0f\u533a\uff0c\u5bf9\u524d\u671f\u6478\u6392\u51fa\u7684\u6d89\u5acc\u4ece\u4e8b\u4f20\u9500\u6d3b\u52a8\u7684100\u591a\u4e2a\u623f\u95f4\u8fdb\u884c\u6e05\u67e5\u6574\u6cbb\u884c\u52a8\u3002\u6b64\u5916\uff0c\u6267\u6cd5\u4eba\u5458\u8fd8\u5206\u522b\u5bf9\u5317\u6d77\u5e02\u533a\u7684\u5317\u6d77\u5723\u7687\u5e7f\u573a\u3001\u6850\u6d0b\u65b0\u57ce\u4e24\u4e2a\u5c0f\u533a\u8fdb\u884c\u6e05\u67e5\u884c\u52a8\u3002\u5171\u6e05\u67e5\u51fa\u79df\u5c4b80\u591a\u95f4\uff0c\u67e5\u83b7\u6d89\u5acc\u4f20\u9500\u4eba\u5458100\u591a\u540d\uff0c\u4ee5\u53ca\u4e00\u6279\u6d89\u5acc\u4f20\u9500\u8fdd\u6cd5\u884c\u4e3a\u7684\u4e66\u7c4d\u548c\u7269\u54c1\u3002\u57fa\u672c\u6bcf\u4e00\u4e2a\u623f\u95f4\u6211\u4eec\u90fd\u6e05\u67e5\u51fa\u6d89\u4f20\u4eba\u5458\uff0c\u4e24\u4e2a\u5c0f\u533a\u4e00\u5171\u6e05\u67e5\u4e86100\u591a\u4e2a\u6d89\u4f20\u4eba\u5458\uff0c\u4e0b\u4e00\u6b65\u6211\u4eec\u6839\u636e\u72af\u7f6a\u7684\uff0c\u6d89\u53ca\u7ec4\u7ec7\u9886\u5bfc\u4f20\u9500\u7f6a\u7684 \u4f9d\u6cd5\u6253\u51fb\uff0c\u6e05\u67e5\u6ca1\u6709\u6784\u6210\u72af\u7f6a\u7684\u6211\u4eec\u7ecf\u8fc7\u6559\u80b2\u3001\u8bad\u8beb\u7136\u540e\u505a\u5176\u4ed6\u76f8\u5e94\u7684\u5904\u7406\u3002\u636e\u4e86\u89e3\uff0c\u5728\u8fd9\u6b21\u4e13\u9879\u884c\u52a8\u4e2d\uff0c\u6267\u6cd5\u4eba\u5458\u9664\u4e86\u5bf9\u4f20\u9500\u4eba\u5458\u805a\u96c6\u8f83\u591a\u7684\u5c0f\u533a\u8fdb\u884c\u5730\u6bef\u5f0f\u6e05\u67e5\u5916\uff0c\u8fd8\u7ec4\u7ec7\u8b66\u529b\u5206\u522b\u5728\u5185\u8499\u53e4\u3001\u4e91\u5357\u3001\u5e7f\u897f\u540c\u65f6\u8fdb\u884c\u6293\u6355\u884c\u52a8\u3002\u622a\u81f3\u6628\u5929\uff0819\u65e5\uff09\u4e0a\u5348\uff0c\u5df2\u6293\u83b7A\u7ea7\u4ee5\u4e0a\u4f20\u9500\u5934\u76ee47\u540d\uff1b\u5317\u6d77\u5e02\u5171\u6e05\u67e5\u51fa\u79df\u5c4b212\u95f4\uff0c\u67e5\u5c01\u6d89\u4f20\u51fa\u79df\u5c4b148\u6237\uff0c\u67e5\u83b7\u6d89\u4f20\u4eba\u5458459\u540d\u4ee5\u53ca\u5927\u91cf\u624b\u673a\u3001\u7535\u8111\u3001\u4f20\u9500\u4e66\u7c4d\u7b49\u6d89\u6848\u7269\u54c1\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for crawl project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'crawl' 13 | 14 | SPIDER_MODULES = ['crawl.spiders'] 15 | NEWSPIDER_MODULE = 'crawl.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'crawl.middlewares.CrawlSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'crawl.middlewares.CrawlDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'crawl.pipelines.CrawlPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/settings.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tutotial project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'tutotial' 13 | 14 | SPIDER_MODULES = ['tutotial.spiders'] 15 | NEWSPIDER_MODULE = 'tutotial.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'oozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'tutotial.middlewares.TutotialSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'tutotial.middlewares.TutotialDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'tutotial.pipelines.TutotialPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES 
= [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | FEED_URI = u'/home/mazic/pp/jian.csv' 92 | FEED_FORMAT = 'CSV' 93 | -------------------------------------------------------------------------------- /泰迪杯尝试/数据爬取(去标签).py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # @Time : 2017/3/25 15:53
4 | # @Author : mazicwong
5 | # @File : 数据爬取(去标签).py
6 |
7 | # 用正则表达式简单过滤html的标签
8 | import re
9 |
10 |
11 | def filter_tags(htmlstr):
12 |     re_cdata = re.compile('//<!\[CDATA\[[^>]*//\]\]>', re.I) # 匹配CDATA
13 |     re_script = re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I) # Script
14 |     re_style = re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I) # style
15 |     re_br = re.compile('<br\s*?/?>') # 处理换行
16 |     re_h = re.compile('</?\w+[^>]*>') # HTML标签
17 |     re_comment = re.compile('<!--[^>]*-->') # HTML注释
18 |     s = re_cdata.sub('', htmlstr) # 去掉CDATA
19 |     s = re_script.sub('', s) # 去掉SCRIPT
20 |     s = re_style.sub('', s) # 去掉style
21 |     s = re_br.sub('\n', s) # 将br转换为换行
22 |     s = re_h.sub('', s) # 去掉HTML 标签
23 |     s = re_comment.sub('', s) # 去掉HTML注释
24 |     # 去掉多余的空行
25 |     blank_line = re.compile('\n+')
26 |     s = blank_line.sub('\n', s)
27 |     s = replaceCharEntity(s) # 替换实体
28 |     return s
29 |
30 |
31 | ##替换常用HTML字符实体.
32 | # 使用正常的字符替换HTML中特殊的字符实体.
33 | # 你可以添加新的实体字符到CHAR_ENTITIES中,处理更多HTML字符实体.
34 | # @param htmlstr HTML字符串.
35 | def replaceCharEntity(htmlstr):
36 |     CHAR_ENTITIES = {'nbsp': ' ', '160': ' ',
37 |                      'lt': '<', '60': '<',
38 |                      'gt': '>', '62': '>',
39 |                      'amp': '&', '38': '&',
40 |                      'quot': '"''"', '34': '"', }
41 |
42 |     re_charEntity = re.compile(r'&#?(?P<name>\w+);')
43 |     sz = re_charEntity.search(htmlstr)
44 |     while sz:
45 |         entity = sz.group() # entity全称,如&gt;
46 |         key = sz.group('name') # 去除&;后entity,如&gt;为gt
47 |         try:
48 |             htmlstr = re_charEntity.sub(CHAR_ENTITIES[key], htmlstr, 1)
49 |             sz = re_charEntity.search(htmlstr)
50 |         except KeyError:
51 |             # 以空串代替
52 |             htmlstr = re_charEntity.sub('', htmlstr, 1)
53 |             sz = re_charEntity.search(htmlstr)
54 |     return htmlstr
55 |
56 |
57 | def repalce(s, re_exp, repl_string):
58 |     return re_exp.sub(repl_string, s)
59 |
60 |
61 | '''
62 | def saveFile(news,cnt):
63 |     path = r'E:\泰迪杯\C题样例数据\All_html 去标签\out%s.txt' % cnt
64 |     file = open(path, 'w+')
65 |     file.write(news)
66 |     file.close()
67 |
68 | if __name__ == '__main__':
69 |     for cnt in range(1, 178):
70 |         try:
71 |             path1 = r'E:\泰迪杯\C题样例数据\All_html\out%s.txt' % cnt
72 |             file = open(path1, 'r')
73 |             text = file.read()
74 |             news = filter_tags(text)
75 |             saveFile(news,cnt)
76 |             file.close()
77 |         except:
78 |             print("第%s文件不存在"%cnt)
79 | '''
80 |
81 |
82 | def saveFile(news, cnt):
83 |     path = r'E:\泰迪杯\C题样例数据\All_html 相似url\66out%d.txt' % cnt
84 |     file = open(path, 'w+')
85 |     file.write(news)
86 |     file.close()
87 |
88 |
89 | #UnicodeDecodeError: 'gbk' codec can't decode byte 0xaf in position 641: illegal multibyte sequence
90 | #上面在liaoxuefeng提到了,可以直接忽略他
91 |
92 | if __name__ == '__main__':
93 |     for cnt in [1,-1]:
94 |         try:
95 |             path1 = r'E:\泰迪杯\C题样例数据\All_html 相似url\out%s.txt' % cnt
96 |             #读取一直错误。。改了半个小时终于改成功了
97 |             #把下面mode = 'r' 改成 'rb', 因为r的时候读进来是gbk..但是也不知道为什么转换不了。。直接读二进制文件吧
98 |             #明天再把编码问题好好看一看
99 |             #第二个改动是decode('utf-8')
100 |             file = open(path1, 'rb')
101 |             text = file.read().decode('utf-8')
102 |             news = filter_tags(text)
103 |             saveFile(news, cnt)
104 |             file.close()
105 |         except:
106 |             print("第%s文件不存在" % cnt)
107 |
--------------------------------------------------------------------------------
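A quick usage sketch for the filter_tags() helper above — this is not part of the repository: the sample HTML string and the 'out1.txt' file name are made up for illustration, and filter_tags()/replaceCharEntity() are assumed to be defined in (or imported from) the module above. It also shows the byte-read-then-decode workaround described in the comments of 数据爬取(去标签).py, with errors='ignore' so a stray non-UTF-8 byte cannot abort the run:

import os

# Assumes filter_tags() from 数据爬取(去标签).py above is in scope.
sample = '<div class="news"><!-- ad --><p>A&nbsp;&gt;&nbsp;B<br/>line&nbsp;2</p></div>'
print(filter_tags(sample))
# Expected output, roughly:
# A > B
# line 2

# Reading a crawled page: open in binary mode and decode explicitly, ignoring
# undecodable bytes instead of letting a platform-default codec (e.g. GBK)
# raise UnicodeDecodeError.
if os.path.exists('out1.txt'):  # 'out1.txt' is a hypothetical crawler output file
    with open('out1.txt', 'rb') as f:
        text = f.read().decode('utf-8', errors='ignore')
        print(filter_tags(text))

--------------------------------------------------------------------------------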
/crawl/news/news_crawl/docs/netease/20180119/D8HAH1VS0001875P.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D8HAH1VS0001875P", "date": "20180119", "source": "netease", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8HAH1VS0001875P.html"}, "contents": {"title": ["\u7537\u5b50\u6df1\u591c\u5c06\u5973\u5b50\u62b1\u81f3\u575f\u5730\u5f3a\u5978 \u4e8b\u540e\u6b32\u706d\u53e3\u7528\u7816\u7838\u5934"], "link": "http://news.163.com/18/0119/15/D8HAH1VS0001875P.html", "passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u7537\u5b50\u6df1\u591c\u730e\u8273\u5f3a\u5978\u5987\u5973 \u62c5\u5fc3\u88ab\u544a\u53d1\u7528\u6c34\u6ce5\u7816\u7838\u5934\u706d\u53e3\uff09\n

\u6b63\u4e49\u7f511\u670819\u65e5\u7535 \u201c\u539f\u5ba1\u88ab\u544a\u4eba\u949f\u67d0\u67d0,\u4f60\u5bf9\u4e00\u5ba1\u5224\u51b3\u8ba4\u5b9a\u7684\u72af\u7f6a\u4e8b\u5b9e\u548c\u8bc1\u636e\u662f\u5426\u6709\u5f02\u8bae?\u201d\u8fd9\u662f\u4e00\u8d77\u7531\u6842\u6797\u5e02\u4eba\u6c11\u68c0\u5bdf\u9662\u4ee5\u5ba1\u5224\u76d1\u7763\u7a0b\u5e8f\u5411\u6842\u6797\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u63d0\u51fa\u6297\u8bc9\u7684\u66b4\u529b\u5211\u4e8b\u6848\u4ef6\u3002\u4e00\u5ba1\u6cd5\u9662\u4ee5\u5f3a\u5978\u7f6a\u5224\u5904\u949f\u67d0\u67d0\u6709\u671f\u5f92\u5211\u4e09\u5e74\u4e03\u4e2a\u6708\u3001\u4ee5\u6545\u610f\u6740\u4eba\u7f6a\u4ec5\u5224\u5904\u5176\u6709\u671f\u5f92\u5211\u516b\u5e74\u56db\u4e2a\u6708,\u51b3\u5b9a\u6267\u884c\u6709\u671f\u5f92\u5211\u5341\u4e00\u5e74\u516d\u4e2a\u6708\u3002

\u8bf4\u8d77\u8fd9\u8d77\u6848\u4ef6\u90a3\u5c31\u8981\u8ffd\u6eaf\u5230\u51e0\u5e74\u524d\u4e86\u30022015\u5e742\u670825\u65e5\u51cc\u6668,\u56db\u5904\u6e38\u8361\u51c6\u5907\u730e\u8273\u7684\u539f\u5ba1\u88ab\u544a\u4eba\u949f\u67d0\u67d0\u6ee1\u8138\u5931\u671b,\u9a7e\u9a76\u6469\u6258\u8f66\u6162\u60a0\u60a0\u5730\u5f80\u5bb6\u8d70,\u884c\u81f3\u8354\u6d66\u53bf\u67d0\u9547\u67d0\u8857,\u949f\u67d0\u67d0\u773c\u524d\u7a81\u7136\u4e00\u4eae,\u524d\u9762\u4ece\u9ebb\u5c06\u9986\u51fa\u95e8\u6b63\u72ec\u81ea\u6b65\u884c\u56de\u5bb6\u7684\u9ec4\u67d0\u67d0\u6b63\u9002\u5408\u4e0b\u624b\u554a!

\n

\u949f\u67d0\u67d0\u9042\u8d76\u4e0a\u524d\u4e3b\u52a8\u63d0\u51fa\u642d\u8f7d\u9ec4\u67d0\u67d0\u56de\u5bb6\u3002\u884c\u81f3\u504f\u50fb\u8def\u6bb5\u949f\u67d0\u67d0\u4fbf\u63d0\u51fa\u8981\u4e0e\u9ec4\u67d0\u67d0\u53d1\u751f\u6027\u5173\u7cfb,\u88ab\u62d2\u7edd\u540e\u949f\u67d0\u67d0\u76f4\u63a5\u5c06\u6b32\u9003\u8dd1\u7684\u9ec4\u67d0\u67d0\u6402\u62b1\u81f3\u8def\u8fb9\u575f\u5730,\u4e0d\u987e\u5bd2\u98ce\u51db\u51bd,\u5f3a\u884c\u5bf9\u9ec4\u67d0\u67d0\u5b9e\u65bd\u5978\u6deb\u3002\u4e8b\u6bd5,\u5fc3\u6ee1\u610f\u8db3\u7684\u949f\u67d0\u67d0\u62c5\u5fc3\u9ec4\u67d0\u67d0\u544a\u53d1,\u9042\u51b3\u5b9a\u706d\u53e3\u3002\u5728\u5c06\u9ec4\u67d0\u67d0\u6390\u6655\u540e,\u949f\u67d0\u67d0\u53cc\u624b\u4ece\u575f\u5806\u65c1\u642c\u8d77\u4e00\u5757\u6c34\u6ce5\u7816\u5f84\u76f4\u8fde\u7eed\u7838\u5411\u9ec4\u67d0\u67d0\u7684\u5934\u90e8\u2026\u2026

\u5341\u4f59\u5c0f\u65f6\u540e\u9ec4\u67d0\u67d0\u88ab\u4eba\u53d1\u73b0\u5e76\u83b7\u6551\u3002\u7ecf\u6cd5\u533b\u9274\u5b9a,\u88ab\u5bb3\u4eba\u9ec4\u67d0\u67d0\u7684\u4eba\u4f53\u635f\u4f24\u7a0b\u5ea6\u6784\u6210\u91cd\u4f24\u4e8c\u7ea7,\u5934\u9762\u90e8\u7684\u4eba\u4f53\u635f\u4f24\u6b8b\u75be\u7a0b\u5ea6\u5c5e\u516d\u7ea7\u6b8b\u75be\u3002

\u5ead\u540e\u4e00\u5468,\u6842\u6797\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u4f5c\u51fa\u7ec8\u5ba1\u5224\u51b3,\u7ef4\u6301\u539f\u5ba1\u6cd5\u9662\u5bf9\u949f\u67d0\u67d0\u5f3a\u5978\u7f6a\u7684\u91cf\u5211,\u5c06\u5176\u6545\u610f\u6740\u4eba\u7f6a\u7684\u91cf\u5211\u7531\u6709\u671f\u5f92\u5211\u516b\u5e74\u56db\u4e2a\u6708\u6539\u5224\u4e3a\u6709\u671f\u5f92\u5211\u5341\u4e94\u5e74,\u5265\u593a\u653f\u6cbb\u6743\u5229\u4e09\u5e74,\u51b3\u5b9a\u6267\u884c\u6709\u671f\u5f92\u5211\u5341\u4e03\u5e74,\u5265\u593a\u653f\u6cbb\u6743\u5229\u4e09\u5e74\u3002

"}, "cmtId": "D8HAH1VS0001875P"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20160721/BSH7V8QF00014JB6.json: -------------------------------------------------------------------------------- 1 | {"newsId": "BSH7V8QF00014JB6", "date": "20160721", "source": "netease", "comments": {"link": "http://comment.news.163.com/news3_bbs/BSH7V8QF00014JB6.html"}, "contents": {"title": ["\u8fbd\u5b81\u906d\u66b4\u96e8\u4fb5\u88ad\u81f4\u57ce\u5e02\u5185\u6d9d \u7d27\u6025\u8f6c\u79fb\u903e12\u4e07\u4eba"], "link": "http://news.163.com/16/0721/19/BSH7V8QF00014JB6.html", "passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u8fbd\u5b81\u906d\u9047\u66b4\u96e8\u4fb5\u88ad\u7d27\u6025\u8f6c\u79fb\u903e12\u4e07\u6c11\u4f17\uff09\n

\u4e2d\u65b0\u793e\u6c88\u96337\u670821\u65e5\u7535 2016\u5e74\u5165\u6c5b\u4ee5\u6765\u6700\u5f3a\u964d\u96e821\u65e5\u4fb5\u88ad\u8fbd\u5b81\uff0c\u9020\u6210\u519c\u7530\u53d7\u707e\u57ce\u5e02\u5185\u6d9d\uff0c\u8be5\u7701\u5df2\u7d27\u6025\u8f6c\u79fb\u903e12\u4e07\u6c11\u4f17\u3002

\u4ece7\u670820\u65e5\u665a\u5f00\u59cb\uff0c\u672c\u8f6e\u5927\u8303\u56f4\u66b4\u96e8\u5f00\u59cb\u5728\u8fbd\u5b81\u897f\u90e8\u5730\u533a\u8086\u8650\uff0c\u6cbf\u6d77\u90e8\u5206\u6cb3\u6d41\u53d1\u751f\u6d2a\u6c34\uff0c\u81f321\u65e5\u964d\u96e8\u8303\u56f4\u6269\u6563\u5230\u8fbd\u5b81\u5168\u5883\u3002

\u4e2d\u65b0\u793e\u8bb0\u800521\u65e5\u5728\u7701\u4f1a\u6c88\u9633\u770b\u5230\uff0c\u5929\u7a7a\u9634\u6c89\u72b9\u5982\u508d\u665a\uff0c\u5927\u96e8\u503e\u76c6\u800c\u4e0b\uff0c\u5728\u4e00\u4e9b\u79ef\u6c34\u4e25\u91cd\u7684\u8857\u8def\u4e0a\uff0c\u6d88\u9632\u4eba\u5458\u51fa\u52a8\u76ae\u5212\u8247\u8fd0\u8f7d\u53d7\u56f0\u6c11\u4f17\u3002\u5728\u846b\u82a6\u5c9b\u5e02\uff0c\u90e8\u5206\u5730\u533a\u964d\u96e8\u91cf\u7a81\u7834\u6709\u6c14\u8c61\u8bb0\u5f55\u4ee5\u6765\u7684\u5386\u53f2\u6781\u503c\uff0c\u4e0d\u65ad\u6709\u8f66\u8f86\u5728\u79ef\u6c34\u91cc\u629b\u951a\uff0c\u5f53\u5730\u8fb9\u9632\u5b98\u5175\u8fde\u591c\u8f6c\u79fb\u4e8688\u540d\u8f96\u533a\u6c11\u4f17\u3002

\u636e\u8fbd\u5b81\u7701\u9632\u6c5b\u6297\u65f1\u6307\u6325\u90e8\u4ecb\u7ecd\uff0c\u622a\u81f3\u76ee\u524d\uff0c\u6c14\u8c61\u90e8\u95e8\u5df2\u63a5\u8fde\u53d1\u5e03\u66b4\u96e8\u7ea2\u8272\u9884\u8b667\u4e2a\uff0c\u66b4\u96e8\u6a59\u8272\u9884\u8b6616\u4e2a\uff0c\u5168\u7701\u6700\u5927\u964d\u6c34\u91cf\u51fa\u73b0\u5728\u846b\u82a6\u5c9b\u5e02\u7ee5\u4e2d\u53bf\uff0c\u8fbe\u5230396\u6beb\u7c73\u3002

\n

\u53d7\u5f3a\u964d\u96e8\u5f71\u54cd\uff0c\u622a\u81f37\u670821\u65e515\u65f6\u8bb8\uff0c\u8fbd\u5b81\u5168\u7701\u8d85\u6c5b\u9650\u6c34\u4f4d\u8fd0\u884c\u7684\u6c34\u5e93\u670930\u5ea7\uff0c\u5176\u4e2d\u5927\u4e2d\u578b\u6c34\u5e933\u5ea7\u300221\u65e5\uff0c\u8fbd\u5b8130\u5ea7\u5927\u578b\u6c34\u5e93\u603b\u84c4\u6c34\u91cf\u4e3a33.41\u4ebf\u7acb\u65b9\u7c73\uff0c\u6bd42015\u5e74\u540c\u671f\u591a5.76\u4ebf\u7acb\u65b9\u7c73\u3002

\u76ee\u524d\uff0c\u8fbd\u5b81\u846b\u82a6\u5c9b\u5e02\u670925\u4e2a\u4e61\u9547\u53d7\u707e\uff0c\u5012\u584c\u623f\u5c4b28\u95f4\uff0c\u519c\u4f5c\u7269\u53d7\u707e\u9762\u79ef39.2\u4e07\u4ea9\uff0c\u635f\u6bc1\u5824\u96320.8\u516c\u91cc\uff0c\u76f4\u63a5\u7ecf\u6d4e\u635f\u59311900\u4e07\u5143\u4eba\u6c11\u5e01\u3002\u5176\u4ed6\u5730\u533a\u707e\u60c5\u6b63\u5728\u8fdb\u4e00\u6b65\u6838\u5b9e\u4e2d\u3002\u672c\u8f6e\u66b4\u96e8\u8fbd\u5b81\u5171\u8f6c\u79fb12\u4e2a\u5e02\u7684\u6c11\u4f1712.59\u4e07\u4eba\uff0c\u6682\u65f6\u6ca1\u6709\u6536\u5230\u4eba\u5458\u4f24\u4ea1\u62a5\u544a\u3002

\u7a81\u5982\u5176\u6765\u7684\u66b4\u96e8\u4ea6\u4f7f\u4ea4\u901a\u51fa\u884c\u53d7\u5230\u4e25\u91cd\u5f71\u54cd\uff0c\u8fbd\u5b81\u5883\u518516\u6761\u9ad8\u901f\u516c\u8def\u5c01\u95ed\u6216\u9650\u884c\uff1b39\u8d9f\u65c5\u5ba2\u5217\u8f66\u4e34\u65f6\u505c\u8fd0\uff1b\u6cbf\u6d77\u6e2f\u53e3\u53d7\u5927\u98ce\u5f71\u54cd\u90e8\u5206\u73ed\u6b21\u505c\u822a\u3002

\u6c14\u8c61\u90e8\u95e8\u9884\u8ba1\uff0c22\u65e5\u8fbd\u4e1c\u5730\u533a\u7684\u672c\u6eaa\u3001\u4e39\u4e1c\u7b49\u5730\u8fd8\u5c06\u7ee7\u7eed\u906d\u9047\u66b4\u96e8\u3002

"}, "cmtId": "BSH7V8QF00014JB6"} -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for jnuxshc project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'jnuxshc' 13 | 14 | SPIDER_MODULES = ['jnuxshc.spiders'] 15 | NEWSPIDER_MODULE = 'jnuxshc.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'oozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | #CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | #DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | #CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | #COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | #TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | #DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | #} 45 | 46 | # Enable or disable spider middlewares 47 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 48 | #SPIDER_MIDDLEWARES = { 49 | # 'jnuxshc.middlewares.JnuxshcSpiderMiddleware': 543, 50 | #} 51 | 52 | # Enable or disable downloader middlewares 53 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 54 | #DOWNLOADER_MIDDLEWARES = { 55 | # 'jnuxshc.middlewares.JnuxshcDownloaderMiddleware': 543, 56 | #} 57 | 58 | # Enable or disable extensions 59 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 60 | #EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | #} 63 | 64 | # Configure item pipelines 65 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 66 | #ITEM_PIPELINES = { 67 | # 'jnuxshc.pipelines.JnuxshcPipeline': 300, 68 | #} 69 | 70 | # Enable and configure the AutoThrottle extension (disabled by default) 71 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 72 | #AUTOTHROTTLE_ENABLED = True 73 | # The initial download delay 74 | #AUTOTHROTTLE_START_DELAY = 5 75 | # The maximum download delay to be set in case of high latencies 76 | #AUTOTHROTTLE_MAX_DELAY = 60 77 | # The average number of requests Scrapy should be sending in parallel to 78 | # each remote server 79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 80 | # Enable showing throttling stats for every response received: 81 | #AUTOTHROTTLE_DEBUG = False 82 | 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See 
https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | #HTTPCACHE_ENABLED = True 86 | #HTTPCACHE_EXPIRATION_SECS = 0 87 | #HTTPCACHE_DIR = 'httpcache' 88 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 90 | FEED_URI = u'./jnu.csv' 91 | FEED_FORMAT = 'CSV' 92 | 93 | FEED_EXPORTERS = { 94 | 'csv': 'jnuxshc.spiders.csv_item_exporter.MyProjectCsvItemExporter', 95 | } #jnuxshc为工程名 96 | FIELDS_TO_EXPORT = [ 97 | 'time', 98 | 'title', 99 | 'intro' 100 | ] 101 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20160418/023091.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "023091", "comments": {"link": "http://coral.qq.com/1373761671"}, "date": "20160418", "contents": {"link": "https://news.qq.com/a/20160418/023091.htm", "title": ["\u5df4\u897f\u4f17\u9662\u5f39\u52be\u603b\u7edf\u6848\u83b7\u901a\u8fc7 \u7f57\u585e\u592b\u653f\u515a\u627f\u8ba4\u843d\u8d25"], "passage": "\n\n\n\n\n\n\n\n\n\r\n\r\n\r\n\r\n \r\n\r\n\r\n\u4e2d\u65b0\u7f514\u670818\u65e5\u7535 \u7efc\u5408\u5916\u5a92\u62a5\u9053\uff0c\u5df4\u897f\u4f17\u8bae\u966217\u65e5\u9488\u5bf9\u662f\u5426\u5f39\u52be\u603b\u7edf\u7f57\u585e\u592b\u8fdb\u884c\u8868\u51b3\uff0c\u5230\u76ee\u524d\u4e3a\u6b62\uff0c513\u540d\u8bae\u5458\u4e2d\u5df2\u6709\u81f3\u5c11342\u540d\u8bae\u5458\u5bf9\u5f39\u52be\u603b\u7edf\u7f57\u585e\u592b\u6295\u4e86\u8d5e\u6210\u7968\uff0c\u8fd9\u610f\u5473\u7740\u5f39\u52be\u6848\u5728\u4f17\u9662\u83b7\u5f97\u901a\u8fc7\uff0c\u5f39\u52be\u603b\u7edf\u7a0b\u5e8f\u5c06\u7ee7\u7eed\u3002\u5f39\u52be\u62a5\u544a\u5c06\u9012\u4ea4\u7ed9\u53c2\u8bae\u9662\u505a\u51fa\u6700\u7ec8\u8868\u51b3\u3002\u800c\u7f57\u585e\u592b\u6240\u5c5e\u653f\u515a\u8868\u793a\u5927\u52bf\u5df2\u53bb\uff0c\u65e0\u6cd5\u907f\u514d\u603b\u7edf\u906d\u5f39\u52be\u3002\u62a5\u9053\u79f0\uff0c\u5df4\u897f\u6267\u653f\u515a\u52b3\u5de5\u515a\u515a\u56e2\u9886\u8896\u5b63\u9a6c\u745e\u65af\u4e5f\u8868\u793a\uff0c\u5bf9\u4f17\u8bae\u9662\u5f39\u52be\u7f57\u585e\u592b\u7684\u8868\u51b3\u627f\u8ba4\u5931\u8d25\u3002\u4ed6\u5728\u4f17\u9662\u53d7\u8bbf\u8bf4\uff1a\u201c\u73b0\u5728\u8981\u5728\u53c2\u9662\u7eed\u6218\u4e86\u3002\u201d\u62a5\u9053\u6307\u51fa\uff0c\u6839\u636e\u5df4\u897f\u6cd5\u5f8b\uff0c\u4e3b\u5f20\u5f39\u52be\u4e00\u65b9\u5fc5\u987b\u5728\u6b64\u8f6e\u8868\u51b3\u4e2d\u83b7\u5f97\u4e09\u5206\u4e4b\u4e8c\u7684\u6295\u7968\uff0c\u5373\u5728513\u5f20\u6295\u7968\u4e2d\u4e89\u53d6\u5230342\u7968\uff0c\u624d\u80fd\u5c06\u52a8\u8bae\u63d0\u4ea4\u5230\u53c2\u8bae\u9662\uff0c\u5e76\u7531\u53c2\u8bae\u9662\u51b3\u5b9a\u603b\u7edf\u662f\u5426\u4ece\u4e8b\u4e86\u975e\u6cd5\u884c\u4e3a\u3002\u5f39\u52be\u6848\u5728\u4f17\u9662\u901a\u8fc7\u540e\uff0c\u53c2\u8bae\u9662\u5c06\u5bf9\u5176\u8fdb\u884c\u9996\u8f6e\u8868\u51b3\uff0c\u65f6\u95f4\u53ef\u80fd\u57285\u6708\u3002\u5982\u679c\u53c2\u8bae\u9662\u5728\u9996\u8f6e\u8868\u51b3\u4e2d\u83b7\u5f97\u7b80\u5355\u591a\u6570\u652f\u6301\uff0c\u7f57\u585e\u592b\u987b\u79bb\u804c180\u5929\uff0c\u5176\u95f4\u603b\u7edf\u4e00\u804c\u7531\u526f\u603b\u7edf\u4ee3\u7406\u3002\u53c2\u8bae\u9662\u4e4b\u540e\u5c06\u542c\u53d6\u8bc1\u636e\uff0c\u518d\u8fdb\u884c\u7b2c\u4e8c\u8f6e\u8868\u51b3\uff0c\u5982\u679c2/3\u4ee5\u4e0a\u7684\u8bae\u5458\u652f\u6301\u5f39\u52be\uff0c\u5219\u7f57\u585e\u592b\u4e0b\u53f0\uff0c\u526f\u603b\u7edf\u7279\u6885\u5c14\u63a5\u4efb\u
ff1b\u5982\u679c\u53c2\u8bae\u9662\u652f\u6301\u5f39\u52be\u7684\u8bae\u5458\u4e0d\u52302/3\uff0c\u7f57\u585e\u592b\u6062\u590d\u603b\u7edf\u804c\u4f4d\u3002\u62a5\u9053\u79f0\uff0c\u56e0\u4e3a\u5df4\u897f\u53c2\u8bae\u9662\u548c\u4f17\u8bae\u9662\u7684\u6784\u6210\u6781\u4e3a\u76f8\u4f3c\uff0c\u6240\u4ee5\u53c2\u8bae\u9662\u53ef\u80fd\u5f97\u51fa\u4e0e\u4f17\u8bae\u9662\u76f8\u540c\u7684\u7ed3\u8bba\u3002\u5982\u679c\u7f57\u585e\u592b\u6700\u7ec8\u88ab\u5f39\u52be\u4e0b\u53f0\uff0c\u7279\u6885\u5c14\u5c06\u63a5\u4efb\u603b\u7edf\u804c\u4f4d\uff0c\u4f46\u662f\u56e0\u4e3a\u7279\u6885\u5c14\u4e5f\u5377\u5165\u8d2a\u8150\u6848\u4ef6\u4e2d\uff0c\u7f57\u585e\u592b\u7684\u652f\u6301\u8005\u5df2\u7ecf\u5f00\u59cb\u5bf9\u4ed6\u8fdb\u884c\u5f39\u52be\u884c\u52a8\u3002\u8fd9\u4e5f\u5c31\u610f\u5473\u7740\uff0c\u5728\u4eca\u5e748\u67085\u65e5\u81f321\u65e5\u5df4\u897f\u9996\u6b21\u4e3e\u884c\u590f\u5b63\u5965\u8fd0\u4f1a\u65f6\uff0c\u5176\u653f\u5c40\u4ecd\u7136\u5728\u6df7\u4e71\u4e4b\u4e2d\u3002\u636e\u6089\uff0c\u4ece\u5df4\u897f\u5f53\u5730\u65f6\u95f44\u670815\u65e5\u65e9\u4e0a\u5f00\u59cb\u4e00\u76f4\u523017\u65e5\u6e05\u6668\uff0c\u6709120\u540d\u8bae\u5458\u53c2\u52a0\u4e86\u5173\u4e8e\u662f\u5426\u5f39\u52be\u7f57\u585e\u592b\u7684\u8fa9\u8bba\uff0c\u8fa9\u8bba\u65f6\u95f4\u8d85\u8fc743\u4e2a\u5c0f\u65f6\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20161227/011065.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "011065", "comments": {"link": "http://coral.qq.com/1687646782"}, "date": "20161227", "contents": {"link": "https://news.qq.com/a/20161227/011065.htm", "title": ["\u5c0f\u4f19\u51fa\u5dee\u90d1\u5dde\u9047\u96fe\u973e\u8bc9\u653f\u5e9c\u88ab\u9a73\uff1a\u5e94\u5148\u7533\u8bf7\u653f\u5e9c\u8d54\u507f\u53e3\u7f69\u94b1"], "passage": 
"\u56e0\u51fa\u5dee\u90d1\u5dde\u53d1\u73b0\u5f53\u5730\u96fe\u973e\u4e25\u91cd\uff0c\u65e5\u524d\uff0c\u6cb3\u5357\u7701\u65b0\u4e61\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u9a73\u56de\u539f\u544a\u8d77\u8bc9\uff0c\u7406\u7531\u662f\u5176\u8d77\u8bc9\u524d\u5e76\u672a\u5411\u90d1\u5dde\u5e02\u653f\u5e9c\u63d0\u51fa\u8fc7\u8d54\u507f\u7533\u8bf7\u3002\u6cb3\u5357\u7701\u65b0\u4e61\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u884c\u653f\u8d54\u507f\u88c1\u5b9a\u4e66\u3002\u6f8e\u6e43\u65b0\u95fb\uff08www.thepaper.cn\uff09\u83b7\u5f97\u7684\u88c1\u5b9a\u4e66\u663e\u793a\uff0c\u65b0\u4e61\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u8ba4\u4e3a\uff0c\u4f9d\u636e\u300a\u4e2d\u534e\u4eba\u6c11\u5171\u548c\u56fd\u56fd\u5bb6\u8d54\u507f\u6cd5\u300b\u7b2c\u4e5d\u6761\u7b2c\u4e8c\u6b3e\u89c4\u5b9a\uff0c\u6b64\u5916\uff0c\u4f9d\u636e\u300a\u6700\u9ad8\u4eba\u6c11\u6cd5\u9662\u5173\u4e8e\u5ba1\u7406\u884c\u653f\u8d54\u507f\u6848\u4ef6\u82e5\u5e72\u95ee\u9898\u7684\u89c4\u5b9a\u300b\u7b2c\u56db\u6761\u7b2c\u4e8c\u6b3e\uff0c\u65b0\u4e61\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u8ba4\u4e3a\uff0c\u539f\u544a\u5728\u63d0\u8d77\u8bc9\u8bbc\u524d\uff0c\u5176\u8d54\u507f\u8bf7\u6c42\u5c1a\u672a\u7ecf\u8fc7\u90d1\u5dde\u5e02\u4eba\u6c11\u653f\u5e9c\u5148\u884c\u5904\u7406\u3002\u56e0\u6b64\uff0c\u6cd5\u9662\u5e94\u5f53\u9a73\u56de\u539f\u544a\u8d77\u8bc9\u300212\u670826\u65e5\u665a\uff0c\u5b59\u6d2a\u5f6c\u544a\u8bc9\u6f8e\u6e43\u65b0\u95fb\uff0c\u4ed6\u4e8e26\u65e5\u5f53\u5929\u6536\u5230\u4e86\u8be5\u88c1\u5b9a\u4e66\u3002\u5b59\u6d2a\u5f6c\u8bf4\uff0c\u8be5\u88c1\u5b9a\u5728\u4ed6\u7684\u610f\u6599\u4e4b\u4e2d\u3002\u201c\u73b0\u5728\u8fd8\u6ca1\u51b3\u5b9a\u8981\u4e0d\u8981\u4e0a\u8bc9\uff0c\u4f46\u662f\u54a8\u8be2\u4e86\u5f8b\u5e08\u4e5f\u8bf4\u4e0a\u8bc9\u4e5f\u6ca1\u6709\u610f\u4e49\uff0c\u4f30\u8ba1\u4e0d\u4f1a\u7ee7\u7eed\uff08\u4e0a\u8bc9\uff09\u4e86\u201d\u3002\u6f8e\u6e43\u65b0\u95fb\u6ce8\u610f\u5230\uff0c\u300a\u8d54\u507f\u6cd5\u300b\u89c4\u5b9a\uff0c\u8d54\u507f\u4e49\u52a1\u673a\u5173\u53ef\u4ee5\u5728\u4e24\u4e2a\u6708\u5185\u505a\u51fa\u662f\u5426\u8d54\u507f\u7684\u51b3\u5b9a\u3002\u4ed6\u8bf4\uff0c\u5728\u5411\u5e02\u653f\u5e9c\u63d0\u51fa\u8d54\u507f\u7533\u8bf7\u4e4b\u540e\uff0c\u81ea\u5df1\u53c8\u5411\u6cd5\u9662\u9012\u4ea4\u4e86\u53e6\u5916\u4e00\u4efd\u8bc9\u8bbc\u72b6\uff0c\u8981\u6c42\u786e\u8ba4\u90d1\u5dde\u5e02\u653f\u5e9c\u6cbb\u973e\u4e0d\u4f5c\u4e3a\uff0c\u672a\u4e25\u683c\u5c65\u884c\u5927\u6c14\u6c61\u67d3\u9632\u6cbb\u6cd5\u5b9a\u804c\u8d23\uff0c\u201c\u73b0\u5728\u4e3b\u8981\u770b\u8fd9\u4e2a\u8bc9\u8bbc\u80fd\u5426\u7acb\u6848\u4e86\u3002\u201d\u6f8e\u6e43\u65b0\u95fb\u6b64\u524d\u62a5\u9053\uff0c11\u670820\u65e5\uff0c\u5b59\u6d2a\u5f6c\u5728\u90d1\u5dde\u51fa\u5dee\u65f6\uff0c\u5728\u8be5\u5e02\u5730\u6807\u5efa\u7b51\u4e8c\u4e03\u5854\u9644\u8fd1\u611f\u89c9\u201c\u7279\u522b\u545b\u201d\uff0c\u4ed6\u4fbf\u4e70\u4e86\u4e00\u526f\u4ef7\u503c32\u5143\u7684\u9632\u973e\u53e3\u7f69\u3002\u5f53\u5929\u90d1\u5dde\u5e02AQI\u4e3a253\uff0c\u5c5e\u4e8e\u91cd\u5ea6\u6c61\u67d3\u3002\u5f53\u665a\uff0c\u5b59\u6d2a\u5f6c\u62df\u51fa\u4e00\u4efd\u8bc9\u8bbc\u72b6\uff0c\u79f0\u4f9d\u636e\u300a\u73af\u5883\u4fdd\u62a4\u6cd5\u300b\u53ca\u300a\u5927\u6c14\u6c61\u67d3\u9632\u6cbb\u6cd5\u300b\u89c4\u5b9a\uff0c\u90d1\u5dde\u5e02\u653f\u5e9c\u5e94\u5bf9\u672c\u884c\u653f\u533a\u57df\u7684\u73af\u5883\u8d28\u91cf\u8d1f\u8d23\u3002\u5b59\u6d2a\u5f6c\u8bf7\u6c42\u4f9d\u6cd5\u5224\u4ee4\u88ab\u544a\u8d54\u507f\u572811\u670820\u65e5\u90d1\u5dde\u96fe\u973e\u671f\u95f4\u7684\u53e3\u7f69\u8d2d\u4e70\
u8d39\u7528\uff0c\u5e76\u5224\u4ee4\u88ab\u544a\u627f\u62c5\u672c\u6848\u8bc9\u8bbc\u8d39\u3002\u65b0\u4e61\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u66fe\u4e8e11\u670825\u65e5\u7ec4\u6210\u4e86\u5408\u8bae\u5ead\uff0c\u53d7\u7406\u6b64\u6848\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class CrawlSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class CrawlDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 
85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class JnuxshcSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class JnuxshcDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 
74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /机器学习入门/label_propagation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 11:28 4 | # @Author : mazicwong 5 | # @File : label_propagation.py 6 | 7 | import time 8 | import numpy as np 9 | 10 | 11 | # return k neighbors index 12 | def navie_knn(dataSet, query, k): 13 | numSamples = dataSet.shape[0] 14 | 15 | ## step 1: calculate Euclidean distance 16 | diff = np.tile(query, (numSamples, 1)) - dataSet 17 | squaredDiff = diff ** 2 18 | squaredDist = np.sum(squaredDiff, axis=1) # sum is performed by row 19 | 20 | ## step 2: sort the distance 21 | sortedDistIndices = np.argsort(squaredDist) 22 | if k > len(sortedDistIndices): 23 | k = len(sortedDistIndices) 24 | 25 | return sortedDistIndices[0:k] 26 | 27 | 28 | # build a big graph (normalized weight matrix) 29 | def buildGraph(MatX, kernel_type, rbf_sigma=None, knn_num_neighbors=None): 30 | num_samples = MatX.shape[0] 31 | affinity_matrix = np.zeros((num_samples, num_samples), np.float32) 32 | if kernel_type == 'rbf': 33 | if rbf_sigma == None: 34 | raise ValueError('You should input a sigma of rbf kernel!') 35 | for i in range(num_samples): 36 | row_sum = 0.0 37 | for j in range(num_samples): 38 | diff = MatX[i, :] - MatX[j, :] 39 | affinity_matrix[i][j] = np.exp(sum(diff ** 2) / (-2.0 * rbf_sigma ** 2)) 40 | row_sum += affinity_matrix[i][j] 41 | affinity_matrix[i][:] /= row_sum 42 | elif kernel_type == 'knn': 43 | if knn_num_neighbors == None: 44 | raise ValueError('You should input a k of knn kernel!') 45 | for i in range(num_samples): 46 | k_neighbors = navie_knn(MatX, MatX[i, :], knn_num_neighbors) 47 | affinity_matrix[i][k_neighbors] = 1.0 / knn_num_neighbors 48 | else: 49 | raise NameError('Not support kernel type! 
You can use knn or rbf!') 50 | 51 | return affinity_matrix 52 | 53 | 54 | # label propagation 55 | def labelPropagation(Mat_Label, Mat_Unlabel, labels, kernel_type='rbf', rbf_sigma=1.5, \ 56 | knn_num_neighbors=10, max_iter=500, tol=1e-3): 57 | # initialize 58 | num_label_samples = Mat_Label.shape[0] 59 | num_unlabel_samples = Mat_Unlabel.shape[0] 60 | num_samples = num_label_samples + num_unlabel_samples 61 | labels_list = np.unique(labels) 62 | num_classes = len(labels_list) 63 | 64 | MatX = np.vstack((Mat_Label, Mat_Unlabel)) 65 | clamp_data_label = np.zeros((num_label_samples, num_classes), np.float32) 66 | for i in range(num_label_samples): 67 | clamp_data_label[i][labels[i]] = 1.0 68 | 69 | label_function = np.zeros((num_samples, num_classes), np.float32) 70 | label_function[0: num_label_samples] = clamp_data_label 71 | label_function[num_label_samples: num_samples] = -1 72 | 73 | # graph construction 74 | affinity_matrix = buildGraph(MatX, kernel_type, rbf_sigma, knn_num_neighbors) 75 | 76 | # start to propagation 77 | iter = 0; 78 | pre_label_function = np.zeros((num_samples, num_classes), np.float32) 79 | changed = np.abs(pre_label_function - label_function).sum() 80 | while iter < max_iter and changed > tol: 81 | if iter % 1 == 0: 82 | print 83 | "---> Iteration %d/%d, changed: %f" % (iter, max_iter, changed) 84 | pre_label_function = label_function 85 | iter += 1 86 | 87 | # propagation 88 | label_function = np.dot(affinity_matrix, label_function) 89 | 90 | # clamp 91 | label_function[0: num_label_samples] = clamp_data_label 92 | 93 | # check converge 94 | changed = np.abs(pre_label_function - label_function).sum() 95 | 96 | # get terminate label of unlabeled data 97 | unlabel_data_labels = np.zeros(num_unlabel_samples) 98 | for i in range(num_unlabel_samples): 99 | unlabel_data_labels[i] = np.argmax(label_function[i + num_label_samples]) 100 | 101 | return unlabel_data_labels 102 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TutotialSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 
41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class TutotialDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20171129/013590.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "013590", "comments": {"link": "http://coral.qq.com/2259249504"}, "date": "20171129", "contents": {"link": "https://news.qq.com/a/20171129/013590.htm", "title": ["\u8054\u901a\u7545\u6e38\u51b0\u6fc0\u51cc\u5957\u9910\uff0c\u8bed\u97f3\u3001\u6d41\u91cf\u5168\u56fd\u7545\u723d\u4f7f\u7528"], "passage": "\u4e2d\u56fd\u8054\u901a\u6b63\u5f0f\u63a8\u51fa\u7545\u723d\u51b0\u6fc0\u51cc\u5957\u9910\uff0c\u542b\u8d85\u5927\u6d41\u91cf\u3001\u8d85\u591a\u8bed\u97f3\uff0c\u53ef\u5728\u5168\u56fd\u8303\u56f4\u5185\u7545\u723d\u4f7f\u7528\u3002\u4e0d\u9650\u6d41\u91cf\u3001\u4e0d\u9650\u8bed\u97f3\u7684\u51b0\u6fc0\u51cc\u5957\u9910\uff0c\u5c06\u7ed9\u7528\u6237\u5e26\u6765\u590f\u5929\u4eab\u53d7\u51b0\u6fc0\u51cc\u4e00\u6837\u7684\u7545\u723d\u611f\uff0c\u662f\u4e00\u6b3e\u5f70\u663e\u8054\u901a\u4e2a\u6027\u7684\u4ea7\u54c1\uff0c\u4f5c\u4e3a\u4e1a\u754c\u9996\u521b\uff0c\u51b0\u6fc0\u51cc\u5957\u9910\u4ea7\u54c1\u663e\u793a\u4e86\u4e2d\u56fd\u8054\u901a\u4e00\u76f4\u4ee5\u6765\u4fdd\u6301\u7740\u7684\u6d3b\u529b\u4e0e\u521b\u65b0\u3002\u6d41\u91cf\u4e0d\u9650\u91cf\uff0c\u7d27\u8ddf\u5f53\u4e0b\u5e74\u8f7b\u4eba\u7231\u8ffd\u5267\u3001\u7231\u76f4\u64ad\u7684\u6d88\u8d39\u4e60\u60ef\uff0c\u540c\u65f6\u6ee1\u8db3\u7ecf\u5e38\u51fa\u5dee\u3001\u65c5\u6e38\u7b49\u5546\u65c5\u4eba\u58eb\u7684\u9700\u6c42\u3002\u901a\u8bdd\u4e0d\u9650\u91cf\uff0c\u8ba9\u7528\u6237\u4e0e\u5bb6\u4eba\u670b\u53cb\u8fdb\u884c\u901a\u8bdd\u65f6\uff0c\u4e0d\u518d\u957f\u8bdd\u77ed\u8bf4\uff0c\u5b9e\u73b0\u771f\u6b63\u610f\u4e49\u4e0a\u7684\u7545\u723d\u804a\u5929\uff0c\u5168\u56fd\u901a\u7528\uff0c\u65e0\u6f2b\u6e38\u3001\u957f\u9014\u8d39\u7528\u4ea7\u751f\u3002\u73b0\u767b\u5f55\u8054\u901a\u7f51\u4e0a\u8425\u4e1a\u5385\uff0c\u5373\u53ef\u9996\u670899\u5143\u4eab\u53d7\u4e0d\u9650\u91cf\u7684\u51b0\u6fc0\u51cc\u5957\u9910\uff1b\u9884\u5b5899\u5143\u9001100\u5143\uff0c\u6708\u8d39\u6c38\u4e455\u6298\uff08\u539f\u4ef7398\uff0c\u73b0\u4ec5\u9700\u6708\u8d39199\uff09\uff1b\u4ec5\u9650\u8054\u901a\u7f51\u4e0a\u5546\u57ce\u529e\u7406\u7528\u6237\u3002\u751f\u65e5\u53f7\u3001\u60c5\u4fa3\u53f7\u7b49\u968f\u610f\u9009\uff0c\u8ba9\u4f60\u7684\u624b\u673a\u53f7\u4e0d\u518d\u662f\u51b7\u51b0\u51b0\u7684\u4e00\u7ec4\u6570\u5b57\u3002\u4e2d\u56fd\u8054\u901a\u4ee5\u7528\u6237\u5229\u76ca\u4e3a\u6838\u5fc3\uff0c\u5df2\u5b8c\u6210\u4e00\u7cfb\u5217\u521b\u65b0\u52a8\u4f5c\uff0c\u6b64\u524d\uff0c\u8054\u5408\u4e92\u8054\u7f51\u516c\u53f8\u63a8\u51fa\u4e86\u8682\u8681\u5b9d\u5361\u3001\u817e\u8baf\u738b\u5361\u7b49\u521b\u65b0\u4ea7\u54c1\uff0c\u6b64\u6b21\uff0c\u63a8\u51fa\u7684\u5168\u56fd\u7545\u723d\u51b0\u6fc0\u51cc\u5957\u9910\uff0c 
\u4e5f\u662f\u54cd\u5e94\u56fd\u5bb6\u63d0\u901f\u964d\u8d39\u653f\u7b56\uff0c\u8df5\u884c\u201c\u6d41\u91cf\u653e\u5fc3\u7528\u201d\u7684\u53c8\u4e00\u529b\u4e3e\u3002\u672a\u6765\uff0c\u4e2d\u56fd\u8054\u901a\u5c06\u628a\u51b0\u6fc0\u51cc\u5957\u9910\u4f5c\u4e3a\u4ea7\u54c1\u4f18\u5316\u7684\u6807\u6746\uff0c\u4ee5\u96f6\u6346\u7ed1\u3001\u6d41\u91cf\u8d85\u591a\u3001\u64cd\u4f5c\u7b80\u5355\u3001\u65b9\u4fbf\u7528\u6237\u4f7f\u7528\u7b49\u4e3a\u4ea7\u54c1\u4f18\u5316\u7684\u539f\u5219\uff0c\u63a8\u51fa\u66f4\u591a\u201c\u7c7b\u51b0\u6fc0\u51cc\u5957\u9910\u201d\u4ea7\u54c1\uff0c\u5728\u8bed\u97f3\u3001\u6d41\u91cf\u4eab\u53d7\u8d85\u7ea7\u989d\u5ea6\u7684\u57fa\u7840\u4e0a\uff0c\u5b9e\u73b0\u7ec8\u7aef\u5957\u9910\u4e0d\u6346\u7ed1\u3001\u6863\u4f4d\u968f\u610f\u66f4\u6362\u3001\u5957\u9910\u6863\u4f4d\u7cbe\u7b80\u3001\u65b0\u8001\u7528\u6237\u4f18\u60e0\u540c\u4eab\u7b49\u7279\u70b9\u7684\u4ea7\u54c1\u4f18\u5316\uff0c\u4e3a\u7528\u6237\u5e26\u6765\u66f4\u52a0\u653e\u5fc3\u7684\u4f7f\u7528\u4f53\u9a8c\uff0c\u5e76\u4ece\u591a\u4e2a\u5c42\u9762\u4e30\u5bcc\u8054\u901a\u201c\u6c834G+\u201d\u6781\u901f\u7f51\u7edc\u7684\u5320\u5fc3\u610f\u4e49\u3002\u51b0\u6fc0\u51cc\u5957\u9910\u5df2\u5728\u5168\u56fd\u8303\u56f4\u5185\u9646\u7eed\u4e0a\u5e02\u53d1\u552e\uff0c\u8be6\u8be210010\u6216\u54a8\u8be2\u5f53\u5730\u8425\u4e1a\u5385\u3002http://www.10010.com/goodsdetail/111711031180.html\uff08\u6ce8\uff1a\u6b64\u6587\u5c5e\u4e8e\u817e\u8baf\u7f51\u767b\u8f7d\u7684\u5546\u4e1a\u4fe1\u606f\uff0c\u6587\u7ae0\u5185\u5bb9\u4e0d\u4ee3\u8868\u672c\u7f51\u89c2\u70b9\uff0c\u4ec5\u4f9b\u53c2\u8003\uff09"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HIR5JP0001875P.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HIR5JP0001875P", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8HIR5JP0001875P.html"}, "newsId": "D8HIR5JP0001875P", "contents": {"passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u7537\u5b50\u5192\u5145\u6559\u80b2\u5c40\u957f\u642d\u8baa\u5973\u751f\u79f0\u5176\u53ef\u5b89\u6392\u5de5\u4f5c \u804c\u6821\u5973\u5b69\u88ab\u9a974000\u5143\uff09\n

\"\u7537\u5b50\u5192\u5145\u6559\u80b2\u5c40\u957f\u81ea\u79f0\u53ef\u5b89\u6392\u5de5\u4f5c

\u5c01\u9762\u65b0\u95fb\u8baf 1\u670818\u65e5\uff0c\u7ef5\u9633\u67d0\u804c\u682117\u5c81\u5973\u751f\u8def\u8fc7\u9752\u5e74\u5e7f\u573a\u65f6\uff0c\u4e00\u7537\u5b50\u642d\u8baa\u81ea\u79f0\u662f\u6559\u80b2\u5c40\u526f\u5c40\u957f\uff0c\u53ef\u4ee5\u4e3a\u5176\u5b89\u6392\u5de5\u4f5c\uff0c\u9a97\u5f97\u5973\u5b69\u4fe1\u4efb\u3002\u4ea4\u8c08\u540e\u8be5\u7537\u5b50\u9a6c\u4e0a\u53c2\u52a0\u8001\u5c40\u957f\u751f\u65e5\u5bb4\uff0c\u9a97\u5f97\u5973\u5b694000\u5143\u751f\u6d3b\u8d39\u3002

18\u65e5\u4e0b\u53483\u70b9\uff0c\u7279\u5de1\u8b66\u652f\u961f\u5de1\u903b\u4e00\u5927\u961f\u63a5\u5230\u62a5\u8b66\u79f0\uff0c\u5728\u9752\u5e74\u5e7f\u573a\u6709\u4e00\u5973\u5b69\u88ab\u9a97\u3002\u6c11\u8b66\u8d76\u5230\u73b0\u573a\u4e86\u89e3\u5230\u5973\u5b69\u59d3\u656c\uff0c\u4eca\u5e7417\u5c81\uff0c\u7ef5\u9633\u67d0\u804c\u6821\u5b66\u751f\uff0c\u5973\u5b69\u54ed\u8bc9\u5979\u88ab\u4e00\u4e2a\u81ea\u79f0\u6559\u80b2\u5c40\u526f\u5c40\u957f\u7684\u9a97\u5b50\u9a97\u8d70\u4e864000\u5143\u3002

\u201c\u4eca\u5929\u4e0b\u5348\u5979\u8def\u8fc7\u9752\u5e74\u5e7f\u573a\u65f6\uff0c\u4e00\u4e2d\u5e74\u7537\u5b50\u548c\u5979\u642d\u8baa\uff0c\u8bf4\u5979\u5f88\u50cf\u540c\u4e8b\u7684\u5973\u513f\uff0c\u8fd8\u8868\u626c\u5979\u957f\u5f97\u6f02\u4eae\uff0c\u7537\u5b50\u53c8\u95ee\u5c0f\u656c\u591a\u5927\u4e86\uff0c\u662f\u5b66\u751f\u5417\uff1f\u5728\u90a3\u4e2a\u5b66\u6821\u4e0a\u5b66\uff1f\u201d\u5c0f\u656c\u544a\u8bc9\u8b66\u65b9\uff0c\u5979\u6ca1\u6709\u9632\u5907\uff0c\u90fd\u4e00\u4e00\u56de\u7b54\uff0c\u63a5\u7740\u7537\u5b50\u8bf4\u81ea\u5df1\u662f\u6559\u80b2\u5c40\u7684\u674e\u526f\u5c40\u957f\uff0c\u7b49\u5c0f\u656c\u6bd5\u4e1a\u4e86\u53ef\u4ee5\u5e2e\u52a9\u5979\u5b89\u6392\u5de5\u4f5c\u3002

\n

\u542c\u8bf4\u53ef\u4ee5\u5b89\u6392\u5de5\u4f5c\uff0c\u5c0f\u656c\u89c9\u5f97\u81ea\u5df1\u9047\u5230\u8d35\u4eba\u4e86\uff0c\u5f7c\u6b64\u76f8\u8c08\u751a\u6b22\u3002\u6b64\u65f6\u8fd9\u540d\u674e\u526f\u5c40\u957f\u8bf4\uff0c\u4ed6\u4e0a\u5348\u521a\u5f00\u5b8c\u4f1a\u8fd9\u4f1a\u8981\u53bb\u53c2\u52a0\u8001\u5c40\u957f\u7684\u751f\u65e5\u5bb4\uff0c\u7531\u4e8e\u6ca1\u5e26\u5361\u6ca1\u6cd5\u53d6\u94b1\uff0c\u8bf7\u5c0f\u656c\u5e2e\u4ed6\u5148\u62ff\u70b9\u94b1\u3002\u201c\u4ed6\u95ee\u6211\u6709\u591a\u5c11\u94b1\uff0c\u6b63\u597d\u8eab\u4e0a\u67094000\u5143\u751f\u6d3b\u8d39\u3002\u201d\u6beb\u65e0\u9632\u5907\u7684\u5973\u5b69\u76f8\u4fe1\u4e86\u526f\u5c40\u957f\u6682\u65f6\u501f\u7528\u4f1a\u8fd8\u94b1\u7684\u8bf4\u6cd5\uff0c\u5c064000\u5143\u94b1\u5168\u90e8\u62ff\u7ed9\u4e86\u4ed6\uff0c\u770b\u7740\u5f88\u5feb\u6d88\u5931\u5728\u4eba\u7fa4\u4e2d\u7684\u526f\u5c40\u957f\uff0c\u5c0f\u656c\u624d\u5f00\u59cb\u6000\u7591\uff0c\u8d8a\u60f3\u8d8a\u4e0d\u5bf9\u52b2\uff0c\u4e8e\u662f\u7acb\u5373\u62a5\u8b66\uff0c\u76ee\u524d\u8b66\u65b9\u5df2\u5c55\u5f00\u8fdb\u4e00\u6b65\u8c03\u67e5\u3002

\u8b66\u65b9\u63d0\u9192\u5e02\u6c11\uff0c\u9a97\u5b50\u4f1a\u7279\u610f\u7784\u51c6\u90a3\u4e9b\u6d89\u4e8b\u4e0d\u6df1\uff0c\u5584\u826f\u7684\u5c0f\u5973\u5b69\u884c\u9a97\uff0c\u5b66\u6821\u548c\u5bb6\u957f\u8981\u591a\u52a0\u5f3a\u8fd9\u65b9\u9762\u7684\u6559\u80b2\uff0c\u5c0f\u5b69\u81ea\u5df1\u4e5f\u6700\u597d\u4e0d\u8981\u56de\u5e94\u964c\u751f\u4eba\u4e3b\u52a8\u642d\u8baa\u3002

", "link": "http://news.163.com/18/0119/18/D8HIR5JP0001875P.html", "title": ["\u7537\u5b50\u5192\u5145\u6559\u80b2\u5c40\u957f\u81ea\u79f0\u53ef\u5b89\u6392\u5de5\u4f5c \u5973\u5b69\u88ab\u9a974000\u5143"]}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180116/D897H80K0001899O.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D897H80K0001899O", "date": "20180116", "source": "netease", "comments": {"link": "http://comment.news.163.com/news2_bbs/D897H80K0001899O.html"}, "contents": {"title": ["\u4e60\u8fd1\u5e73\u5e94\u7ea6\u540c\u7f8e\u56fd\u603b\u7edf\u7279\u6717\u666e\u901a\u7535\u8bdd"], "link": "http://news.163.com/18/0116/12/D897H80K0001899O.html", "passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u4e60\u8fd1\u5e73\u5e94\u7ea6\u540c\u7f8e\u56fd\u603b\u7edf\u7279\u6717\u666e\u901a\u7535\u8bdd\uff09\n

\u592e\u89c6\u65b0\u95fb\u5ba2\u6237\u7aef1\u670816\u65e5\u6d88\u606f\uff0c\u56fd\u5bb6\u4e3b\u5e2d\u4e60\u8fd1\u5e7316\u65e5\u5e94\u7ea6\u540c\u7f8e\u56fd\u603b\u7edf\u7279\u6717\u666e\u901a\u7535\u8bdd\u3002

\u4e60\u8fd1\u5e73\u6307\u51fa\uff0c\u8fc7\u53bb\u7684\u4e00\u5e74\uff0c\u4e2d\u7f8e\u5173\u7cfb\u603b\u4f53\u4fdd\u6301\u7a33\u5b9a\u5e76\u53d6\u5f97\u91cd\u8981\u8fdb\u5c55\u3002\u4fdd\u6301\u4e2d\u7f8e\u5173\u7cfb\u5065\u5eb7\u7a33\u5b9a\u53d1\u5c55\uff0c\u7b26\u5408\u4e24\u56fd\u548c\u4e24\u56fd\u4eba\u6c11\u5229\u76ca\uff0c\u4e5f\u662f\u56fd\u9645\u793e\u4f1a\u5171\u540c\u671f\u5f85\u3002\u53cc\u65b9\u8981\u4fdd\u6301\u9ad8\u5c42\u53ca\u5404\u7ea7\u522b\u4ea4\u5f80\uff0c\u5145\u5206\u53d1\u63254\u4e2a\u9ad8\u7ea7\u522b\u5bf9\u8bdd\u673a\u5236\u4f5c\u7528\u5e76\u9002\u65f6\u4e3e\u529e\u7b2c\u4e8c\u8f6e\u5bf9\u8bdd\u3002\u4e2d\u7f8e\u7ecf\u8d38\u5408\u4f5c\u7ed9\u4e24\u56fd\u4eba\u6c11\u5e26\u6765\u8bb8\u591a\u5b9e\u5b9e\u5728\u5728\u7684\u5229\u76ca\u3002\u53cc\u65b9\u5e94\u8be5\u91c7\u53d6\u5efa\u8bbe\u6027\u65b9\u5f0f\uff0c\u901a\u8fc7\u5bf9\u5f7c\u6b64\u5f00\u653e\u5e02\u573a\u3001\u505a\u5927\u5408\u4f5c\u86cb\u7cd5\uff0c\u59a5\u5584\u89e3\u51b3\u53cc\u65b9\u5173\u5207\u7684\u7ecf\u8d38\u95ee\u9898\u3002\u8981\u79ef\u6781\u63a8\u8fdb\u4e24\u519b\u3001\u6267\u6cd5\u3001\u7981\u6bd2\u3001\u4eba\u6587\u3001\u5730\u65b9\u7b49\u5408\u4f5c\uff0c\u5c31\u91cd\u5927\u56fd\u9645\u548c\u5730\u533a\u95ee\u9898\u4fdd\u6301\u5bc6\u5207\u6c9f\u901a\u534f\u5546\u3002\u53cc\u65b9\u8981\u76f8\u5411\u800c\u884c\u3001\u76f8\u4e92\u5c0a\u91cd\u3001\u805a\u7126\u5408\u4f5c\uff0c\u4ee5\u5efa\u8bbe\u6027\u65b9\u5f0f\u5904\u7406\u654f\u611f\u95ee\u9898\uff0c\u5c0a\u91cd\u5f7c\u6b64\u6838\u5fc3\u5229\u76ca\u548c\u91cd\u5927\u5173\u5207\uff0c\u7ef4\u62a4\u4e2d\u7f8e\u5173\u7cfb\u5065\u5eb7\u7a33\u5b9a\u53d1\u5c55\u52bf\u5934\u3002

\n

\u7279\u6717\u666e\u8868\u793a\uff0c\u7f8e\u65b9\u9ad8\u5ea6\u91cd\u89c6\u5bf9\u534e\u5173\u7cfb\u548c\u7f8e\u4e2d\u5408\u4f5c\uff0c\u613f\u540c\u4e2d\u65b9\u4e00\u9053\uff0c\u52a0\u5f3a\u9ad8\u5c42\u53ca\u5404\u7ea7\u522b\u4ea4\u5f80\uff0c\u62d3\u5c55\u52a1\u5b9e\u9886\u57df\u5408\u4f5c\uff0c\u5904\u7406\u597d\u4e24\u56fd\u7ecf\u8d38\u4e2d\u7684\u95ee\u9898\uff0c\u63a8\u52a8\u53cc\u8fb9\u5173\u7cfb\u53d6\u5f97\u66f4\u5927\u53d1\u5c55\u3002

\u4e60\u8fd1\u5e73\u5e94\u8be2\u4ecb\u7ecd\u4e86\u5bf9\u5f53\u524d\u671d\u9c9c\u534a\u5c9b\u5c40\u52bf\u7684\u770b\u6cd5\uff0c\u6307\u51fa\u671d\u9c9c\u534a\u5c9b\u5f62\u52bf\u51fa\u73b0\u4e00\u4e9b\u79ef\u6781\u53d8\u5316\u3002\u5404\u65b9\u5e94\u8be5\u5171\u540c\u52aa\u529b\u628a\u6765\u4e4b\u4e0d\u6613\u7684\u7f13\u548c\u52bf\u5934\u5ef6\u7eed\u4e0b\u53bb\uff0c\u4e3a\u91cd\u542f\u5bf9\u8bdd\u8c08\u5224\u521b\u9020\u6761\u4ef6\u3002\u5b9e\u73b0\u671d\u9c9c\u534a\u5c9b\u65e0\u6838\u5316\uff0c\u7ef4\u62a4\u671d\u9c9c\u534a\u5c9b\u548c\u5e73\u7a33\u5b9a\u7b26\u5408\u5404\u65b9\u5171\u540c\u5229\u76ca\uff0c\u7ef4\u62a4\u56fd\u9645\u793e\u4f1a\u5728\u8fd9\u4e2a\u95ee\u9898\u4e0a\u7684\u56e2\u7ed3\u5341\u5206\u91cd\u8981\u3002\u4e2d\u65b9\u613f\u7ee7\u7eed\u540c\u5305\u62ec\u7f8e\u65b9\u5728\u5185\u7684\u56fd\u9645\u793e\u4f1a\u4e00\u9053\uff0c\u5bc6\u5207\u6c9f\u901a\u3001\u76f8\u4e92\u4fe1\u4efb\u3001\u76f8\u4e92\u5c0a\u91cd\u3001\u52a0\u5f3a\u5408\u4f5c\uff0c\u63a8\u52a8\u671d\u9c9c\u534a\u5c9b\u95ee\u9898\u671d\u7740\u59a5\u5584\u89e3\u51b3\u7684\u65b9\u5411\u4e0d\u65ad\u53d6\u5f97\u8fdb\u5c55\u3002

\u7279\u6717\u666e\u8868\u793a\uff0c\u7f8e\u65b9\u91cd\u89c6\u4e2d\u65b9\u5728\u671d\u9c9c\u534a\u5c9b\u95ee\u9898\u4e0a\u7684\u91cd\u8981\u4f5c\u7528\uff0c\u613f\u7ee7\u7eed\u52a0\u5f3a\u540c\u4e2d\u65b9\u7684\u6c9f\u901a\u534f\u8c03\u3002

"}, "cmtId": "D897H80K0001899O"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/006769.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "006769", "comments": {"link": "http://coral.qq.com/2369397397"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/006769.htm", "title": ["\u7fa4\u4f17\u53cd\u6620\u996e\u6c34\u95ee\u9898\u88ab\u603c\u201c\u4f60\u4e0d\u662f\u4eba\u6c11\u201d \u5f53\u4e8b\u793e\u533a\u4e66\u8bb0\u88ab\u514d\u804c"], "passage": "\u5468\u65ed \u622a\u5c4f\u56fe2018\u5e741\u670819\u65e5\u665a8\u65f6\u8bb8\uff0c\u6210\u90fd\u5e02\u6e29\u6c5f\u533a\u6c38\u5b81\u9547\u4eba\u6c11\u653f\u5e9c\u5b98\u65b9\u5fae\u535a\u53d1\u5e03\u6d88\u606f\u79f0\uff1a\u7ecf\u6838\u5b9e\uff0c\u5929\u738b\u793e\u533a\u515a\u603b\u652f\u4e66\u8bb0\u5468\u65ed\u5728\u63a5\u5f85\u7fa4\u4f17\u8fc7\u7a0b\u4e2d\u6001\u5ea6\u751f\u786c\uff0c\u8a00\u8bed\u4e0d\u5f53\uff0c\u9020\u6210\u8d1f\u9762\u5f71\u54cd\uff0c\u6709\u635f\u57fa\u5c42\u515a\u5458\u5e72\u90e8\u5f62\u8c61\u30021\u670819\u65e5\uff0c\u7ecf\u9547\u515a\u59d4\u7814\u7a76\uff0c\u51b3\u5b9a\u514d\u53bb\u5468\u65ed\u5929\u738b\u793e\u533a\u515a\u603b\u652f\u4e66\u8bb0\u804c\u52a1\u3002\u4e00\u6bb5\u88ab\u66dd\u5149\u7684\u89c6\u9891\u663e\u793a\uff0c\u8fd1\u65e5\uff0c\u5728\u6210\u90fd\u5e02\u6c38\u5b81\u9547\u4eba\u6c11\u653f\u5e9c\u503c\u73ed\u5ba4\u5185\uff0c\u6709\u7fa4\u4f17\u53cd\u6620\u996e\u7528\u6c34\u76f8\u5173\u95ee\u9898\uff0c\u5929\u738b\u793e\u533a\u515a\u603b\u652f\u4e66\u8bb0\u5468\u65ed\u5728\u63a5\u5f85\u8fc7\u7a0b\u4e2d\u5bf9\u7fa4\u4f17\u79f0\uff0c\u201c\u4e3a\u4eba\u6c11\u670d\u52a1\u4e0d\u662f\u4e3a\u516c\u6c11\u670d\u52a1\uff0c\u4f60\u4e0d\u662f\u4eba\u6c11\u201d\uff0c\u5f15\u53d1\u5e7f\u6cdb\u8206\u8bba\u5173\u6ce8\u3002\u89c6\u9891\u4e2d\uff0c\u5468\u65ed\u75285\u5206\u949f\u7ed9\u6765\u8bbf\u7fa4\u4f17\u8bb2\u89e3\u201c\u516c\u6c11\u201d\u4e0e\u201c\u4eba\u6c11\u201d\u7684\u533a\u522b\uff0c\u4e0d\u65f6\u7fd8\u7740\u4e8c\u90ce\u817f\uff0c\u6001\u5ea6\u968f\u610f\uff0c\u5e76\u79f0\u201c\u4f60\u76d1\u7763\u4e0d\u5230\u6211\u201d\u30021\u670819\u65e5\u665a\uff0c\u5f53\u4e8b\u4eba\u9648\u5973\u58eb\u544a\u8bc9\u6f8e\u6e43\u65b0\u95fb\u8bb0\u8005\uff0c\u7531\u4e8e\u5de5\u7a0b\u65bd\u5de5\uff0c\u5979\u6240\u5c45\u4f4f\u7684\u5730\u65b9\u51e0\u5e74\u524d\u5730\u4e0b\u6c34\u67af\u7aed\uff0c\u540e\u7531\u793e\u533a\u534f\u8c03\u9001\u6c34\u89e3\u51b3\u65e5\u5e38\u7528\u6c34\u30022017\u5e7412\u670831\u65e5\uff0c\u5979\u8ba1\u5212\u5f53\u65e5\u5728\u5bb6\u4e3e\u529e\u751f\u65e5\u5bb4\u5e2d\uff0c\u5e76\u63d0\u524d\u4e24\u5929\u5411\u793e\u533a\u63d0\u51fa\u7528\u6c34\u7533\u8bf7\uff0c\u4f4612\u670830\u65e5\u4e2d\u5348\uff0c\u996e\u6c34\u4ecd\u6ca1\u6709\u9001\u5230\u3002\u201c31\u53f7\u65e9\u4e0a5\u70b9\u53a8\u5e08\u5c31\u8981\u8fc7\u6765\uff0c\u6ca1\u529e\u6cd5\u53ea\u80fd\u53c8\u8dd1\u8fc7\u53bb\u53cd\u6620\u60c5\u51b5\u3002\u201d\u9648\u5973\u58eb\u8bf4\uff0c\u5979\u548c\u5bb6\u4eba\u5148\u5230\u5929\u738b\u793e\u533a\uff0c\u540e\u53c8\u5230\u6e29\u6c5f\u533a\u653f\u5e9c\uff0c\u4e00\u76f4\u7b49\u523031\u65e5\u51cc\u6668\uff0c\u88ab\u544a\u77e5\u793e\u533a\u5c06\u5b89\u6392\u4eba\u5904\u7406\uff0c\u8ba9\u5979\u4eec\u5230\u6c38\u5b81\u9547\u4eba\u6c11\u653f\u5e9c\u53bb\u7b49\u5f85\u3002\u5230\u8fbe\u6c38\u5b81\u9547\u4eba\u6c11\u653f\u5e9c\u540e\uff0c\u5929\u738b\u793e\u533a\u515a\u603b\u652f\u4e66\u8bb0\u5468\u65ed\u63a5\u5f85\u4e86\u5979\u4eec\u3002\u201c\u4e0
d\u662f\u6765\u89e3\u51b3\u95ee\u9898\uff0c\u4e00\u5f00\u59cb\u5c31\u7ed9\u6211\u4eec\u2018\u666e\u6cd5\u2019\uff0c\u8bf4\u6211\u4eec\u4e0d\u662f\u4eba\u6c11\u3002\u201d\u9648\u5973\u58eb\u79f0\uff0c\u996e\u6c34\u6700\u7ec8\u6ca1\u6709\u9001\u6765\uff0c\u5979\u53ea\u597d\u8ba9\u4eb2\u4eba\u5e2e\u5fd9\u81ea\u5df1\u8fd0\u6c34\u8fc7\u6765\uff0c\u53c8\u4e70\u4e86\u4e9b\u6876\u88c5\u6c34\u56de\u6765\u3002\u56e0\u4e3a\u6c34\u4e0d\u591f\u7528\uff0c\u539f\u8ba1\u5212\u8bf7\u5ba220\u684c\uff0c\u6700\u540e\u53ea\u529e\u4e8613\u684c\u3002\u89c6\u9891\u66dd\u5149\u540e\uff0c\u5f15\u53d1\u5e7f\u6cdb\u70ed\u8bae\u3002 \u9648\u5973\u58eb\u544a\u8bc9\u6f8e\u6e43\u65b0\u95fb\u8bb0\u8005\uff0c\u5f53\u5730\u653f\u5e9c\u4e5f\u76f8\u5f53\u91cd\u89c6\uff0c19\u65e5\u4e0b\u53482\u65f6\u8bb8\uff0c\u6e29\u6c5f\u533a\u7eaa\u59d4\u76d1\u5bdf\u5c40\u7684\u5de5\u4f5c\u4eba\u5458\u8054\u7cfb\u5979\uff0c\u5c31\u4e8b\u60c5\u7684\u7ecf\u8fc7\u8fdb\u884c\u4e86\u8be2\u95ee\uff0c\u5e76\u505a\u4e86\u7b14\u5f55\uff0c\u8be2\u95ee\u6301\u7eed\u4e863\u4e2a\u591a\u5c0f\u65f6\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/010301.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "010301", "comments": {"link": "http://coral.qq.com/2369810132"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/010301.htm", "title": ["\u7f8e\u8230\u8fdb\u5165\u4e2d\u56fd\u5357\u6d77\u9ec4\u5ca9\u5c9b\u9644\u8fd1\u9886\u6d77 \u5916\u4ea4\u90e8\u3001\u56fd\u9632\u90e8\u5f3a\u786c\u8868\u6001"], "passage": "\u65b0\u534e\u793e\u5317\u4eac1\u670820\u65e5\u6d88\u606f\uff0c\u5916\u4ea4\u90e8\u53d1\u8a00\u4eba\u9646\u6177\u5f53\u65e5\u5c31\u7f8e\u56fd\u4e00\u8258\u5bfc\u5f39\u9a71\u9010\u8230\u8fdb\u5165\u9ec4\u5ca9\u5c9b12\u6d77\u91cc\u5185\u6d77\u57df\u7b54\u8bb0\u8005\u95ee\u65f6\u8868\u793a\uff0c\u4e2d\u56fd\u6d77\u519b\u4f9d\u6cd5\u5bf9\u7f8e\u8230\u8fdb\u884c\u4e86\u8bc6\u522b\u67e5\u8bc1\uff0c\u4e88\u4ee5\u8b66\u544a\u9a71\u79bb\u3002\u6709\u8bb0\u8005\u95ee\uff1a\u636e\u4e86\u89e3\uff0c1\u670817\u65e5\u665a\uff0c\u7f8e\u56fd\u201c\u970d\u73c0\u201d\u53f7\u5bfc\u5f39\u9a71\u9010\u8230\u4ece\u9ec4\u5ca9\u5c9b\u897f\u5357\u4fa7\u8fdb\u5165\u8be5\u5c9b12\u6d77\u91cc\u8303\u56f4\u3002\u4e2d\u65b9\u5bf9\u6b64\u6709\u4f55\u8bc4\u8bba\uff1f\u9646\u6177\u8bf4\uff0c1\u670817\u65e5\u665a\uff0c\u7f8e\u56fd\u201c\u970d\u73c0\u201d\u53f7\u5bfc\u5f39\u9a71\u9010\u8230\u672a\u7ecf\u4e2d\u56fd\u653f\u5e9c\u5141\u8bb8\uff0c\u64c5\u81ea\u8fdb\u5165\u4e2d\u56fd\u9ec4\u5ca9\u5c9b12\u6d77\u91cc\u5185\u6d77\u57df\u3002\u4e2d\u56fd\u6d77\u519b\u4f9d\u6cd5\u5bf9\u7f8e\u8230\u8fdb\u884c\u4e86\u8bc6\u522b\u67e5\u8bc1\uff0c\u4e88\u4ee5\u8b66\u544a\u9a71\u79bb\u3002\u9646\u6177\u8868\u793a\uff0c\u7f8e\u65b9\u519b\u8230\u6709\u5173\u884c\u4e3a\u635f\u5bb3\u4e2d\u56fd\u7684\u4e3b\u6743\u548c\u5b89\u5168\u5229\u76ca\uff0c\u5bf9\u4e2d\u65b9\u5728\u6709\u5173\u6d77\u57df\u5f00\u5c55\u6b63\u5e38\u516c\u52a1\u6d3b\u52a8\u7684\u8239\u53ea\u548c\u4eba\u5458\u5b89\u5168\u9020\u6210\u4e25\u91cd\u5a01\u80c1\uff0c\u8fdd\u80cc\u56fd\u9645\u5173\u7cfb\u57fa\u672c\u51c6\u5219\u3002\u4e2d\u65b9\u5bf9\u6b64\u8868\u793a\u5f3a\u70c8\u4e0d\u6ee1\uff0c\u5c06\u91c7\u53d6\u5fc5\u8981\u63aa\u65bd\uff0c\u575a\u5b9a\u7ef4\u62a4\u4e2d\u56fd\u4e3b\u6743\u3002\u9646\u6177\u8868\u793a\uff0c\u4e2d\u56fd\u5bf9\u9ec4\u5ca9\u5c9b\u53ca\u5176\u9644\u8fd1\u6d77\u57df\u62e5\u6709\u65e0\u53ef\u4e89\u8fa9\u7684\u4e3b\u6743\u3002\u4e2d\u65b9
\u4e00\u5411\u5c0a\u91cd\u548c\u7ef4\u62a4\u5404\u56fd\u4f9d\u636e\u56fd\u9645\u6cd5\u5728\u5357\u6d77\u4eab\u6709\u7684\u822a\u884c\u548c\u98de\u8d8a\u81ea\u7531\uff0c\u4f46\u575a\u51b3\u53cd\u5bf9\u4efb\u4f55\u56fd\u5bb6\u4ee5\u822a\u884c\u548c\u98de\u8d8a\u81ea\u7531\u4e3a\u540d\uff0c\u635f\u5bb3\u4e2d\u56fd\u7684\u4e3b\u6743\u548c\u5b89\u5168\u5229\u76ca\u3002\u201c\u6211\u4eec\u5f3a\u70c8\u6566\u4fc3\u7f8e\u65b9\u7acb\u5373\u7ea0\u6b63\u9519\u8bef\uff0c\u505c\u6b62\u6b64\u7c7b\u6311\u8845\u884c\u4e3a\uff0c\u4ee5\u514d\u635f\u5bb3\u4e2d\u7f8e\u5173\u7cfb\u548c\u5730\u533a\u548c\u5e73\u7a33\u5b9a\u3002\u201d\u56fd\u9632\u90e8\u7f511\u670820\u65e5\u6d88\u606f\uff0c1\u670817\u65e5\uff0c\u7f8e\u56fd\u6d77\u519b\u201c\u970d\u73c0\u201d\u53f7\u5bfc\u5f39\u9a71\u9010\u8230\u64c5\u81ea\u8fdb\u5165\u4e2d\u56fd\u9ec4\u5ca9\u5c9b\u90bb\u8fd1\u6d77\u57df\uff0c\u4e2d\u56fd\u6d77\u519b\u201c\u9ec4\u5c71\u201d\u53f7\u5bfc\u5f39\u62a4\u536b\u8230\u5f53\u5373\u884c\u52a8\uff0c\u5bf9\u7f8e\u8230\u8fdb\u884c\u8bc6\u522b\u67e5\u8bc1\uff0c\u5e76\u4e88\u4ee5\u8b66\u544a\u9a71\u79bb\u3002\u5f53\u524d\uff0c\u5728\u4e2d\u56fd\u548c\u4e1c\u76df\u56fd\u5bb6\u7684\u5171\u540c\u52aa\u529b\u4e0b\uff0c\u5357\u6d77\u5c40\u52bf\u4e0d\u65ad\u8d8b\u7a33\u5411\u597d\u3002\u5728\u6b64\u5f62\u52bf\u4e0b\uff0c\u7f8e\u65b9\u4e00\u518d\u6d3e\u9063\u519b\u8230\u975e\u6cd5\u8fdb\u5165\u4e2d\u56fd\u5357\u6d77\u5c9b\u7901\u90bb\u8fd1\u6d77\u57df\uff0c\u5371\u53ca\u53cc\u65b9\u8230\u673a\u548c\u4eba\u5458\u5b89\u5168\uff0c\u5a01\u80c1\u4e2d\u56fd\u4e3b\u6743\u548c\u5b89\u5168\uff0c\u7834\u574f\u5730\u533a\u548c\u5e73\u7a33\u5b9a\uff0c\u4e0e\u4e24\u56fd\u4e24\u519b\u5173\u7cfb\u7a33\u5b9a\u53d1\u5c55\u7684\u52bf\u5934\u80cc\u9053\u800c\u9a70\u3002\u6211\u4eec\u5e0c\u671b\u7f8e\u65b9\u5c0a\u91cd\u4e2d\u65b9\u4e3b\u6743\uff0c\u5c0a\u91cd\u57df\u5185\u56fd\u5bb6\u7684\u52aa\u529b\uff0c\u4e0d\u8981\u65e0\u4e8b\u751f\u975e\uff0c\u5174\u98ce\u4f5c\u6d6a\u3002\u4e2d\u56fd\u519b\u961f\u5c06\u7ee7\u7eed\u5c65\u884c\u9632\u536b\u804c\u8d23\uff0c\u52a0\u5927\u6d77\u7a7a\u5de1\u903b\u8b66\u6212\u529b\u5ea6\uff0c\u575a\u5b9a\u634d\u536b\u56fd\u5bb6\u7684\u4e3b\u6743\u548c\u5b89\u5168\uff0c\u575a\u5b9a\u7ef4\u62a4\u5730\u533a\u548c\u5e73\u7a33\u5b9a\u3002"}} -------------------------------------------------------------------------------- /机器学习入门/标签传播算法(LP).py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 11:28 4 | # @Author : mazicwong 5 | # @File : 标签传播算法(LP).py 6 | import time 7 | import math 8 | import numpy as np 9 | from label_propagation import labelPropagation 10 | 11 | 12 | # show 13 | def show(Mat_Label, labels, Mat_Unlabel, unlabel_data_labels): 14 | import matplotlib.pyplot as plt 15 | 16 | for i in range(Mat_Label.shape[0]): 17 | if int(labels[i]) == 0: 18 | plt.plot(Mat_Label[i, 0], Mat_Label[i, 1], 'Dr') 19 | elif int(labels[i]) == 1: 20 | plt.plot(Mat_Label[i, 0], Mat_Label[i, 1], 'Db') 21 | else: 22 | plt.plot(Mat_Label[i, 0], Mat_Label[i, 1], 'Dy') 23 | 24 | for i in range(Mat_Unlabel.shape[0]): 25 | if int(unlabel_data_labels[i]) == 0: 26 | plt.plot(Mat_Unlabel[i, 0], Mat_Unlabel[i, 1], 'or') 27 | elif int(unlabel_data_labels[i]) == 1: 28 | plt.plot(Mat_Unlabel[i, 0], Mat_Unlabel[i, 1], 'ob') 29 | else: 30 | plt.plot(Mat_Unlabel[i, 0], Mat_Unlabel[i, 1], 'oy') 31 | 32 | plt.xlabel('X1'); 33 | plt.ylabel('X2') 34 | plt.xlim(0.0, 12.) 35 | plt.ylim(0.0, 12.) 
36 | plt.show() 37 | 38 | 39 | def loadCircleData(num_data): 40 | center = np.array([5.0, 5.0]) 41 | radiu_inner = 2 42 | radiu_outer = 4 43 | num_inner = num_data // 3 44 | num_outer = num_data - num_inner 45 | 46 | data = [] 47 | theta = 0.0 48 | for i in range(int(num_inner)): 49 | pho = (theta % 360) * math.pi / 180 50 | tmp = np.zeros(2, np.float32) 51 | tmp[0] = radiu_inner * math.cos(pho) + np.random.rand(1) + center[0] 52 | tmp[1] = radiu_inner * math.sin(pho) + np.random.rand(1) + center[1] 53 | data.append(tmp) 54 | theta += 2 55 | 56 | theta = 0.0 57 | for i in range(int(num_outer)): 58 | pho = (theta % 360) * math.pi / 180 59 | tmp = np.zeros(2, np.float32) 60 | tmp[0] = radiu_outer * math.cos(pho) + np.random.rand(1) + center[0] 61 | tmp[1] = radiu_outer * math.sin(pho) + np.random.rand(1) + center[1] 62 | data.append(tmp) 63 | theta += 1 64 | 65 | Mat_Label = np.zeros((2, 2), np.float32) 66 | Mat_Label[0] = center + np.array([-radiu_inner + 0.5, 0]) 67 | Mat_Label[1] = center + np.array([-radiu_outer + 0.5, 0]) 68 | labels = [0, 1] 69 | Mat_Unlabel = np.vstack(data) 70 | return Mat_Label, labels, Mat_Unlabel 71 | 72 | 73 | def loadBandData(num_unlabel_samples): 74 | # Mat_Label = np.array([[5.0, 2.], [5.0, 8.0]]) 75 | # labels = [0, 1] 76 | # Mat_Unlabel = np.array([[5.1, 2.], [5.0, 8.1]]) 77 | 78 | Mat_Label = np.array([[5.0, 2.], [5.0, 8.0]]) 79 | labels = [0, 1] 80 | num_dim = Mat_Label.shape[1] 81 | Mat_Unlabel = np.zeros((num_unlabel_samples, num_dim), np.float32) 82 | Mat_Unlabel[:num_unlabel_samples // 2, :] = (np.random.rand(num_unlabel_samples // 2, num_dim) - 0.5) * np.array( 83 | [3, 1]) + Mat_Label[0] 84 | Mat_Unlabel[num_unlabel_samples // 2: num_unlabel_samples, :] = (np.random.rand(num_unlabel_samples // 2, 85 | num_dim) - 0.5) * np.array([3, 1]) + \ 86 | Mat_Label[1] 87 | return Mat_Label, labels, Mat_Unlabel 88 | 89 | 90 | # main function 91 | if __name__ == "__main__": 92 | num_unlabel_samples = 800 93 | # Mat_Label, labels, Mat_Unlabel = loadBandData(num_unlabel_samples) 94 | Mat_Label, labels, Mat_Unlabel = loadCircleData(num_unlabel_samples) 95 | 96 | ## Notice: when using 'rbf' as the kernel, the choice of the hyperparameter 'sigma' is very important! It should be 97 | ## chosen according to your dataset, specifically the typical distance between two data points: it should ensure that 98 | ## each point has roughly 10 effective neighbours (i.e. w_i,j is large enough). It also influences the speed of 99 | ## convergence, so the 'knn' kernel may be the better choice! 100 | # unlabel_data_labels = labelPropagation(Mat_Label, Mat_Unlabel, labels, kernel_type = 'rbf', rbf_sigma = 0.2) 101 | unlabel_data_labels = labelPropagation(Mat_Label, Mat_Unlabel, labels, kernel_type='knn', knn_num_neighbors=10, 102 | max_iter=400) 103 | show(Mat_Label, labels, Mat_Unlabel, unlabel_data_labels) -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HJ6VRF0001875O.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HJ6VRF0001875O", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news_guoji2_bbs/D8HJ6VRF0001875O.html"}, "newsId": "D8HJ6VRF0001875O", "contents": {"passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u5370\u5a92\u79f0\u4e2d\u56fd\u5728\u6d1e\u6717\u9644\u8fd1\u4fee\u5efa\u5e9e\u5927\u519b\u4e8b\u8bbe\u65bd \u4e2d\u65b9\u56de\u5e94\uff09\n

\u6d77\u5916\u7f511\u670819\u65e5\u7535\u00a0\u5916\u4ea4\u90e8\u53d1\u8a00\u4eba\u9646\u617719\u65e5\u4e3b\u6301\u4f8b\u884c\u8bb0\u8005\u4f1a\uff0c\u5c31\u8fd1\u671f\u70ed\u70b9\u8fdb\u884c\u56de\u5e94\u3002\u76f8\u5173\u5185\u5bb9\u5982\u4e0b\uff1a

\u95ee\uff1a\u5a92\u4f53\u62ab\u9732\u7684\u6700\u65b0\u536b\u661f\u56fe\u50cf\u663e\u793a\uff0c\u4e2d\u56fd\u6b63\u5728\u8ddd\u6d1e\u6717\u5bf9\u5cd9\u53d1\u751f\u5730\u5f88\u8fd1\u7684\u5730\u65b9\u4fee\u5efa\u5e9e\u5927\u7684\u519b\u4e8b\u8bbe\u65bd\u3002\u5370\u5ea6\u5916\u4ea4\u90e8\u53d1\u8868\u58f0\u660e\u91cd\u7533\u8be5\u8bbe\u65bd\u5e76\u4e0d\u5728\u5bf9\u5cd9\u5730\u533a\u3002\u4f46\u8fd9\u5728\u5370\u5ea6\u653f\u515a\u4e2d\u5f15\u53d1\u4e86\u62c5\u5fe7\u3002\u5370\u5ea6\u5916\u4ea4\u90e8\u8fd8\u79f0\u201c\u6b64\u524d\u5bf9\u5cd9\u5730\u70b9\u7684\u73b0\u72b6\u5e76\u672a\u53d1\u751f\u6539\u53d8\u201d\u3002\u4e2d\u65b9\u5bf9\u6709\u5173\u62a5\u9053\u6709\u4f55\u8bc4\u8bba\uff1f

\u7b54\uff1a\u6211\u521a\u521a\u6ce8\u610f\u5230\u6709\u5173\u62a5\u9053\uff0c\u4e0d\u4e86\u89e3\u5177\u4f53\u60c5\u51b5\uff0c\u4e5f\u4e0d\u6e05\u695a\u4f60\u6240\u8bf4\u7684\u536b\u661f\u56fe\u50cf\u6765\u6e90\u3002

\u76f8\u4fe1\u4f60\u975e\u5e38\u6e05\u695a\u4e2d\u65b9\u5728\u6d1e\u6717\u95ee\u9898\u4e0a\u7684\u7acb\u573a\u3002\u6d1e\u6717\u5730\u533a\u5386\u6765\u5c5e\u4e8e\u4e2d\u56fd\uff0c\u4e00\u76f4\u5728\u4e2d\u56fd\u6709\u6548\u7ba1\u8f96\u4e4b\u4e0b\uff0c\u4e0d\u5b58\u5728\u4e89\u8bae\u3002\u4e3a\u4e86\u5b88\u8fb9\u9700\u8981\u548c\u6539\u5584\u5f53\u5730\u519b\u6c11\u7684\u751f\u4ea7\u751f\u6d3b\u6761\u4ef6\uff0c\u4e2d\u65b9\u957f\u671f\u4ee5\u6765\u4e00\u76f4\u5728\u6d1e\u6717\u5730\u533a\u8fdb\u884c\u5305\u62ec\u9053\u8def\u5728\u5185\u7684\u57fa\u7840\u8bbe\u65bd\u5efa\u8bbe\uff0c\u8fd9\u662f\u4e2d\u65b9\u5728\u81ea\u5df1\u9886\u571f\u4e0a\u7684\u4e3b\u6743\u884c\u4e3a\uff0c\u5b8c\u5168\u6b63\u5f53\u5408\u6cd5\u3002\u6b63\u5982\u4e2d\u65b9\u4e0d\u4f1a\u5bf9\u5370\u65b9\u5728\u5370\u5ea6\u9886\u571f\u4e0a\u7684\u5efa\u8bbe\u6d3b\u52a8\u54c1\u5934\u8bba\u8db3\u4e00\u6837\uff0c\u5176\u4ed6\u56fd\u5bb6\u5bf9\u4e2d\u56fd\u5728\u81ea\u5df1\u9886\u571f\u4e0a\u7684\u4e3b\u6743\u884c\u4e3a\u54c1\u5934\u8bba\u8db3\u4e5f\u662f\u4e0d\u5408\u9002\u7684\u3002

\n

\u95ee\uff1a\u8003\u8651\u5230\u8fd9\u4e2a\u62a5\u9053\u8868\u8fbe\u4e86\u5bf9\u6d1e\u6717\u5730\u533a\u518d\u6b21\u53d1\u751f\u5bf9\u5cd9\u7684\u62c5\u5fe7\u3002\u53bb\u5e74\u7684\u5bf9\u5cd9\u4e8b\u4ef6\u5e94\u8be5\u5df2\u7ecf\u5f97\u5230\u4e86\u89e3\u51b3\uff0c\u4f60\u8ba4\u4e3a\u4f1a\u518d\u6b21\u53d1\u751f\u7c7b\u4f3c\u4e8b\u4ef6\u5417\uff1f

\u7b54\uff1a\u6709\u5173\u5370\u5ea6\u8fb9\u9632\u90e8\u961f\u8d8a\u754c\u9020\u6210\u7684\u6d1e\u6717\u5bf9\u5cd9\u4e8b\u4ef6\uff0c\u524d\u4e24\u5929\u6211\u5df2\u7ecf\u8bf4\u8fc7\uff0c\u5370\u5ea6\u519b\u65b9\u7684\u9ad8\u5b98\u4e5f\u627f\u8ba4\u662f\u5370\u5ea6\u519b\u961f\u8d8a\u754c\u4e86\u3002\u8fd9\u4e00\u4e8b\u4ef6\u4f7f\u4e2d\u5370\u53cc\u8fb9\u5173\u7cfb\u7ecf\u53d7\u4e86\u4e25\u5cfb\u8003\u9a8c\u3002\u6211\u4eec\u5e0c\u671b\u5370\u5ea6\u519b\u65b9\u80fd\u591f\u6c72\u53d6\u6559\u8bad\uff0c\u907f\u514d\u7c7b\u4f3c\u4e8b\u60c5\u518d\u6b21\u53d1\u751f\u3002\u4e2d\u5370\u4e24\u56fd\u9886\u5bfc\u4eba\u5728\u53bb\u5e749\u6708\u91d1\u7816\u56fd\u5bb6\u9886\u5bfc\u4eba\u53a6\u95e8\u4f1a\u6664\u671f\u95f4\uff0c\u5df2\u7ecf\u5c31\u5982\u4f55\u5728\u65b0\u5f62\u52bf\u4e0b\u8fdb\u4e00\u6b65\u6539\u5584\u548c\u53d1\u5c55\u4e2d\u5370\u5173\u7cfb\u8fbe\u6210\u4e86\u91cd\u8981\u5171\u8bc6\u3002\u6211\u4eec\u5e0c\u671b\u5370\u5ea6\u6709\u5173\u65b9\u9762\u80fd\u5207\u5b9e\u9075\u7167\u4e24\u56fd\u9886\u5bfc\u4eba\u8fbe\u6210\u7684\u91cd\u8981\u5171\u8bc6\uff0c\u540c\u4e2d\u65b9\u76f8\u5411\u800c\u884c\uff0c\u5171\u540c\u7ef4\u62a4\u8fb9\u5883\u5730\u533a\u7684\u548c\u5e73\u7a33\u5b9a\uff0c\u5171\u540c\u81f4\u529b\u4e8e\u4e2d\u5370\u5173\u7cfb\u7684\u6539\u5584\u53d1\u5c55\u3002

", "link": "http://news.163.com/18/0119/18/D8HJ6VRF0001875O.html", "title": ["\u5370\u5a92\u79f0\u4e2d\u56fd\u5728\u6d1e\u6717\u9644\u8fd1\u4fee\u5efa\u5e9e\u5927\u519b\u4e8b\u8bbe\u65bd \u4e2d\u65b9\u56de\u5e94"]}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/009612.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "009612", "comments": {"link": "http://coral.qq.com/2369744788"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/009612.htm", "title": ["\u592e\u884c\u53d1\u5e03\u6539\u8fdb\u4e2a\u4eba\u94f6\u884c\u8d26\u6237\u5206\u7c7b\u7ba1\u7406\u901a\u77e5 \u2161\u3001\u2162\u7c7b\u6237\u5f00\u6237\u5c06\u66f4\u4fbf\u6377"], "passage": "\u592e\u5e7f\u7f51\u5317\u4eac1\u670820\u65e5\u6d88\u606f\uff08\u8bb0\u8005\u67f4\u534e\uff09\u636e\u4e2d\u56fd\u4e4b\u58f0\u300a\u592e\u5e7f\u65b0\u95fb\u300b\u62a5\u9053\uff0c\u6628\u5929\uff0819\u65e5\uff09\u665a\u95f4\uff0c\u592e\u884c\u5b98\u7f51\u53d1\u5e03\u300a\u5173\u4e8e\u6539\u8fdb\u4e2a\u4eba\u94f6\u884c\u8d26\u6237\u5206\u7c7b\u7ba1\u7406\u6709\u5173\u4e8b\u9879\u7684\u901a\u77e5\u300b\uff0c\u5ba3\u5e03\u8fdb\u4e00\u6b65\u53d1\u6325\u2162\u7c7b\u6237\u5728\u5c0f\u989d\u652f\u4ed8\u9886\u57df\u7684\u4f5c\u7528\uff0c\u63a8\u52a8\u2161\u3001\u2162\u7c7b\u6237\u6210\u4e3a\u4e2a\u4eba\u529e\u7406\u7f51\u4e0a\u652f\u4ed8\u3001\u79fb\u52a8\u652f\u4ed8\u7b49\u5c0f\u989d\u6d88\u8d39\u4e1a\u52a1\u7684\u4e3b\u8981\u6e20\u9053\u3002\u6839\u636e\u300a\u901a\u77e5\u300b\u548c\u7b54\u8bb0\u8005\u95ee\u7684\u89e3\u91ca\uff0c\u4e00\u662f\u5f00\u6237\u6e20\u9053\u591a\u6837\u3002\u300a\u901a\u77e5\u300b\u8981\u6c42\u56fd\u6709\u5546\u4e1a\u94f6\u884c\u3001\u80a1\u4efd\u5236\u5546\u4e1a\u94f6\u884c\u7b49\u5e94\u4e8e2018\u5e746\u6708\u5e95\u524d\u5b9e\u73b0\u672c\u94f6\u884c\u67dc\u9762\u548c\u7f51\u4e0a\u94f6\u884c\u3001\u624b\u673a\u94f6\u884c\u3001\u76f4\u9500\u94f6\u884c\u3001\u8fdc\u7a0b\u89c6\u9891\u67dc\u5458\u673a\u548c\u667a\u80fd\u67dc\u5458\u673a\u7b49\u7535\u5b50\u6e20\u9053\u529e\u7406\u4e2a\u4eba\u2161\u3001\u2162\u7c7b\u6237\u5f00\u7acb\u7b49\u4e1a\u52a1\uff0c\u5176\u4ed6\u94f6\u884c\u5219\u5e94\u57282018\u5e74\u5e95\u524d\u5b9e\u73b0\u3002\u4e8c\u662f\u5f00\u6237\u624b\u7eed\u7b80\u5316\u3002\u300a\u901a\u77e5\u300b\u660e\u786e\u4e00\u5b9a\u524d\u63d0\u4e0b\u5f00\u7acb\u2161\u3001\u2162\u7c7b\u6237\u65f6\u65e0\u9700\u4e2a\u4eba\u586b\u5199\u8eab\u4efd\u4fe1\u606f\u3001\u51fa\u793a\u8eab\u4efd\u8bc1\u4ef6\u7b49\uff0c\u5728\u6709\u6548\u843d\u5b9e\u8d26\u6237\u5b9e\u540d\u5236\u8981\u6c42\u7684\u540c\u65f6\uff0c\u5927\u5e45\u63d0\u5347\u5f00\u6237\u4f53\u9a8c\u3002\u5176\u6b21\uff0c\u5728\u8d26\u6237\u4f7f\u7528\u65b9\u9762\uff0c\u5728\u6ee1\u8db3\u53cd\u6d17\u94b1\u3001\u53cd\u8bc8\u9a97\u8981\u6c42\u7684\u524d\u63d0\u4e0b\uff0c\u653e\u5bbd\u2162\u7c7b\u6237\u7684\u4f7f\u7528\u9650\u5236\u3002\u4e00\u662f\u975e\u9762\u5bf9\u9762\u7ebf\u4e0a\u5f00\u7acb\u2162\u7c7b\u6237\u80fd\u591f\u63a5\u53d7\u975e\u7ed1\u5b9a\u8d26\u6237\u5165\u91d1\uff0c\u4ee5\u6ee1\u8db3\u4e2a\u4eba\u4e4b\u95f4\u5c0f\u989d\u6536\u4ed8\u6b3e\u3001\u53d1\u653e\u7ea2\u5305\u3001\u4e0e\u4e2a\u4eba\u652f\u4ed8\u8d26\u6237\u5bf9\u63a5\u3001\u94f6\u884c\u6216\u5546\u6237\u5c0f\u989d\u8fd4\u73b0\u5956\u52b1\u7b49\u573a\u666f\u9700\u6c42\u3002\u4e8c\u662f\u2162\u7c7b\u6237\u8d26\u6237\u4f59\u989d\u4ece1000\u5143\u63d0\u5347\u4e3a2000\u5143\u3002\u4e09\u662f\u5141\u8bb8\u94f6\u884c\u5411\u2162\u7c7b\u6237\u53d1\
u653e\u672c\u884c\u5c0f\u989d\u6d88\u8d39\u8d37\u6b3e\u5e76\u901a\u8fc7\u2162\u7c7b\u6237\u8fd8\u6b3e\uff0c\u9f13\u52b1\u94f6\u884c\u57fa\u4e8e\u2162\u7c7b\u6237\u63d0\u4f9b\u66f4\u591a\u5143\u5316\u7684\u4ea7\u54c1\u8bbe\u8ba1\u548c\u529f\u80fd\u7ec4\u5408\u3002\u592e\u884c\u8868\u793a\uff0c\u300a\u901a\u77e5\u300b\u91c7\u53d6\u4e86\u591a\u79cd\u5b89\u5168\u9632\u8303\u63aa\u65bd\u3002\u4e00\u662f\u5c06\u2162\u7c7b\u6237\u6d88\u8d39\u548c\u7f34\u8d39\u652f\u4ed8\u3001\u975e\u7ed1\u5b9a\u8d26\u6237\u8d44\u91d1\u8f6c\u51fa\u7b49\u51fa\u91d1\u7684\u65e5\u7d2f\u8ba1\u9650\u989d\u4ece\u539f5000\u5143\u4e0b\u8c03\u81f32000\u5143\uff0c\u5e74\u7d2f\u8ba1\u9650\u989d\u4ece\u539f10\u4e07\u5143\u4e0b\u8c03\u4e3a5\u4e07\u5143\uff0c\u901a\u8fc7\u63a7\u5236\u2162\u7c7b\u6237\u652f\u51fa\u989d\u5ea6\uff0c\u786e\u4fdd\u98ce\u9669\u76f8\u5bf9\u53ef\u63a7\u3002\u4e8c\u662f\u89c4\u5b9a\u975e\u9762\u5bf9\u9762\u7ebf\u4e0a\u5f00\u7acb\u7684\u2162\u7c7b\u6237\u901a\u8fc7\u7ed1\u5b9a\u8d26\u6237\u5165\u91d1\u540e\uff0c\u624d\u53ef\u63a5\u53d7\u975e\u7ed1\u5b9a\u8d26\u6237\u5165\u91d1\uff0c\u9632\u8303\u4e0d\u6cd5\u5206\u5b50\u901a\u8fc7\u83b7\u53d6\u4ed6\u4eba\u8eab\u4efd\u4fe1\u606f\u548c\u94f6\u884c\u8d26\u6237\u4fe1\u606f\u540e\u5192\u540d\u5f00\u7acb\u3002\u4e09\u662f\u89c4\u5b9a\u540c\u4e00\u5bb6\u94f6\u884c\u901a\u8fc7\u7ebf\u4e0a\u4e3a\u540c\u4e00\u4e2a\u4eba\u53ea\u80fd\u5f00\u7acb\u4e00\u4e2a\u5141\u8bb8\u975e\u7ed1\u5b9a\u8d26\u6237\u5165\u91d1\u7684\u2162\u7c7b\u6237\uff0c\u9632\u6b62\u4e0d\u6cd5\u5206\u5b50\u901a\u8fc7\u5f00\u7acb\u591a\u4e2a\u6b64\u7c7b\u8d26\u6237\u53d8\u76f8\u6269\u5927\u2162\u7c7b\u6237\u7684\u8f6c\u8d26\u9650\u989d\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20171009/039986.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "039986", "comments": {"link": "http://coral.qq.com/2166352744"}, "date": "20171009", "contents": {"link": "https://news.qq.com/a/20171009/039986.htm", "title": ["\u8bbe\u8ba1\u7f8e\u5b66\uff0c\u8ba9\u6b27\u7c73\u8304\u6d77\u9a6cAqua Terra\u8155\u8868\u7115\u7136\u4e00\u65b0"], "passage": "[]\u6b27\u7c73\u8304\u63a8\u51fa\u6d77\u9a6c\u7cfb\u5217Aqua Terra\u5168\u65b0\u8868\u6b3e\uff0c\u5728\u5907\u53d7\u6b22\u8fce\u7684\u8bbe\u8ba1\u57fa\u7840\u4e0a\uff0c\u878d\u5165\u5de7\u5999\u9769\u65b0\uff0c\u8d4b\u4e88\u65f6\u8ba1\u5168\u65b0\u5916\u89c2\u3002\u6b27\u7c73\u8304\u5168\u65b0\u53d1\u5e03\u7684\u6d77\u9a6c\u7cfb\u5217 Aqua Terra \u81f3\u81fb\u5929\u6587\u53f0\u8868\uff0c\u7b80\u7ea6\u3001\u5927\u6c14\uff0c\u5448\u73b0\u5e73\u8861\u4e4b\u7f8e\u3002\u8868\u6b3e\u5728\u5907\u53d7\u6b22\u8fce\u7684\u8bbe\u8ba1\u5143\u7d20\u4e2d\u878d\u5165\u8bf8\u591a\u5de7\u5999\u9769\u65b0\uff0c\u8d4b\u4e88\u65f6\u8ba1\u5168\u65b0\u5916\u89c2\uff0c\u901a\u8fc7\u81f3\u81fb\u5929\u6587\u53f0\u8ba4\u8bc1\uff0c\u521b\u9020\u7684\u4f18\u96c5\u5353\u8d8a\u9b45\u529b\u65f6\u8ba1\uff0c\u4ee4\u4eba\u96be\u4ee5\u6297\u62d2\u3002\u6b27\u7c73\u8304\u6d77\u9a6c\u7cfb\u5217 Aqua Terra 
\u8155\u8868\u914d\u5907\u4e09\u89d2\u5f62\u5c0f\u65f6\u523b\u5ea6\uff0c\u98ce\u683c\u5927\u6c14\u7eaf\u7cb9\uff0c\u540c\u65f6\u62e5\u6709\u5f88\u9ad8\u7684\u6613\u8bfb\u6027\u3002\u8fd9\u6b21\uff0c\u6b27\u7c73\u8304\u5c06\u8868\u76d8\u8bbe\u8ba1\u518d\u7b80\u5316\uff0c\u5728\u4fdd\u7559\u7ecf\u5178\u7684\u5f27\u5f62\u8868\u8033\u7684\u57fa\u7840\u4e0a\u5bf9\u8868\u58f3\u8fdb\u884c\u4e86\u91cd\u65b0\u8bbe\u8ba1\uff0c\u4e3a\u8868\u80cc\u589e\u6dfb\u6ce2\u7eb9\u8fb9\u7f18\uff0c\u4ee4\u6574\u679a\u8155\u8868\u5c55\u73b0\u5bf9\u79f0\u4e4b\u7f8e\u3002\u539f\u672c\u8868\u76d8\u4e0a\u7684\u9632\u6c34\u7cfb\u6570\u5b57\u6837\u88ab\u8f6c\u79fb\u81f3\u8868\u80cc\uff0c\u65e5\u671f\u7a97\u53e3\u4e5f\u75313\u70b9\u4f4d\u7f6e\u8c03\u6574\u81f36\u70b9\u4f4d\u7f6e\uff0c\u7528\u4ee5\u81f4\u656c1952\u5e74\u63a8\u51fa\u7684\u9996\u6b3e\u5e26\u6709\u65e5\u671f\u7a97\u663e\u793a\u7684\u6b27\u7c73\u8304\u8155\u8868\uff0c\u8d2f\u5f7b\u5bf9\u79f0\u7b80\u7ea6\u7684\u8bbe\u8ba1\u7f8e\u5b66\u3002\u67da\u6728\u7eb9\u7406\u8868\u76d8\u582a\u79f0\u6b27\u7c73\u8304\u6d77\u9a6c\u7cfb\u5217 Aqua Terra \u8155\u8868\u6700\u4e3a\u663e\u8457\u7684\u7279\u5f81\uff0c\u5176\u8bbe\u8ba1\u7075\u611f\u6765\u6e90\u4e8e\u6e38\u8247\u4e0a\u7684\u67da\u6728\u7532\u677f\u30022017\u5e74\uff0c\u6b27\u7c73\u8304\u5c06\u6807\u5fd7\u6027\u7684\u5782\u76f4\u7eb9\u7406\u53d8\u4e3a\u6c34\u5e73\u7eb9\u7406\uff0c\u4ee4\u6574\u679a\u8155\u8868\u66f4\u663e\u7cbe\u81f4\uff0c\u7115\u53d1\u5d2d\u65b0\u9b45\u529b\u3002\u8bbe\u8ba1\u7b80\u6d01\u53c8\u4e0d\u5931\u7cbe\u81f4\uff0c\u5448\u73b0\u4e86\u4e0e\u6d77\u6d0b\u76f8\u5951\u5408\u7684\u4f11\u95f2\u751f\u6d3b\u65b9\u5f0f\u3002\u6b27\u7c73\u8304\u5728\u6b64\u6b3e\u8868\u5e26\u8bbe\u8ba1\u4e0a\u4e5f\u5320\u5fc3\u72ec\u8fd0\uff0c\u90e8\u5206\u8868\u6b3e\u642d\u914d\u6a61\u80f6\u8868\u5e26\uff0c\u521b\u9020\u6027\u5730\u901a\u8fc7\u7cbe\u94a2\u6216Sedna\u00ae 18K\u91d1\u94fe\u8282\u5c06\u8868\u5e26\u4e0e\u8868\u58f3\u76f8\u8fde\uff0c\u4ee4\u8155\u8868\u62e5\u6709\u8212\u9002\u79f0\u624b\u7684\u4f69\u5e26\u611f\u53d7\uff0c\u66f4\u52a0\u5bcc\u6709\u8fd0\u52a8\u6c14\u606f\u3002\u91d1\u5c5e\u8868\u94fe\u5219\u8fd0\u7528\u4e86\u6b27\u7c73\u8304\u4e13\u5229\u7684\u94fe\u9488\u8868\u94fe\u4e0e\u66f4\u4e3a\u575a\u56fa\u7684\u94fe\u8282\uff0c\u6574\u4f53\u7f8e\u611f\u500d\u589e\u3002\u6b27\u7c73\u8304\u6d77\u9a6c\u7cfb\u5217 Aqua Terra \u81f3\u81fb\u5929\u6587\u53f0\u8868\u7cfb\u5217\u62e5\u670941mm\u548c38mm\u4e24\u79cd\u8868\u58f3\u5c3a\u5bf8\u3002\u8155\u8868\u8868\u58f3\u91c7\u7528\u7cbe\u94a2\u3001Sedna\u00ae 18K\u91d1\u6216\u7cbe\u94a2\u4e0eSedna\u00ae 18K\u91d1\u6df7\u5408\u6253\u9020\u800c\u6210\uff0c\u5177\u6709\u4e30\u5bcc\u7684\u8868\u6b3e\u53ef\u4f9b\u9009\u62e9\u3002\u540c\u65f6\u8155\u8868\u8fd8\u53ef\u642d\u914d\u7cbe\u94a2\u8868\u94fe\u3001\u76ae\u9769\u8868\u5e26\u6216\u9020\u578b\u7cbe\u81f4\u7684\u6a61\u80f6\u8868\u5e26\u3002\u591a\u79cd\u4e0d\u540c\u8868\u6b3e\uff0c\u642d\u914d\u7537\u58eb\u72ec\u4e00\u65e0\u4e8c\u7684\u98ce\u683c\u3002\u9646\u5730\u4e0e\u6d77\u6d0b\u3001\u4f20\u627f\u4e0e\u521b\u65b0\u3001\u5de5\u4f5c\u4e0e\u4f11\u95f2\uff0c\u8fd9\u5c31\u662f\u6b27\u7c73\u8304\u6d77\u9a6c\u7cfb\u5217 Aqua Terra 
\u81f3\u81fb\u5929\u6587\u53f0\u8868\u6240\u878d\u5408\u7684\u72ec\u7279\u9b45\u529b\uff0c\u4ee4\u5176\u6210\u4e3a\u5de5\u4f5c\u751f\u6d3b\uff0c\u65f6\u5c1a\u642d\u914d\u4e2d\u7684\u81f3\u81fb\u4e4b\u9009\u3002\u8bf7\u70b9\u51fb\u94fe\u63a5\uff0c\u4e86\u89e3\u66f4\u591a\u4ea7\u54c1\u4fe1\u606f\u3002\uff08\u6ce8\uff1a\u6b64\u6587\u5c5e\u4e8e\u767b\u8f7d\u7684\u5546\u4e1a\u4fe1\u606f\uff0c\u6587\u7ae0\u5185\u5bb9\u4e0d\u4ee3\u8868\u672c\u7f51\u89c2\u70b9\uff0c\u4ec5\u4f9b\u53c2\u8003\uff09"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/004124.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "004124", "comments": {"link": "http://coral.qq.com/2369229201"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/004124.htm", "title": ["\u5927\u96fe\u9ec4\u8272\u9884\u8b66\uff018\u7701\u6709\u6d53\u96fe \u5c40\u5730\u80fd\u89c1\u5ea6\u4e0d\u8db350\u7c73"], "passage": "\u4e2d\u65b0\u7f511\u670820\u65e5\u7535 \u636e\u4e2d\u592e\u6c14\u8c61\u53f0\u7f51\u7ad9\u6d88\u606f\uff0c\u4e2d\u592e\u6c14\u8c61\u53f01\u670820\u65e506\u65f6\u7ee7\u7eed\u53d1\u5e03\u5927\u96fe\u9ec4\u8272\u9884\u8b66\uff0c\u9884\u8ba1\uff0c20\u65e5\u65e9\u6668\u81f3\u4e0a\u5348\uff0c\u5c71\u4e1c\u5317\u90e8\u548c\u5357\u90e8\u3001\u6cb3\u5357\u4e1c\u90e8\u3001\u6e56\u5317\u4e2d\u4e1c\u90e8\u3001\u6c5f\u82cf\u5927\u90e8\u3001\u5b89\u5fbd\u5927\u90e8\u3001\u6d59\u6c5f\u5317\u90e8\u3001\u91cd\u5e86\u4e2d\u90e8\u3001\u8d35\u5dde\u5317\u90e8\u548c\u4e2d\u90e8\u7b49\u5730\u6709\u5927\u96fe\uff0c\u5176\u4e2d\u6c5f\u82cf\u4e2d\u5317\u90e8\u3001\u5b89\u5fbd\u4e2d\u5317\u90e8\u3001\u6cb3\u5357\u4e1c\u90e8\u3001\u6e56\u5317\u4e2d\u90e8\u7b49\u5730\u7684\u90e8\u5206\u5730\u533a\u6709\u80fd\u89c1\u5ea6\u4f4e\u4e8e500\u7c73\u7684\u6d53\u96fe\uff0c\u5c40\u5730\u6709\u80fd\u89c1\u5ea6\u4e0d\u8db350\u7c73\u7684\u7279\u5f3a\u6d53\u96fe\u3002\u6b64\u5916\uff0c20\u81f321\u65e5\u6cb3\u5317\u5357\u90e8\u3001\u6cb3\u5357\u3001\u5c71\u4e1c\u4e2d\u897f\u90e8\u3001\u6c5f\u82cf\u3001\u5b89\u5fbd\u7b49\u5730\u91cd\u6c61\u67d3\u5929\u6c14\u7ef4\u6301\uff0c\u5176\u4e2d21\u65e5\u53d7\u504f\u4e1c\u8def\u51b7\u7a7a\u6c14\u5f71\u54cd\uff0c\u6cb3\u5317\u4e1c\u90e8\u7b49\u5730\u91cd\u6c61\u67d3\u5929\u6c14\u7565\u6709\u51cf\u5f31\u300222\u65e5\u591c\u95f4\u8d77\uff0c\u53d7\u8f83\u5f3a\u51b7\u7a7a\u6c14\u5f71\u54cd\uff0c\u533a\u57df\u91cd\u6c61\u67d3\u5929\u6c14\u81ea\u5317\u5411\u5357\u9010\u6e10\u51cf\u5f31\u6d88\u6563\u300220\u65e5\uff0c\u53d7\u51b7\u7a7a\u6c14\u5f71\u54cd\uff0c\u5185\u8499\u53e4\u4e1c\u5317\u90e8\u3001\u4e1c\u5317\u5730\u533a\u7b49\u5730\u6709\u5927\u98ce\u964d\u6e29\u5929\u6c14\uff0c\u964d\u6e29\u5e45\u5ea6\u57284~6\u2103\uff0c\u5c40\u5730\u53ef\u8fbe8\u2103\u4ee5\u4e0a\uff0c\u5e76\u4f34\u67094~6\u7ea7\u98ce\u300222\u65e5\u8d77\uff0c\u65b0\u4e00\u80a1\u51b7\u7a7a\u6c14\u5c06\u5f71\u54cd\u6211\u56fd\u4e2d\u4e1c\u90e8\u5730\u533a\uff0c\u957f\u6c5f\u4e2d\u4e0b\u6e38\u53ca\u5176\u4ee5\u5317\u5730\u533a\u67094~6\u7ea7\u504f\u5317\u98ce\uff0c\u4e2d\u4e1c\u90e8\u5927\u90e8\u5730\u533a\u6c14\u6e29\u5c06\u4e0b\u964d4~8\u2103\uff0c\u5185\u8499\u53e4\u3001\u4e1c\u5317\u5730\u533a\u4e1c\u90e8\u5c40\u5730\u964d\u6e2910\u2103\u4ee5\u4e0a\u3002\u672a\u6765\u4e09\u5929\u9884\u62a5\u65b9\u9762\uff0c20\u65e508\u65f6\u81f321\u65e508\u65f6\uff0c\u65b0\u7586\u4f0a\u7281\u6cb3\u8c37\u548c\u5929\u5c71\u5730\u533a\u3001\u7518\u8083\u897f\u90e8\u3001\u6cb3\u5317\u5317\u90e8\u3001\u5185\u8499\u53e4\
u4e2d\u90e8\u504f\u5357\u5730\u533a\u7b49\u5730\u6709\u5c0f\u5230\u4e2d\u96ea\u6216\u9635\u96ea\uff1b\u897f\u5357\u5730\u533a\u4e1c\u90e8\u3001\u6c5f\u6c49\u5357\u90e8\u3001\u6c5f\u5357\u897f\u90e8\u548c\u5317\u90e8\u3001\u534e\u5357\u897f\u90e8\u7b49\u5730\u6709\u5c0f\u5230\u4e2d\u96e8\u3002\u5185\u8499\u53e4\u897f\u90e8\u3001\u9ed1\u9f99\u6c5f\u5317\u90e8\u7b49\u5730\u67094~6\u7ea7\u98ce\u300221\u65e508\u65f6\u81f322\u65e508\u65f6\uff0c\u5357\u65b9\u964d\u6c34\u8303\u56f4\u6269\u5927\u3002\u534e\u5317\u5317\u90e8\u3001\u5185\u8499\u53e4\u4e2d\u90e8\u504f\u5357\u3001\u5c71\u4e1c\u4e1c\u5317\u90e8\u7b49\u5730\u6709\u5c0f\u96ea\u6216\u96e8\u5939\u96ea\uff0c\u5176\u4e2d\uff0c\u6cb3\u5317\u5317\u90e8\u3001\u5c71\u4e1c\u534a\u5c9b\u5c40\u5730\u6709\u4e2d\u96ea\uff1b\u6c5f\u6dee\u4e1c\u90e8\u548c\u5357\u90e8\u3001\u6e56\u5317\u5357\u90e8\u3001\u6c5f\u5357\u3001\u897f\u5357\u5730\u533a\u4e1c\u5357\u90e8\u3001\u534e\u5357\u897f\u90e8\u548c\u5317\u90e8\u7b49\u5730\u6709\u5c0f\u5230\u4e2d\u96e8\u3002\u5185\u8499\u53e4\u4e2d\u897f\u90e8\u3001\u8fbd\u4e1c\u534a\u5c9b\u3001\u5c71\u4e1c\u534a\u5c9b\u7b49\u5730\u67094~6\u7ea7\u98ce\u300222\u65e508\u65f6\u81f323\u65e508\u65f6\uff0c\u6cb3\u5317\u4e2d\u90e8\u3001\u5c71\u4e1c\u5317\u90e8\u3001\u4e1c\u5317\u5730\u533a\u4e1c\u5357\u90e8\u7b49\u5730\u6709\u5c0f\u96ea\u6216\u96e8\u5939\u96ea\uff1b\u897f\u5357\u5730\u533a\u4e1c\u90e8\u3001\u6e56\u5317\u897f\u90e8\u3001\u6e56\u5357\u897f\u90e8\u548c\u5357\u90e8\u3001\u5e7f\u897f\u7b49\u5730\u6709\u5c0f\u96e8\u3002\u5185\u8499\u53e4\u5927\u90e8\u3001\u534e\u5317\u3001\u8fbd\u5b81\u3001\u9ec4\u6dee\u4e1c\u90e8\u7b49\u5730\u67094~6\u7ea7\u98ce\u3002\u4e1c\u6d77\u5927\u90e8\u3001\u5357\u6d77\u4e1c\u5317\u90e8\u5c06\u67096~8\u7ea7\u3001\u9635\u98ce9\u7ea7\u5927\u98ce\u3002\u9632\u5fa1\u6307\u5357\uff1a1\u3001\u7531\u4e8e\u80fd\u89c1\u5ea6\u8f83\u4f4e\uff0c\u9a7e\u9a76\u4eba\u5458\u5e94\u63a7\u5236\u901f\u5ea6\uff0c\u786e\u4fdd\u5b89\u5168\uff1b2\u3001\u673a\u573a\u3001\u9ad8\u901f\u516c\u8def\u3001\u8f6e\u6e21\u7801\u5934\u91c7\u53d6\u63aa\u65bd\uff0c\u4fdd\u4ea4\u901a\u5b89\u5168\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8GOCKJU0001899N.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D8GOCKJU0001899N", "date": "20180119", "source": "netease", "comments": {"link": "http://comment.news.163.com/news2_bbs/D8GOCKJU0001899N.html"}, "contents": {"title": ["\u5168\u9762\u4e24\u5b69\u7b2c\u4e8c\u5e74\uff1a\u51fa\u751f\u4eba\u53e3\u603b\u91cf\u548c\u51fa\u751f\u7387\u53cc\u53cc\u4e0b\u964d"], "link": "http://news.163.com/18/0119/10/D8GOCKJU0001899N.html", "passage": "

\n \uff08\u539f\u6807\u9898\uff1a\u5168\u9762\u4e24\u5b69\u7b2c\u4e8c\u5e74\uff1a\u51fa\u751f\u4eba\u53e3\u603b\u91cf\u548c\u51fa\u751f\u7387\u53cc\u53cc\u4e0b\u964d\uff09\n

2017\u5e74\u5168\u56fd\u51fa\u751f\u4eba\u53e3\u6570\u91cf\u548c\u4eba\u53e3\u51fa\u751f\u7387\u53cc\u53cc\u4e0b\u964d\u3002\u56fd\u5bb6\u7edf\u8ba1\u5c4018\u65e5\u516c\u5e03\u6570\u636e\u663e\u793a\uff0c2017\u5e74\u5168\u5e74\u5171\u51fa\u751f\u4eba\u53e31723\u4e07\u4eba\uff0c\u6bd42016\u5e74\u51cf\u5c1163\u4e07\u4eba\u3002\u540c\u65f6\u8001\u9f84\u5316\u7a0b\u5ea6\u7ee7\u7eed\u52a0\u5927\uff0c60\u5c81\u4ee5\u4e0a\u53ca65\u5c81\u4ee5\u4e0a\u8001\u4eba\u5360\u603b\u4eba\u53e3\u7684\u6bd4\u91cd\u90fd\u6709\u660e\u663e\u4e0a\u5347\u3002

\u51fa\u751f\u4eba\u6570\u51cf\u5c11

\u53bb\u5e74\u662f\u5168\u9762\u4e24\u5b69\u653f\u7b56\u5b9e\u65bd\u7684\u7b2c\u4e8c\u5e74\u3002\u6839\u636e\u6b64\u524d\u6709\u5173\u65b9\u9762\u7684\u5224\u65ad\uff0c\u5168\u9762\u4e24\u5b69\u7684\u653f\u7b56\u6548\u679c\u4f53\u73b0\u6709\u6ede\u540e\u6027\uff0c\u5e94\u8be5\u57282017\u5e74\u4e4b\u540e\u9010\u6b65\u663e\u73b0\uff0c\u56e0\u6b642017\u5e74\u51fa\u751f\u4eba\u53e3\u6570\u91cf\u4f1a\u660e\u663e\u9ad8\u4e8e2016\u5e74\u3002\u4f46\u4ece\u56fd\u5bb6\u7edf\u8ba1\u5c40\u516c\u5e03\u7684\u6570\u636e\u6765\u770b\uff0c2017\u5e74\u51fa\u751f\u4eba\u53e3\u6bd42016\u5e74\u76841786\u4e07\u4eba\u51cf\u5c11\u4e8663\u4e07\u4eba\u3002

\u4eba\u53e3\u51fa\u751f\u7387\u4e5f\u540c\u6837\u51fa\u73b0\u4e86\u4e0b\u964d\u3002\u53bb\u5e74\u5168\u56fd\u4eba\u53e3\u51fa\u751f\u7387\u4e3a12.43\u2030\uff0c2016\u5e74\u8fd9\u4e00\u6570\u636e\u4e3a12.95\u2030\u3002

\u4e2d\u56fd\u793e\u4f1a\u79d1\u5b66\u9662\u4eba\u53e3\u4e0e\u52b3\u52a8\u7ecf\u6d4e\u7814\u7a76\u6240\u4eba\u53e3\u7edf\u8ba1\u5ba4\u4e3b\u4efb\u738b\u5e7f\u5dde\u8868\u793a\uff0c2017\u5e74\u51fa\u751f\u4eba\u53e3\u6570\u91cf\u6bd42016\u5e74\u8fd8\u8981\u5c11\uff0c\u8fd9\u4e3b\u8981\u662f\u56e0\u4e3a\u4e00\u5b69\u51fa\u751f\u6570\u91cf\u4e0b\u964d\u5e45\u5ea6\u5f88\u5927\uff0c\u5982\u679c\u4e0d\u662f\u5168\u9762\u4e24\u5b69\u653f\u7b56\uff0c\u51fa\u751f\u89c4\u6a21\u4e0b\u964d\u5e45\u5ea6\u4f1a\u66f4\u5927\u3002

\u957f\u671f\u5173\u6ce8\u751f\u80b2\u610f\u613f\u4e0e\u751f\u80b2\u884c\u4e3a\u7814\u7a76\u7684\u793e\u79d1\u9662\u4eba\u53e3\u4e0e\u52b3\u52a8\u7ecf\u6d4e\u7814\u7a76\u6240\u7814\u7a76\u5458\u90d1\u771f\u771f\u8868\u793a\uff0c2017\u5e74\u51fa\u73b0\u51fa\u751f\u4eba\u53e3\u7684\u4e0b\u964d\u8bf4\u660e\uff0c\u4e2a\u4eba\u751f\u80b2\u610f\u613f\u548c\u751f\u80b2\u884c\u4e3a\u53d7\u5230\u5f88\u591a\u590d\u6742\u56e0\u7d20\u7684\u5f71\u54cd\uff0c\u5305\u62ec\u7ecf\u6d4e\u80fd\u529b\u3001\u5e74\u9f84\u3001\u751f\u80b2\u504f\u597d\u7b49\u7b49\uff0c\u653f\u7b56\u5bf9\u751f\u80b2\u884c\u4e3a\u7684\u5f71\u54cd\u5e76\u6ca1\u6709\u539f\u6765\u9884\u60f3\u5f97\u5927\u3002

\u8001\u9f84\u5316\u52a0\u901f

\u6839\u636e\u56fd\u5bb6\u7edf\u8ba1\u5c40\u6570\u636e\u663e\u793a\uff0c\u4e2d\u56fd\u4eba\u53e3\u7684\u8001\u9f84\u5316\u7a0b\u5ea6\u6b63\u5728\u52a0\u901f\u52a0\u6df1\u30022017\u5e74\uff0c\u5168\u56fd\u4eba\u53e3\u4e2d60\u5468\u5c81\u53ca\u4ee5\u4e0a\u4eba\u53e324090\u4e07\u4eba\uff0c\u5360\u603b\u4eba\u53e3\u768417.3%\uff0c\u5176\u4e2d65\u5468\u5c81\u53ca\u4ee5\u4e0a\u4eba\u53e315831\u4e07\u4eba\uff0c\u5360\u603b\u4eba\u53e3\u768411.4%\u300260\u5468\u5c81\u4ee5\u4e0a\u4eba\u53e3\u548c65\u5468\u5c81\u4ee5\u4e0a\u4eba\u53e3\u90fd\u6bd4\u4e0a\u5e74\u589e\u52a0\u4e860.6\u4e2a\u767e\u5206\u70b9\u3002

\u52b3\u52a8\u5e74\u9f84\u4eba\u53e3\u5360\u603b\u4eba\u53e3\u6bd4\u91cd\u6301\u7eed\u964d\u4f4e\uff0c\u53bb\u5e7416\u81f359\u5468\u5c81\u7684\u52b3\u52a8\u5e74\u9f84\u4eba\u53e3\u4e3a90199\u4e07\u4eba\uff0c\u5360\u603b\u4eba\u53e3\u7684\u6bd4\u91cd\u4e3a64.9%\u30022016\u5e74\uff0c\u5168\u56fd\u52b3\u52a8\u5e74\u9f84\u4eba\u53e3\u6570\u91cf\u4e3a90747\u4e07\u4eba\uff0c\u5360\u603b\u4eba\u53e3\u7684\u6bd4\u91cd\u4e3a65.6%\u3002

\u7edf\u8ba1\u663e\u793a\uff0c\u53bb\u5e74\u4e2d\u56fd\u7684\u57ce\u9547\u5316\u901f\u5ea6\u5728\u6301\u7eed\u63d0\u9ad8\u3002\u57ce\u9547\u5e38\u4f4f\u4eba\u53e381347\u4e07\u4eba\uff0c\u6bd4\u4e0a\u5e74\u672b\u589e\u52a02049\u4e07\u4eba;\u4e61\u6751\u5e38\u4f4f\u4eba\u53e357661\u4e07\u4eba\uff0c\u51cf\u5c111312\u4e07\u4eba;\u57ce\u9547\u4eba\u53e3\u5360\u603b\u4eba\u53e3\u6bd4\u91cd(\u57ce\u9547\u5316\u7387)\u4e3a58.52%\uff0c\u6bd4\u4e0a\u5e74\u672b\u63d0\u9ad81.17\u4e2a\u767e\u5206\u70b9\u3002

"}, "cmtId": "D8GOCKJU0001899N"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/spiders/newsspider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import scrapy 5 | import re 6 | from scrapy.selector import Selector 7 | from crawl.items import NeteaseItem,TencentItem,SinaItem 8 | from scrapy.http import Request 9 | from urllib.request import urlopen 10 | from crawl.maziclib.news_fun import ListCombiner 11 | 12 | 13 | class NeteaseNewsSpider(scrapy.Spider): 14 | name = 'netease_news_spider' #最后要调用的名字 15 | start_urls = ['http://news.163.com'] 16 | allowed_domains = ['news.163.com'] 17 | 18 | url_pattern = r'(http://news\.163\.com)/(\d{2})/(\d{4})/(\d+)/(\w+)\.html' 19 | 20 | def parse(self, response): # response即网页数据 21 | pat = re.compile(self.url_pattern) 22 | next_urls = re.findall(pat, str(response.body)) 23 | 24 | ###debug 25 | #article = next_urls[0][0]+'/'+next_urls[0][1]+'/'+next_urls[0][2]+'/'+next_urls[0][3]+'/'+next_urls[0][4]+'.html' 26 | #yield Request(article, callback=self.parse_news) 27 | ###debug 28 | 29 | for next_url in next_urls: 30 | article = next_url[0]+'/'+next_url[1]+'/'+next_url[2]+'/'+next_url[3]+'/'+next_url[4]+'.html' 31 | yield Request(article,callback=self.parse_news) 32 | 33 | def parse_news(self, response): 34 | item = NeteaseItem() 35 | selector = Selector(response) 36 | pattern = re.match(self.url_pattern, response.url) 37 | 38 | 39 | source = 'netease' 40 | date = '20'+pattern.group(2)+pattern.group(3) 41 | newsId = pattern.group(5) 42 | cmtId = pattern.group(5) 43 | 44 | productKey = re.findall(re.compile(r'"productKey" : "(\w+)"'), str(response.body))[0] 45 | comments_api = 'http://comment.news.163.com/api/v1/products/' + productKey + '/threads/' + newsId 46 | boardId = re.findall(r'"boardId":"(\w+)"',str(urlopen(comments_api).read()))[0] 47 | comments = ('http://comment.news.163.com/'+boardId+'/'+newsId+'.html') 48 | 49 | item['source'] = 'netease' 50 | item['date'] = date 51 | item['newsId'] = newsId 52 | item['cmtId'] = cmtId 53 | #item['boardId'] = boardId 54 | item['comments'] = {'link' : comments} 55 | item['contents'] = {'link' : str(response.url), 'title' : u'', 'passage' : u''} 56 | item['contents']['title'] = selector.xpath('//*[@id="epContentLeft"]/h1/text()').extract() 57 | item['contents']['passage'] = ListCombiner(selector.xpath('//*[@id="endText"]/p').extract()) 58 | yield item 59 | 60 | 61 | 62 | 63 | class TencentNewsSpider(scrapy.Spider): 64 | name = 'tencent_news_spider' #最后要调用的名字 65 | start_urls = ['http://news.qq.com'] 66 | allowed_domains = ['news.qq.com'] 67 | 68 | #https://news.qq.com/a/20180120/000738.htm 69 | url_pattern = r'http://(\w+)\.qq\.com/a/(\d{8})/(\d+)\.htm' 70 | 71 | def parse(self, response): # response即网页数据 72 | pat = re.compile(self.url_pattern) 73 | next_urls = re.findall(pat, str(response.body)) 74 | 75 | ### debug 76 | #article = 'http://'+next_urls[0][0]+'.qq.com/a/'+next_urls[0][1]+'/'+next_urls[0][2]+'.htm' 77 | #print(article) 78 | #yield Request(article,callback=self.parse_news) 79 | ### debug 80 | 81 | for next_url in next_urls: 82 | article = 'http://'+next_url[0]+'.qq.com/a/'+next_url[1]+'/'+next_url[2]+'.htm' 83 | yield Request(article,callback=self.parse_news) 84 | 85 | 86 | def parse_news(self, response): 87 | item = TencentItem() 88 | selector = Selector(response) 89 | url_pattern2 = r'(\w+)://(\w+)\.qq\.com/a/(\d{8})/(\d+)\.htm' 90 | pattern = 
re.match(url_pattern2, str(response.url)) 91 | 92 | source = 'tencent' 93 | date = pattern.group(3) 94 | newsId = pattern.group(4) 95 | cmtId = re.findall(re.compile(r'cmt_id = (\d+);'), str(response.body))[0] 96 | comments = 'http://coral.qq.com/' + cmtId 97 | 98 | 99 | item['source'] = source 100 | item['date'] = date 101 | item['newsId'] = newsId 102 | item['comments'] = {'link' : comments} 103 | item['contents'] = {'link' : str(response.url), 'title' : u'', 'passage' : u''} 104 | item['contents']['title'] = selector.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()').extract() 105 | item['contents']['passage'] = ListCombiner(selector.xpath('//*[@id="Cnt-Main-Article-QQ"]/p/text()').extract()) # TODO: decide whether to keep the markup here (i.e. whether /text() is needed) 106 | print("-------------------------------") 107 | print(date) 108 | print(newsId) 109 | print("-------------------------------") 110 | yield item 111 | 112 | 113 | 114 | 115 | 116 | 117 | --------------------------------------------------------------------------------
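The two news spiders above only define the crawl logic; the repository's main.py (not reproduced in this excerpt) is what actually launches them. As a minimal, illustrative sketch only, not the author's main.py, the snippet below shows the standard Scrapy way to run both named spiders from one script, assuming it is executed from the project root next to scrapy.cfg (the file name run_spiders.py is arbitrary):

    # run_spiders.py (hypothetical name) -- generic Scrapy launcher sketch
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    if __name__ == '__main__':
        # Picks up settings.py via scrapy.cfg in the current working directory.
        process = CrawlerProcess(get_project_settings())
        # These names match the `name` attributes defined in newsspider.py.
        process.crawl('netease_news_spider')
        process.crawl('tencent_news_spider')
        process.start()  # blocks until both spiders have finished

With a launcher like this, scraped items would flow through the pipeline configured in settings.py, which is presumably how the JSON documents under docs/netease and docs/tencent shown earlier were produced.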