├── 2048
└── 2048.py
├── .gitignore
├── crawl
├── 暨南大学新闻爬虫
│ ├── jnuxshc
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── items.cpython-35.pyc
│ │ │ ├── __init__.cpython-35.pyc
│ │ │ └── settings.cpython-35.pyc
│ │ ├── spiders
│ │ │ ├── __pycache__
│ │ │ │ ├── xzhc.cpython-35.pyc
│ │ │ │ ├── __init__.cpython-35.pyc
│ │ │ │ └── csv_item_exporter.cpython-35.pyc
│ │ │ ├── __init__.py
│ │ │ ├── csv_item_exporter.py
│ │ │ └── xzhc.py
│ │ ├── pipelines.py
│ │ ├── items.py
│ │ ├── settings.py
│ │ └── middlewares.py
│ ├── main.py
│ ├── scrapy.cfg
│ └── readme.md
├── news
│ └── news_crawl
│ │ ├── crawl
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── items.cpython-35.pyc
│ │ │ ├── __init__.cpython-35.pyc
│ │ │ ├── pipelines.cpython-35.pyc
│ │ │ └── settings.cpython-35.pyc
│ │ ├── spiders
│ │ │ ├── __pycache__
│ │ │ │ ├── __init__.cpython-35.pyc
│ │ │ │ └── newsspider.cpython-35.pyc
│ │ │ ├── __init__.py
│ │ │ └── newsspider.py
│ │ ├── maziclib
│ │ │ ├── __pycache__
│ │ │ │ └── news_fun.cpython-35.pyc
│ │ │ └── news_fun.py
│ │ ├── items.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ └── middlewares.py
│ │ ├── readme.md
│ │ ├── main.py
│ │ ├── scrapy.cfg
│ │ └── docs
│ │ ├── netease
│ │ ├── 20160602
│ │ │ └── BOIMS8PF00014JB5.json
│ │ ├── 20160721
│ │ │ └── BSH7V8QF00014JB6.json
│ │ ├── 20180116
│ │ │ └── D897H80K0001899O.json
│ │ ├── 20180119
│ │ │ ├── D8HD3PFD0001875P.json
│ │ │ ├── D8HLN6QA0001875P.json
│ │ │ ├── D8H1O67B0001899N.json
│ │ │ ├── D8HBI8IF0001875P.json
│ │ │ ├── D8HJ2GAK000187VE.json
│ │ │ ├── D8HAH1VS0001875P.json
│ │ │ ├── D8HIR5JP0001875P.json
│ │ │ ├── D8HJ6VRF0001875O.json
│ │ │ └── D8GOCKJU0001899N.json
│ │ └── 20180120
│ │ │ ├── D8J1VDAJ0001875P.json
│ │ │ └── D8IUD7L60001899O.json
│ │ └── tencent
│ │ ├── 20160418
│ │ └── 023091.json
│ │ ├── 20161227
│ │ ├── 012771.json
│ │ ├── 014055.json
│ │ ├── 007056.json
│ │ ├── 012170.json
│ │ └── 011065.json
│ │ ├── 20171009
│ │ └── 039986.json
│ │ ├── 20171129
│ │ └── 013590.json
│ │ └── 20180120
│ │ ├── 006763.json
│ │ ├── 002903.json
│ │ ├── 004328.json
│ │ ├── 003365.json
│ │ ├── 010551.json
│ │ ├── 006769.json
│ │ ├── 010301.json
│ │ ├── 009612.json
│ │ └── 004124.json
├── 简书首页爬虫
│ ├── tutotial
│ │ ├── tutotial
│ │ │ ├── __init__.py
│ │ │ ├── __pycache__
│ │ │ │ ├── __init__.cpython-35.pyc
│ │ │ │ └── settings.cpython-35.pyc
│ │ │ ├── spiders
│ │ │ │ ├── __pycache__
│ │ │ │ │ └── __init__.cpython-35.pyc
│ │ │ │ ├── __init__.py
│ │ │ │ └── exampleSpider.py
│ │ │ ├── pipelines.py
│ │ │ ├── items.py
│ │ │ ├── settings.py
│ │ │ └── middlewares.py
│ │ ├── scrapy.cfg
│ │ └── readme.md
│ └── jian.csv
├── 百度提交关键词.py
├── baidu_search.py
├── search.py
└── getImage.py
├── 机器学习入门
├── 无监督
│ ├── readme.md
│ ├── cluster
│ │ ├── readme.md
│ │ ├── kmeans.py
│ │ └── city.txt
│ └── decomposition
│ │ ├── readme.md
│ │ └── PCA.py
├── keras
│ ├── load_exist_model.py
│ ├── my_model.h5
│ └── mnist.py
├── 强化学习
│ ├── readme.md
│ └── Flappy Bird.py
├── readme.md
├── tensorflow
│ ├── prac2.py
│ └── prac1.py
├── 监督
│ ├── readme.md
│ ├── 分类
│ │ ├── Bayes.py
│ │ ├── KNN.py
│ │ ├── DecisionTree.py
│ │ └── 人体运动状态信息评级.py
│ └── 回归
│ │ ├── prices.txt
│ │ └── 房价预测.py
├── matplotlib使用.py
├── Numpy.py
├── label_propagation.py
└── 标签传播算法(LP).py
├── python网络编程学习
├── chapter1.py
├── chapter2.py
├── chapter3.py
├── chapter4.py
├── chapter3-2.py
├── chapter3-3.py
└── chapter2 find.py
├── .idea
├── dictionaries
│ └── mazic.xml
├── vcs.xml
├── misc.xml
├── modules.xml
└── PycharmStudy.iml
├── grammar
├── readme.md
├── list.py
├── dictionary.py
├── set.py
├── Classes.py
├── Numpy
│ └── Arrays.py
└── liaoxuefeng.py
├── README.md
├── ACM
└── cf
│ ├── 672A 字符串第n个数.py
│ ├── 1A 简单数学.py
│ ├── 675A.py
│ ├── 227A 叉积.py
│ ├── 227B.py
│ ├── 208A 字符串.py
│ ├── 675B 填格子.py
│ └── 675E DP+greedy.py
├── OS平台编程
├── 遍历文件夹目录.py
├── 修改所有文件名字.py
└── 自动调用程序.py
├── 泰迪杯尝试
├── readability....py
├── re过滤html标签.py
├── 去除换行+空格.py
├── bbs.py
├── 1.py
├── 数据爬取(未处理).py
├── 爬取相似URL
│ ├── 3.所有小URL初步信息去标签.py
│ ├── 2.从相似URL中下载内容.py
│ └── 从主页获得相似URL初步可执行代码.py
├── README.md
├── pyquery取全体文本.py
└── 数据爬取(去标签).py
├── data structure
├── quickSort.py
└── bubble sort.py
├── xslt提取网页数据.py
├── 验证码处理
├── crack.py
└── ascii.py
└── Try cocos
└── HelloWorld.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.h5
--------------------------------------------------------------------------------
/crawl/暨南大学新闻爬虫/jnuxshc/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/crawl/news/news_crawl/crawl/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/crawl/简书首页爬虫/tutotial/tutotial/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/crawl/news/news_crawl/readme.md:
--------------------------------------------------------------------------------
1 | 主目录在这里
2 | 运行请在该目录调用`python3 main.py`
3 |
--------------------------------------------------------------------------------
/机器学习入门/无监督/readme.md:
--------------------------------------------------------------------------------
1 | 无监督两大主要任务
2 | - 聚类 cluster
3 | - 降维 decomposition
4 |
--------------------------------------------------------------------------------
/机器学习入门/keras/load_exist_model.py:
--------------------------------------------------------------------------------
1 | from keras.models import load_model
2 | model = load_model('my_model.h5')
--------------------------------------------------------------------------------
/python网络编程学习/chapter1.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter1.py
--------------------------------------------------------------------------------
/python网络编程学习/chapter2.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter2.py
--------------------------------------------------------------------------------
/python网络编程学习/chapter3.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter3.py
--------------------------------------------------------------------------------
/python网络编程学习/chapter4.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter4.py
--------------------------------------------------------------------------------
/机器学习入门/keras/my_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/机器学习入门/keras/my_model.h5
--------------------------------------------------------------------------------
/python网络编程学习/chapter3-2.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter3-2.py
--------------------------------------------------------------------------------
/python网络编程学习/chapter3-3.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mazicwong/Some_Python_Project/HEAD/python网络编程学习/chapter3-3.py
--------------------------------------------------------------------------------
/.idea/dictionaries/mazic.xml:
--------------------------------------------------------------------------------
1 |
\u5404\u4f4d\u4eb2\u7231\u7684\u8bfb\u8005\u76c6\u53cb\u4eec\uff0c\u7f51\u6613\u65b0\u95fb\u9996\u9875\u65b0\u7248\u4e8e7\u67081\u65e5\u4e0a\u7ebf\u3002\u8fd9\u6b21\u6539\u7248\u5168\u9762\u5bf9\u63a5\u79fb\u52a8\u7aef\uff0c\u4e3a\u6ee1\u8db3\u7f51\u53cb\u7684\u9605\u8bfb\u4e60\u60ef\u548c\u9700\u6c42\uff0c\u65b0\u7248\u9875\u9762\u4e0e\u79fb\u52a8\u7aef\u4fdd\u6301\u4e00\u81f4\uff0c\u4f7f\u7528\u6237\r\n\u5728\u6d4f\u89c8PC\u7aef\u9875\u9762\u65f6\uff0c\u4e5f\u80fd\u50cf\u9605\u8bfb\u79fb\u52a8\u7aef\u65b0\u95fb\u4e00\u822c\u4fbf\u6377\u9ad8\u6548\u3002\u540c\u65f6\uff0c\u6211\u4eec\u6269\u5927\u9605\u8bfb\u754c\u9762\uff0c\u4f7f\u5f97\u5927\u5c4f\u5e55\u7684\u7535\u8111\u6709\u66f4\u5bbd\u5e7f\u7684\u53ef\u89c6\u7a7a\u95f4\uff0c\u65b9\u4fbf\u5927\u5bb6\u63a5\u6536\u66f4\u591a\u7684\u4fe1\u606f\u3002\u5404\u7c7b\u7b56\r\n\u5212\u90fd\u5f52\u4e8e\u5de6\u8fb9\u680f\uff0c\u65b9\u4fbf\u5927\u5bb6\u9605\u8bfb\u7f51\u6613\u72ec\u5bb6\u539f\u521b\u3002
\n
\u5f53\u7136\uff0c\u8fd9\u53ea\u662f\u5c0f\u7f16\u4eec\u7684\u60f3\u6cd5\uff0c\u9886\u5bfc\u8bf4\u4e86\u8fd8\u8981\u95ee\u95ee\u4f60\u4eec\u600e\u4e48\u770b\u3002\u6240\u4ee5\u6211\u4eec\u5c31\u51fa\u4e86\u4e00\u4e9b\u95ee\u9898\u8ba9\u4f60\u4eec\u56de\u7b54\u3002\u8fd8\u6709\u5176\u4ed6\u60f3\u6cd5\uff0c\u6b22\u8fce\u5728\u8ddf\u8d34\u91cc\u63d0\u51fa\u54e6\uff01
"}, "cmtId": "BOIMS8PF00014JB5"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20161227/012771.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "012771", "comments": {"link": "http://coral.qq.com/1687685805"}, "date": "20161227", "contents": {"link": "https://news.qq.com/a/20161227/012771.htm", "title": ["\u7f8e\u56fd\u597d\u5fc3\u4eba\u533f\u540d\u4e3a\u65e0\u5bb6\u53ef\u5f52\u8005\u63501.5\u5428\u725b\u6392"], "passage": "\u4e2d\u65b0\u7f5112\u670827\u65e5\u7535 \u636e\u5916\u5a92\u62a5\u9053\uff0c\u7f8e\u56fd\u897f\u96c5\u56fe\u4e00\u4e2a\u6148\u5584\u56e2\u4f53\u8054\u4eca\u5e74\u5723\u8bde\u8282\u4e3a\u65e0\u5bb6\u53ef\u5f52\u8005\u51c6\u5907\u7684\u83dc\u8272\u683c\u5916\u4e30\u5bcc\uff0c\u56e0\u4e3a\u4e00\u4f4d\u533f\u540d\u5584\u5fc3\u4eba\u58eb\u6350\u4e863500\u78c5(\u7ea61589\u516c\u65a4)\u7684\u808b\u773c\u725b\u6392\uff0c\u4e3a\u65e0\u5bb6\u53ef\u5f52\u7684\u6c11\u4f17\u8d34\u5fc3\u52a0\u83dc\u3002\u636e\u62a5\u9053\uff0c\u8be5\u56e2\u4f53\u4e3b\u53a8\u8d39\u96ea(Jordan Fisher)\u63a5\u53d7\u5a92\u4f53\u8bbf\u95ee\u65f6\u8868\u793a\uff0c\u4eca\u5e74\u5723\u8bde\u8282\u6536\u5230\u6709\u4eba\u6350\u8d60\u4e86\u9ad8\u8fbe3500\u78c5\u7684\u808b\u773c\u725b\u6392(rib-eye 
steak)\uff0c\u201c\u6211\u77e5\u9053\u7684\u65f6\u5019\uff0c\u5413\u4e86\u4e00\u5927\u8df3\u3002\u201d\u8d39\u96ea\u8bf4\uff1a\u201c\u8fd9\u662f\u5f88\u96be\u5f97\u7684\u4e8b\u3002\u50cf\u6211\u4eec\u8fd9\u6837\u7684\u673a\u6784\uff0c\u5e76\u4e0d\u4f1a\u5e38\u5e38\u78b0\u5230\u8fd9\u6837\u7684\u72b6\u51b5\u3002\u201d\u5728\u4eca\u5e74\u5723\u8bde\u8282\u5f53\u5929\uff0c\u524d\u5f80\u897f\u96c5\u56fe\u8be5\u6148\u5584\u56e2\u4f53\u6240\u5c5e\u6551\u6d4e\u7ad9\u5403\u996d\u7684\u6e38\u6c11\uff0c\u4e0d\u7ba1\u5927\u4eba\u6216\u5c0f\u5b69\uff0c\u6bcf\u4e2a\u4eba\u90fd\u5403\u5230\u4e86\u4e00\u4efd\u808b\u773c\u725b\u6392\u3002\u76f8\u5173\u4eba\u58eb\u8868\u793a\uff0c\u6350\u8d60\u725b\u6392\u7684\u597d\u5fc3\u4eba\u8981\u6c42\u533f\u540d\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HD3PFD0001875P.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HD3PFD0001875P", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8HD3PFD0001875P.html"}, "newsId": "D8HD3PFD0001875P", "contents": {"passage": "![]()
![]()
\n
\u3010\u9ad8\u6821\u4e00\u5bbf\u820d\u8d34\u5316\u5b66\u5143\u7d20\u65b0\u6625\u5bf9\u8054 \u7f51\u53cb\uff1a\u903c\u6b7b\u6587\u79d1\u751f\u3011\u8fd1\u65e5\uff0c@\u5357\u4eac\u6797\u4e1a\u5927\u5b66 \u7684\u5b66\u751f\u5bbf\u820d\u95e8\u53e3\u8d34\u4e86\u4e00\u526f\u7279\u522b\u7684\u5bf9\u8054\uff0c\u5185\u5bb9\u7528\u5316\u5b66\u5143\u7d20\u5468\u671f\u8868\u91cc\u7684\u5143\u7d20\u7b26\u53f7\u62fc\u6210\uff0c\u7f51\u53cb\u76f4\u547c\u770b\u4e0d\u61c2\uff01\u636e\u6089\uff0c\u521b\u4f5c\u5bf9\u8054\u7684\u662f\u8be5\u6821\u751f\u7269\u4e0e\u73af\u5883\u5b66\u9662\u7684\u5927\u4e00\u5b66\u751f\u535e\u6b63\uff0c\u5bf9\u8054\u521b\u610f\u662f\u4ed6\u548c\u9ad8\u4e2d\u540c\u5b66\u4eec\u60f3\u51fa\u6765\u7684\u3002\u53ea\u770b\u56fe1\uff0c\u4f60\u80fd\u731c\u51fa\u662f\u4ec0\u4e48\u5417\uff1f
", "link": "http://news.163.com/18/0119/16/D8HD3PFD0001875P.html", "title": ["\u9ad8\u6821\u4e00\u5bbf\u820d\u8d34\u5316\u5b66\u5143\u7d20\u65b0\u6625\u5bf9\u8054 \u7f51\u53cb:\u903c\u6b7b\u6587\u79d1\u751f"]}} -------------------------------------------------------------------------------- /机器学习入门/无监督/cluster/city.txt: -------------------------------------------------------------------------------- 1 | 北京 2959.19 730.79 749.41 513.34 467.87 1141.82 478.42 457.64 2 | 天津 2459.77 495.47 697.33 302.87 284.19 735.97 570.84 305.08 3 | 河北 1495.63 515.90 362.37 285.32 272.95 540.58 364.91 188.63 4 | 山西 1406.33 477.77 290.15 208.57 201.50 414.72 281.84 212.10 5 | 内蒙古 1303.97 524.29 254.83 192.17 249.81 463.09 287.87 192.96 6 | 辽宁 1730.84 553.90 246.91 279.81 239.18 445.20 330.24 163.86 7 | 吉林 1561.86 492.42 200.49 218.36 220.69 459.62 360.48 147.76 8 | 黑龙江 1410.11 510.71 211.88 277.11 224.65 376.82 317.61 152.85 9 | 上海 3712.31 550.74 893.37 346.93 527.00 1034.98 720.33 462.03 10 | 江苏 2207.58 449.37 572.40 211.92 302.09 585.23 429.77 252.54 11 | 浙江 2629.16 557.32 689.73 435.69 514.66 795.87 575.76 323.36 12 | 安徽 1844.78 430.29 271.28 126.33 250.56 513.18 314.00 151.39 13 | 福建 2709.46 428.11 334.12 160.77 405.14 461.67 535.13 232.29 14 | 江西 1563.78 303.65 233.81 107.90 209.70 393.99 509.39 160.12 15 | 山东 1675.75 613.32 550.71 219.79 272.59 599.43 371.62 211.84 16 | 河南 1427.65 431.79 288.55 208.14 217.00 337.76 421.31 165.32 17 | 湖北 1783.43 511.88 282.84 201.01 237.60 617.74 523.52 182.52 18 | 湖南 1942.23 512.27 401.39 206.06 321.29 697.22 492.60 226.45 19 | 广东 3055.17 353.23 564.56 356.27 811.88 873.06 1082.82 420.81 20 | 广西 2033.87 300.82 338.65 157.78 329.06 621.74 587.02 218.27 21 | 海南 2057.86 186.44 202.72 171.79 329.65 477.17 312.93 279.19 22 | 重庆 2303.29 589.99 516.21 236.55 403.92 730.05 438.41 225.80 23 | 四川 1974.28 507.76 344.79 203.21 240.24 575.10 430.36 223.46 24 | 贵州 1673.82 437.75 461.61 153.32 254.66 445.59 346.11 191.48 25 | 云南 2194.25 537.01 369.07 249.54 
290.84 561.91 407.70 330.95 26 | 西藏 2646.61 839.70 204.44 209.11 379.30 371.04 269.59 389.33 27 | 陕西 1472.95 390.89 447.95 259.51 230.61 490.90 469.10 191.34 28 | 甘肃 1525.57 472.98 328.90 219.86 206.65 449.69 249.66 228.19 29 | 青海 1654.69 437.77 258.78 303.00 244.93 479.53 288.56 236.51 30 | 宁夏 1375.46 480.89 273.84 317.32 251.08 424.75 228.73 195.93 31 | 新疆 1608.82 536.05 432.46 235.82 250.28 541.30 344.85 214.40 -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HLN6QA0001875P.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HLN6QA0001875P", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8HLN6QA0001875P.html"}, "newsId": "D8HLN6QA0001875P", "contents": {"passage": "\n \uff08\u539f\u6807\u9898\uff1a\u6d59\u6c5f\u91d1\u534e\u4e00\u5382\u623f\u5012\u584c8\u4eba\u88ab\u538b\uff0c\u5176\u4e2d2\u4eba\u5f53\u573a\u6b7b\u4ea1\uff09\n
\u4e8b\u6545\u73b0\u573a
\u592e\u89c6\u65b0\u95fb\u5ba2\u6237\u7aef1\u670819\u65e5\u6d88\u606f\uff0c\u4eca\u5929\u4e2d\u534812\u70b942\u5206\u5de6\u53f3\uff0c\u91d1\u534e\u5e02\u6d88\u9632\u652f\u961f\u6c5f\u5357\u6d88\u9632\u5927\u961f\u91d1\u78d0\u8def\u6d88\u9632\u4e2d\u961f\u63a5\u5230\u62a5\u8b66\u79f0\uff1a\u91d1\u534e\u5e02\u91d1\u4e1c\u533a\u591a\u6e56\u6c40\u6751\u6709\u623f\u5c4b\u53d1\u751f\u5012\u584c\u3002\u91d1\u534e\u5e02\u6d88\u9632\u652f\u961f\u7acb\u5373\u6307\u6d3e6\u8f6630\u4f4d\u6d88\u9632\u5b98\u5175\u8d76\u5f80\u73b0\u573a\u6551\u63f4\u3002\u521d\u6b65\u4f30\u8ba1\u516b\u4eba\u88ab\u538b\uff0c\u5176\u4e2d2\u4eba\u5f53\u573a\u6b7b\u4ea1\uff0c6\u4eba\u88ab\u9001\u5f80\u533b\u9662\u6551\u6cbb\u3002\u622a\u81f3\u76ee\u524d\uff0c\u6551\u63f4\u4ecd\u5728\u8fdb\u884c\u4e2d\u3002
\n
", "link": "http://news.163.com/18/0119/19/D8HLN6QA0001875P.html", "title": ["\u6d59\u6c5f\u91d1\u534e\u4e00\u5382\u623f\u5012\u584c8\u4eba\u88ab\u538b 2\u4eba\u5f53\u573a\u6b7b\u4ea1"]}} -------------------------------------------------------------------------------- /ACM/cf/675E DP+greedy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/22 15:35 4 | # @Author : mazicwong 5 | # @File : 675E DP+greedy.py 6 | 7 | ''' 8 | 英文: buy only tickets to stations from i+1 to ai inclusive (inclusive 表示包含在这个路段内的) 9 | 10 | 题意:有一个一条直线的地铁线路。给出a数组,在每个站点i只能买到去往[i+1, a[i]]内的票。 11 | 设p(i,j)为从i到j所需要的最少票数,求对所有ij的p(i,j)的和。(1== n): 57 | dp[i] = n - i 58 | else: 59 | x = argmax(que, a[i]) 60 | dp[i] = x - i + dp[x] + n - a[i] 61 | while (len(que) > 0 and que[-1]['a'] < a[i]): 62 | que.pop() 63 | que.append({'i': i, 'a': a[i]}) 64 | return sum(dp) 65 | 66 | 67 | n = int(input()) 68 | a = map(int, input().split(' ')) 69 | print(solve(n, a)) 70 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20161227/014055.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "014055", "comments": {"link": "http://coral.qq.com/1687716811"}, "date": "20161227", "contents": {"link": "https://news.qq.com/a/20161227/014055.htm", "title": ["\u6cb3\u5357\u5c0f\u4f19\u627f\u5305\u5343\u8f86\u51fa\u79df\u8f66\u9876\u706f \u6253\u51fa\u6211\u7231\u4f60\u8868\u767d"], "passage": 
"\u8fd9\u8f86\u51fa\u79df\u8f66\u9876\u706f\u5c4f\u4e0a\u51fa\u73b0\u201c\u4f60\u662f\u6211\u7684\u552f\u4e00\u201d12\u670825\u65e5\uff0c\u662f\u897f\u65b9\u7684\u5723\u8bde\u8282\uff0c\u8bb8\u591a\u60c5\u4fa3\uff0c\u9009\u62e9\u5728\u8fd9\u4e00\u5929\u8868\u767d\u3002\u5f53\u5929\uff0c\u8bb0\u8005\u7684\u670b\u53cb\u5708\u88ab\u8fd9\u6837\u7684\u7167\u7247\u5237\u5c4f\u4e86\uff0c\u5185\u5bb9\u4e3a\u201c\u90ed\u00d7\u00d7\u6211\u7231\u4f60\u201d\u201c\u4f60\u662f\u6211\u7684\u552f\u4e00\u201d\u7684\u8868\u767d\uff0c\u5728\u4fe1\u9633\u7684\u51fa\u79df\u8f66\u9876\u706f\u5c4f\u4e0a\u51fa\u73b0\u3002\u8fd9\u5219\u201c\u571f\u8c6a\u5f0f\u201d\u7684\u8868\u767d\uff0c\u5f15\u8d77\u4e0d\u5c11\u8fc7\u5f80\u8def\u4eba\u7684\u5173\u6ce8\uff0c\u4e0d\u5c11\u7f51\u53cb\u8868\u793a\u770b\u5230\u4e86\u8fd9\u5219\u8868\u767d\u3002\u7f51\u53cb\u7eb7\u7eb7\u8bc4\u8bba\uff1a\u201c\u8c01\u8fd9\u4e48\u571f\u8c6a\uff0c\u627f\u5305\u4e86\u51fa\u79df\u8f66\u9876\u706f\uff1f\u201d\u201c\u8fd9\u4f4d\u53eb\u90ed\u00d7\u00d7\u7684\u59b9\u5b50\u4e5f\u592a\u5e78\u798f\u4e86\u5427\u3002\u201d\u8fd9\u4e2a\u5c0f\u4f19\u7684\u8868\u767d\u4e5f\u5f97\u5230\u7f51\u53cb\u4e00\u81f4\u795d\u798f\u3002\u6628\u65e5\u4e0b\u5348\uff0c\u8bb0\u8005\u4e86\u89e3\u5230\uff0c\u4fe1\u9633\u5e02\u51fa\u79df\u8f66\u4e0a\u7684\u9876\u706f\u5c4f\u5e7f\u544a\u90fd\u662f\u7531\u4fe1\u9633\u67d0\u5bb6\u5e7f\u544a\u516c\u53f8\u7edf\u4e00\u8fd0\u8425\uff0c\u8fd9\u4f4d\u5c0f\u4f19\u4e00\u5171\u5305\u4e861000\u591a\u8f86\u51fa\u79df\u8f66\uff0c\u4ef7\u683c\u4e0a\u5343\u5143\u3002\u201c\u5728\u516c\u53f8\u5e72\u4e86\u8fd9\u4e48\u4e45\uff0c\u7b2c\u4e00\u6b21\u89c1\u8fd9\u6837\u7684\u4e8b\u60c5\uff0c\u8fd9\u5c0f\u4f19\u7684\u60f3\u6cd5\u592a\u65b0\u9896\u4e86\uff0c\u628a\u72d7\u7cae\u6492\u904d\u4e86\u5168\u57ce\u5440\u3002\u201d\u4e00\u540d\u7684\u54e5\u544a\u8bc9\u8bb0\u8005\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/006763.json: 
-------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "006763", "comments": {"link": "http://coral.qq.com/2369396685"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/006763.htm", "title": ["\u6c55\u5934\u8b66\u65b9\uff1a\u731b\u72ee\u5de5\u4e1a\u96c6\u56e2\u603b\u7ecf\u7406\u521d\u67e5\u4e3a\u610f\u5916\u5760\u4ea1 \u5c06\u8ffd\u8d23\u9020\u8c23\u8005"], "passage": "\u6c55\u5934\u5e02\u516c\u5b89\u5c40\u6f84\u6d77\u5206\u5c40\u5fae\u4fe1\u516c\u53f7\u201c\u5e73\u5b89\u6f84\u6d77\u201d2018\u5e741\u670818\u65e5\u6d88\u606f\uff1a1\u670818\u65e5\u51cc\u6668\uff0c\u6211\u5c40\u5e7f\u76ca\u6d3e\u51fa\u6240\u63a5\u5e7f\u4e1c\u731b\u72ee\u5de5\u4e1a\u96c6\u56e2\u6709\u9650\u516c\u53f8\u7ba1\u7406\u4eba\u5458\u6797\u5fb7\u8d35\u62a5\u79f0\uff1a\u5176\u516c\u53f8\u603b\u7ecf\u7406\u9648\u4e50\u5f3a\u4e8e2018\u5e741\u67088\u65e5\u4e0d\u5e78\u901d\u4e16\uff0c\u8fd1\u671f\u7f51\u7edc\u4e0a\u51fa\u73b0\u5bf9\u9648\u4e50\u5f3a\u6b7b\u56e0\u6076\u610f\u4e2d\u4f24\u7684\u5fae\u535a\u548c\u89c6\u9891\u62a5\u9053\uff0c\u5bf9\u9648\u4e50\u5f3a\u7684\u58f0\u8a89\u548c\u516c\u53f8\u6b63\u5e38\u7ecf\u8425\u9020\u6210\u4e0d\u826f\u5f71\u54cd\uff0c\u5e76\u8981\u6c42\u4e25\u60e9\u9020\u8c23\u8005\u3002\u63a5\u62a5\u540e\uff0c\u6211\u5c40\u5e7f\u76ca\u6d3e\u51fa\u6240\u8fc5\u901f\u5f00\u5c55\u8c03\u67e5\u3002\u636e\u9648\u4e50\u5f3a\u5bb6\u5c5e\u53cd\u6620\uff0c\u6839\u636e\u65b0\u52a0\u5761\u8b66\u65b9\u544a\u77e5\u7684\u521d\u6b65\u8c03\u67e5\u7ed3\u679c\uff0c\uff0c\u6b63\u5f0f\u6b7b\u4ea1\u62a5\u544a\u8981\u7b49\u8b66\u65b9\u7ed3\u6848\u540e\uff0c\u62a5\u7ecf\u6cd5\u9662\u88c1\u51b3\u540e\u624d\u6b63\u5f0f\u901a\u77e5\u4e2d\u56fd\u9a7b\u65b0\u52a0\u5761\u5927\u4f7f\u9986\u3002\u9274\u4e8e\u8fd1\u671f\u7f51\u7edc\u5a92\u4f53\u4f20\u64ad\u9648\u4e50\u5f3a\u6b7b\u56e0\u53ca\u5176\u4ed6\u4fe1\u606f\u7684\u60c5\u51b5\uff0c\u8bf7\u5e7f\u5927\u7f51\u6c11\u4e0d\u8981\u4f20\u64ad\u672a\u7ecf\u6838\u5b9e\u7684\
u4fe1\u606f\uff0c\u5bf9\u4e8e\u9020\u8c23\u3001\u4f20\u8c23\u6d89\u5acc\u8fdd\u6cd5\u7684\uff0c\u516c\u5b89\u673a\u5173\u5c06\u4f9d\u6cd5\u8ffd\u7a76\u76f8\u5173\u4eba\u5458\u7684\u6cd5\u5f8b\u8d23\u4efb\u3002\u76ee\u524d\uff0c\u6709\u5173\u60c5\u51b5\u6b63\u5728\u8fdb\u4e00\u6b65\u8c03\u67e5\u4e2d\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180120/D8J1VDAJ0001875P.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D8J1VDAJ0001875P", "date": "20180120", "source": "netease", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8J1VDAJ0001875P.html"}, "contents": {"title": ["\u5927\u96fe\u9ec4\u8272\u9884\u8b66\uff018\u7701\u6709\u6d53\u96fe \u5c40\u5730\u80fd\u89c1\u5ea6\u4e0d\u8db350\u7c73"], "link": "http://news.163.com/18/0120/07/D8J1VDAJ0001875P.html", "passage": "\n \uff08\u539f\u6807\u9898\uff1a\u5927\u96fe\u9ec4\u8272\u9884\u8b66 \u6c5f\u82cf\u5b89\u5fbd\u6cb3\u5357\u6e56\u5317\u7b49\u5730\u90e8\u5206\u5730\u533a\u6709\u6d53\u96fe\uff09\n
\u4e2d\u56fd\u5929\u6c14\u7f51\u8baf \u4e2d\u592e\u6c14\u8c61\u53f01\u670820\u65e506\u65f6\u7ee7\u7eed\u53d1\u5e03\u5927\u96fe\u9ec4\u8272\u9884\u8b66\uff1a
\u9884\u8ba1\uff0c20\u65e5\u65e9\u6668\u81f3\u4e0a\u5348\uff0c\u5c71\u4e1c\u5317\u90e8\u548c\u5357\u90e8\u3001\u6cb3\u5357\u4e1c\u90e8\u3001\u6e56\u5317\u4e2d\u4e1c\u90e8\u3001\u6c5f\u82cf\u5927\u90e8\u3001\u5b89\u5fbd\u5927\u90e8\u3001\u6d59\u6c5f\u5317\u90e8\u3001\u91cd\u5e86\u4e2d\u90e8\u3001\u8d35\u5dde\u5317\u90e8\u548c\u4e2d\u90e8\u7b49\u5730\u6709\u5927\u96fe\uff0c\u5176\u4e2d\u6c5f\u82cf\u4e2d\u5317\u90e8\u3001\u5b89\u5fbd\u4e2d\u5317\u90e8\u3001\u6cb3\u5357\u4e1c\u90e8\u3001\u6e56\u5317\u4e2d\u90e8\u7b49\u5730\u7684\u90e8\u5206\u5730\u533a\u6709\u80fd\u89c1\u5ea6\u4f4e\u4e8e500\u7c73\u7684\u6d53\u96fe\uff0c\u5c40\u5730\u6709\u80fd\u89c1\u5ea6\u4e0d\u8db350\u7c73\u7684\u7279\u5f3a\u6d53\u96fe\u3002
![]()
\n
\u9632\u5fa1\u6307\u5357\uff1a
1\u3001\u7531\u4e8e\u80fd\u89c1\u5ea6\u8f83\u4f4e\uff0c\u9a7e\u9a76\u4eba\u5458\u5e94\u63a7\u5236\u901f\u5ea6\uff0c\u786e\u4fdd\u5b89\u5168;
2\u3001\u673a\u573a\u3001\u9ad8\u901f\u516c\u8def\u3001\u8f6e\u6e21\u7801\u5934\u91c7\u53d6\u63aa\u65bd\uff0c\u4fdd\u4ea4\u901a\u5b89\u5168\u3002
"}, "cmtId": "D8J1VDAJ0001875P"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8H1O67B0001899N.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D8H1O67B0001899N", "date": "20180119", "source": "netease", "comments": {"link": "http://comment.news.163.com/news2_bbs/D8H1O67B0001899N.html"}, "contents": {"title": ["\u7537\u5b50\u5230\u7a97\u53e3\u5904\u74064\u6761\u7f5a\u5355 \u529e\u4e8b\u5458\u5904\u7406\u4e00\u534a\u8bf4\"\u4e0b\u73ed\u4e86\""], "link": "http://news.163.com/18/0119/13/D8H1O67B0001899N.html", "passage": "\n\t\n\t
\u3010\u56db\u5f20\u7f5a\u5355\u5904\u7406\u4e24\uff0c\u529e\u4e8b\u5458\uff1a\u201c\u6211\u4e0b\u73ed\u4e86\u201d\u3011\u8fd1\u65e5\uff0c\u8d35\u5dde\u8d35\u9633\u7684\u8bb8\u5e08\u5085\u5230\u8f66\u7ba1\u6240\u529e\u7406\u8fdd\u7ae0\uff0c2\u670d\u52a1\u7a97\u53e3\u53ea\u5f001\u4e2a\u30024\u5c0f\u65f6\u540e\u8f6e\u5230\u4ed6\uff0c4\u6761\u8fdd\u7ae0\u521a\u529e2\u6761\uff0c\u529e\u4e8b\u5458\u8bf4\u201c\u6211\u8981\u4e0b\u73ed\u4e86\u201d\u3002\u5176\u95f4\uff0c\u5173\u95ed\u7684\u53e61\u4e2a\u7a97\u53e3\u5374\u4e3a\u201c\u719f\u4eba\u201d\u529e\u4e1a\u52a1\u3002
\n
"}, "cmtId": "D8H1O67B0001899N"} -------------------------------------------------------------------------------- /机器学习入门/keras/mnist.py: -------------------------------------------------------------------------------- 1 | 2 | from keras.models import Sequential 3 | from keras.layers.core import Dense,Dropout,Activation 4 | from keras.optimizers import SGD 5 | from keras.datasets import mnist 6 | import numpy 7 | import h5py # save model 8 | 9 | ''' 10 | 第一步:选择模型 11 | ''' 12 | model = Sequential() 13 | 14 | ''' 15 | 第二步:构建网络层 16 | ''' 17 | model.add(Dense(500,input_shape=(784,))) # 输入层,28*28=784 (输入维度784,输出500个特征) 18 | model.add(Activation('tanh')) # 激活函数是tanh 19 | model.add(Dropout(0.5)) # 采用50%的dropout 20 | 21 | model.add(Dense(500)) # 隐藏层节点500个 22 | model.add(Activation('tanh')) 23 | model.add(Dropout(0.5)) 24 | 25 | model.add(Dense(10)) # 输出结果是10个类别,所以维度是10 26 | model.add(Activation('softmax')) # 最后一层用softmax作为激活函数 27 | 28 | ''' 29 | 第三步:编译 30 | ''' 31 | sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True) # 优化函数,设定学习率(lr)等参数 32 | model.compile(loss='categorical_crossentropy', optimizer=sgd) #, class_mode='categorical') # 使用交叉熵作为loss函数 33 | 34 | ''' 35 | 第四步:训练 36 | .fit的一些参数 37 | batch_size:对总的样本数进行分组,每组包含的样本数量 38 | epochs :训练次数 39 | shuffle:是否把数据随机打乱之后再进行训练 40 | validation_split:拿出百分之多少用来做交叉验证 41 | verbose:屏显模式 0:不输出 1:输出进度 2:输出每次的训练结果 42 | ''' 43 | (X_train, y_train), (X_test, y_test) = mnist.load_data() # 使用Keras自带的mnist工具读取数据(第一次需要联网) 44 | # 由于mist的输入数据维度是(num, 28, 28),这里需要把后面的维度直接拼起来变成784维 45 | X_train = X_train.reshape(X_train.shape[0], X_train.shape[1] * X_train.shape[2]) 46 | X_test = X_test.reshape(X_test.shape[0], X_test.shape[1] * X_test.shape[2]) 47 | Y_train = (numpy.arange(10) == y_train[:, None]).astype(int) # 把index转换为一个one hot的矩阵 48 | Y_test = (numpy.arange(10) == y_test[:, None]).astype(int) # Y_test.shape 49 | 50 | model.fit(X_train,Y_train,batch_size=200,epochs=1,shuffle=True,verbose=1,validation_split=0.3) # loss 0.54 -> 0.22 51 | 
model.evaluate(X_test, Y_test, batch_size=200, verbose=1) 52 | 53 | ''' 54 | 第五步:输出 55 | ''' 56 | print("test set") 57 | scores = model.evaluate(X_test,Y_test,batch_size=200,verbose=0) 58 | print("") 59 | print("The test loss is %f" % scores) 60 | result = model.predict(X_test,batch_size=200,verbose=0) 61 | 62 | result_max = numpy.argmax(result, axis = 1) 63 | test_max = numpy.argmax(Y_test, axis = 1) 64 | 65 | result_bool = numpy.equal(result_max, test_max) 66 | true_num = numpy.sum(result_bool) 67 | print("") 68 | print("The accuracy of the model is %f" % (true_num/len(result_bool))) 69 | 70 | 71 | ''' 72 | 第六步:保存模型(可选) 73 | ''' 74 | # model.save('my_model.h5') 75 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20161227/007056.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "007056", "comments": {"link": "http://coral.qq.com/1687570251"}, "date": "20161227", "contents": {"link": "https://news.qq.com/a/20161227/007056.htm", "title": ["\u6e56\u5357\u4e00\u5c0f\u5b66\u804c\u5de5\u7325\u4eb5\u5973\u751f \u5973\u5a7f\u7cfb\u8be5\u6821\u6559\u5bfc\u4e3b\u4efb"], "passage": 
"\u6e56\u5357\u90b5\u9633\u4e00\u5c0f\u5b66\u98df\u5802\u7537\u5b50\u7325\u4eb511\u5c81\u5973\u751f\uff0c\u90b5\u9633\u5e02\u516c\u5b89\u5c40\u53cc\u6e05\u5206\u5c4026\u65e522\u65f6\u8bb8\u53d1\u5e03\u901a\u62a5\u79f0\uff0c12\u670824\u65e5\uff0c\u90b5\u9633\u5e02\u516c\u5b89\u5c40\u53cc\u6e05\u5206\u5c40\u7834\u83b7\u4e00\u8d77\u7325\u4eb5\u513f\u7ae5\u6848\uff0c\u72af\u7f6a\u5acc\u7591\u4eba\u88ab\u4f9d\u6cd5\u91c7\u53d6\u5211\u4e8b\u5f3a\u5236\u63aa\u65bd\u3002\u901a\u62a5\u79f0\uff0c\u72af\u7f6a\u5acc\u7591\u4eba\u5218\u67d0\u4eca\u5e7464\u5c81\uff0c\u5c0f\u5b66\u6587\u5316\uff0c\u65b0\u90b5\u53bf\u576a\u4e0a\u9547\u4eba\uff0c\u79df\u4f4f\u5728\u90b5\u9633\u5e02\u53cc\u6e05\u533a\u67d0\u5c0f\u5b66\u5916\u67d0\u6c11\u623f\u5185\uff0c\u7cfb\u8be5\u5c0f\u5b66\u52e4\u6742\u5de5\u300212\u670824\u65e5\u4e0b\u5348\uff0c\u8be5\u5206\u5c40\u77f3\u6865\u6d3e\u51fa\u6240\u63a5\u5230\u62a5\u8b66\uff0c\u8f96\u533a\u67d0\u5c0f\u5b66\u5185\u6709\u4eba\u6253\u67b6\u3002\u6c11\u8b66\u8fc5\u901f\u8d76\u5230\u73b0\u573a\uff0c\u5c06\u53cc\u65b9\u5e26\u56de\u516c\u5b89\u673a\u5173\u8fdb\u884c\u8c03\u67e5\u3002\u7ecf\u67e5\uff0c\u5f53\u65e5\u5973\u751f\u5bb6\u957f\u5f97\u77e5\u5f53\u4e8b\u5973\u751f\u88ab\u5218\u67d0\u591a\u6b21\u7325\u4eb5\u540e\uff0c\u4fbf\u6765\u5230\u5b66\u6821\u627e\u5176\u7406\u8bba\uff0c\u53cc\u65b9\u53d1\u751f\u4e89\u6267\uff0c\u5218\u67d0\u906d\u5973\u751f\u5bb6\u5c5e\u6bb4\u6253\u3002\u7ecf\u5ba1\u8baf\uff0c\u72af\u7f6a\u5acc\u7591\u4eba\u5218\u67d0\u5bf9\u7325\u4eb5\u8be5\u5973\u751f\u7684\u72af\u7f6a\u4e8b\u5b9e\u4f9b\u8ba4\u4e0d\u8bb3\u3002\u6f8e\u6e43\u65b0\u95fb\u6b64\u524d\u62a5\u9053\uff0c\u6d89\u6848\u7537\u5b50\u5218\u67d0\u7cfb\u90b5\u9633\u5e02\u53cc\u6e05\u533a\u4f58\u6e56\u5c0f\u5b66\u98df\u5802\u5de5\u4f5c\u4eba\u5458\uff0c\u6d89\u5acc\u7325\u4eb5\u8be5\u6821\u4e00\u540d11\u5c81\u7684\u4e94\u5e74\u7ea7\u5973\u751f\u3002\u5218\u67d0\u5728\u5b66\u6821\u5de5\u4f5c\u4e00\u5e74\u591a\u65f6\u95f4\uff0c\u5176\u5973\u5a7f\u662f\u4f58\u6e56\u5c0f\u5b66\
u6559\u5bfc\u5904\u4e3b\u4efb\u3002\u4f58\u6e56\u5c0f\u5b66\u6821\u957f\u5f20\u98de\u8dc312\u670826\u65e5\u4e0b\u5348\u544a\u8bc9\u6f8e\u6e43\u65b0\u95fb\uff0c\u901a\u62a5\u79f0\uff0c\u76ee\u524d\uff0c\u6848\u4ef6\u6b63\u5728\u8fdb\u4e00\u6b65\u4fa6\u67e5\u4e2d\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/002903.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "002903", "comments": {"link": "http://coral.qq.com/2369176633"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/002903.htm", "title": ["\u4eca\u5e74\u6625\u8fd0\u56de\u7a0b\u706b\u8f66\u7968\u9996\u6b21\u6253\u6298 \u90e8\u5206\u56de\u7a0b\u7968\u6700\u4f4e8\u6298"], "passage": "2018\u5e74\u6625\u8fd0\u81ea2\u67081\u53f7\u5f00\u59cb\uff0c3\u670812\u53f7\u7ed3\u675f\uff0c\u517140\u5929\u3002\u4eca\u5e74\u6625\u8fd0\u671f\u95f4\uff0c\u94c1\u8def\u90e8\u95e8\u9996\u6b21\u5bf9\u90e8\u5206\u589e\u5f00\u7684\u5217\u8f66\u56de\u7a0b\u7968\u4ef7\u8bd5\u70b9\u6298\u6263\uff0c\u5728\u73b0\u884c\u7968\u4ef7\u57fa\u7840\u4e0a\u5b9e\u884c8~9\u6298\u4f18\u60e0\u3002\u7531\u4e8e\u6625\u8fd0\u5177\u6709\u5355\u65b9\u5411\u5ba2\u6d41\u7279\u70b9\uff0c\u90e8\u5206\u5217\u8f66\u53bb\u7a0b\u5ba2\u6d41\u96c6\u4e2d\u4f46\u8fd4\u7a0b\u5ba2\u6d41\u8f83\u5c11\u3002\u6b64\u6b21\u56de\u7a0b\u65b9\u5411\u90e8\u5206\u5217\u8f66\u8bd5\u70b9\u7968\u4ef7\u6253\u6298\uff0c\u4e3b\u8981\u56f4\u7ed5\u4eac\u6d25\u3001\u6caa\u676d\u3001\u5e7f\u6df13\u4e2a\u5730\u533a\u59cb\u53d1\u7ec8\u5230\u7684\u5217\u8f66\uff0c\u8282\u524d\u8282\u540e\u5206\u522b\u9009\u53d6\u4e8632\u8d9f\u5217\u8f66\u5b9e\u884c\u6253\u6298\u4f18\u60e0\u30022\u67081\u65e5\u81f32\u670815\u65e5\uff0c\u8282\u524d\u56de\u7a0b\u65b9\u5411\u6253\u6298\u768432\u8d9f\u5217\u8f66\u5206\u522b\u4e3a\uff1a\u4e1c\u5317\u3001\u6210\u6e1d\u3001\u5408\u961c\u7b49\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u
4eac\u6d25\u5730\u533a\u768412\u8d9f\uff1b\u6210\u6e1d\u3001\u6cb3\u5357\u3001\u897f\u5b89\u7b49\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u6caa\u676d\u5730\u533a\u768411\u8d9f\uff1b\u6210\u6e1d\u3001\u6e56\u5357\u7b49\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u5e7f\u6df1\u5730\u533a\u76849\u8d9f\u30022\u670816\u65e5\u81f33\u670812\u65e5\uff0c\u8282\u540e\u56de\u7a0b\u65b9\u5411\u6253\u6298\u768432\u8d9f\u5217\u8f66\u5206\u522b\u4e3a\uff1a\u4eac\u6d25\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u4e1c\u5317\u3001\u6210\u6e1d\u3001\u5408\u961c\u7b49\u65b9\u541111\u8d9f\uff1b\u6caa\u676d\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u6210\u6e1d\u3001\u6cb3\u5357\u3001\u897f\u5b89\u7b49\u65b9\u541110\u8d9f\uff1b\u5e7f\u6df1\u5730\u533a\u59cb\u53d1\uff0c\u7ec8\u5230\u6210\u6e1d\u3001\u6e56\u5357\u3001\u5357\u660c\u7b49\u65b9\u541111\u8d9f\u3002\u94c1\u8def\u90e8\u95e8\u63d0\u793a\uff0c\u65c5\u5ba2\u670b\u53cb\u53ef\u901a\u8fc7\u4e2d\u56fd\u94c1\u8def\u5ba2\u6237\u670d\u52a1\u4e2d\u5fc312306\u7f51\u7ad9\u67e5\u8be2\u5177\u4f53\u6298\u6263\u8f66\u6b21\u76f8\u5173\u4fe1\u606f\uff0c\u5408\u7406\u5b89\u6392\u51fa\u884c\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180120/D8IUD7L60001899O.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D8IUD7L60001899O", "date": "20180120", "source": "netease", "comments": {"link": "http://comment.news.163.com/news2_bbs/D8IUD7L60001899O.html"}, "contents": {"title": ["\u7f8e\u53f8\u6cd5\u90e8\u5c06\u4ee5\u6b7b\u5211\u8d77\u8bc9\u7ae0\u83b9\u9896\u6848\u5acc\u72af \u5bb6\u5c5e\u8868\u793a\u6b23\u6170"], "link": "http://news.163.com/18/0120/06/D8IUD7L60001899O.html", "passage": "\n \uff08\u539f\u6807\u9898\uff1a\u7f8e\u56fd\u53f8\u6cd5\u90e8\u90e8\u957f\u51b3\u5b9a\u4ee5\u6b7b\u5211\u8d77\u8bc9\u6d89\u5acc\u7ed1\u67b6\u81f4\u6b7b\u7ae0\u83b9\u9896\u7684\u5acc\u72af\uff09\n
![]()
\u3010\u7f8e\u56fd\u53f8\u6cd5\u90e8\u90e8\u957f\u51b3\u5b9a\u4ee5\u6b7b\u5211\u8d77\u8bc9\u6d89\u5acc\u7ed1\u67b6\u81f4\u6b7b\u7ae0\u83b9\u9896\u7684\u5acc\u72af\u3011\u7f8e\u56fd\u8054\u90a6\u653f\u5e9c\u4e8e\u5f53\u5730\u65f6\u95f4\u5468\u4e94\u4e0b\u5348\u53d1\u8868\u7531\u7f8e\u56fd\u53f8\u6cd5\u90e8\u90e8\u957f\u6770\u592b\u00b7\u585e\u7533\u65af\uff08Jeff Sessions\uff09\u7b7e\u7f72\u7684\u6587\u4ef6\uff0c\u51b3\u5b9a\u5bf9\u6d89\u5acc\u7ed1\u67b6\u81f4\u6b7b\u4e2d\u56fd\u8bbf\u95ee\u5b66\u8005\u7ae0\u83b9\u9896\u7684\u5acc\u72af\u5e03\u5170\u767b\u7279\u514b\u91cc\u65af\u6ed5\u68ee\uff08Brendt Christensen\uff09\u5bfb\u6c42\u6b7b\u5211\u3002\u8fd9\u4efd\u6587\u4ef6\u6307\u51fa\u6839\u636e\u5927\u966a\u5ba1\u56e22017\u5e7410\u67083\u65e5\u5bf9\u514b\u91cc\u65af\u6ed5\u68ee\u63d0\u51fa\u7684\u8ffd\u52a0\u8d77\u8bc9\u4e66\u5185\u5bb9 \u2014 \u5acc\u72af\u6545\u610f\u975e\u6cd5\u631f\u6301\u3001\u7981\u9522\u3001\u8bf1\u9a97\u3001\u7ed1\u67b6\u3001\u52ab\u6301\u7ae0\u83b9\u9896\u5e76\u6700\u7ec8\u5bfc\u81f4\u5176\u6b7b\u4ea1\uff0c \u4ee5\u6b7b\u5211\u8d77\u8bc9\u5acc\u72af\u662f\u5408\u7406\u7684\u3002\u7ae0\u83b9\u9896\u5bb6\u4eba\u7684\u4ee3\u7406\u5f8b\u5e08\u738b\u5fd7\u4e1c\u8868\u793a\uff0c\u5bb6\u4eba\u5bf9\u53f8\u6cd5\u90e8\u957f\u7684\u51b3\u5b9a\u8868\u793a\u6b23\u6170\uff0c\u611f\u8c22\u4ed6\u548c\u5f53\u5730\u68c0\u5bdf\u5b98\u8003\u8651\u5e76\u5c0a\u91cd\u5bb6\u4eba\u7684\u8bf7\u6c42\uff0c\u505a\u51fa\u4e86\u4e0e\u5bb6\u4eba\u610f\u613f\u76f8\u7b26\u7684\u51b3\u5b9a\u3002\u76ee\u524d\uff0c\u539f\u5b9a\u4e8e2\u670827\u65e5\u5f00\u5ba1\u7684\u65f6\u95f4\u4e0d\u53d8\u3002
"}, "cmtId": "D8IUD7L60001899O"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/004328.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "004328", "comments": {"link": "http://coral.qq.com/2369236201"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/004328.htm", "title": ["2018\u6625\u8282\u9ec4\u91d1\u5468\u653e\u5047\u53bb\u54ea\u73a9\uff1f\u4e09\u4e9a\u3001\u54c8\u5c14\u6ee8\u4e3a\u70ed\u95e8\u76ee\u7684\u5730"], "passage": "2018\u5e74\u7684\u6625\u8282\u4e00\u5929\u5929\u4e34\u8fd1\uff0c\u867d\u7136\u8fc7\u5e74\u56de\u5bb6\u662f\u4e2d\u56fd\u4eba\u7684\u4f20\u7edf\uff0c\u800c\u636e\u4e2d\u56fd\u65c5\u6e38\u7814\u7a76\u9662\u8c03\u67e5\u663e\u793a\uff0c\u4eca\u5e74\u6625\u8282\uff0c\u5927\u5bb6\u7684\u51fa\u6e38\u610f\u613f\u4e5f\u5f88\u5f3a\u70c8\u3002\u6570\u636e\u663e\u793a\uff0c2018\u5e74\u7b2c\u4e00\u5b63\u5ea6\u5c45\u6c11\u51fa\u6e38\u610f\u613f\u4e3a83%\uff0c\u800c\u9009\u62e9\u5728\u6625\u8282\u671f\u95f4\u51fa\u6e38\u7684\u6e38\u5ba2\u5360\u4e00\u5b63\u5ea6\u6e38\u5ba2\u768448.9%\uff0c\u7814\u5b66\u3001\u6d77\u5c9b\u6e38\u3001\u6e38\u8f6e\u6e38\u3001\u51b0\u96ea\u6e38\u3001\u4eb2\u5b50\u5bb6\u5ead\u6e38\u3001\u4e3b\u9898\u6e38\u5e02\u573a\u70ed\u5ea6\u8f83\u9ad8\u3002\u60a8\u4eca\u5e74\u6709\u4ec0\u4e48\u51fa\u6e38\u8ba1\u5212\u5417\uff1f\u6211\u53ef\u80fd\u4f1a\u53bb\u897f\u5b89\u90a3\u8fb9\uff0c\u56e0\u4e3a\u90a3\u8fb9\u53ef\u80fd\u5e74\u5473\u4f1a\u6bd4\u8f83\u91cd\u3002\u6211\u4e00\u822c\u60f3\u53bb\u4e09\u4e9a\uff0c\u56e0\u4e3a\u5317\u65b9\u7279\u522b\u51b7\uff0c\u5357\u65b9\u6bd4\u8f83\u70ed\uff0c\u6bd4\u8f83\u8212\u670d\u4e00\u70b9\u3002\u6625\u8282\u671f\u95f4\uff0c\u9009\u62e9\u56fd\u5185\u8de8\u7701\u5e02\u65c5\u6e38\u7684\u6bd4\u4f8b\u4e3a65.9%\uff0c\u56fd\u5185\u70ed\u95e8\u57ce\u5e02\u5305\u62ec\u4e09\u4e9a\u3001\u54c8\u5c14\u6ee8\u3001\u676d\u5dde\u3001\u53a6\u95e8\
u7b49\uff0c\u9009\u62e9\u8fd1\u90ca\u65c5\u6e38\u7684\u6bd4\u4f8b\u4e3a34.5%\u3002\u8c03\u67e5\u663e\u793a\uff0c\u5ea6\u5047\u4f11\u95f2\u3001\u89c2\u5149\u65c5\u6e38\u548c\u63a2\u9669\u662f\u5c45\u6c11\u6625\u8282\u51fa\u6e38\u7684\u4e3b\u8981\u52a8\u673a\u3002\u517b\u751f\u548c\u8fd0\u52a8\u4e3a\u4e3b\u7684\u5065\u5eb7\u6e38\u5c06\u6210\u4e3a\u4eca\u5e74\u7684\u65b0\u5ba0\uff0c\u65c5\u6e38\u53d1\u5c55\u6b63\u5728\u5411\u4e2d\u9ad8\u7ea7\u6f14\u5316\u3002\u5728\u5168\u57df\u65c5\u6e38\u65f6\u4ee3\uff0c\u90a3\u5b9e\u9645\u4e0a\u5e7f\u5927\u7684\u6e38\u5ba2\uff0c\u8d8a\u6765\u8d8a\u591a\u7684\u6e17\u900f\u5230\u65c5\u6e38\u76ee\u7684\u5730\u7684\u751f\u6d3b\u65b9\u5f0f\u548c\u4f11\u95f2\u7a7a\u95f4\u91cc\u53bb\u4e86\u3002\u5927\u5bb6\u8d8a\u6765\u8d8a\u5f3a\u8c03\u65c5\u6e38\u7684\u54c1\u8d28\u4e86\uff0c\u4e8b\u5b9e\u4e0a\u6211\u4eec2018\u5e74\u56fd\u5bb6\u65c5\u6e38\u5de5\u4f5c\u7684\u4e3b\u9898\u5c31\u662f\u4f18\u8d28\u65c5\u6e38\u5e74\u3002"}} -------------------------------------------------------------------------------- /机器学习入门/监督/分类/人体运动状态信息评级.py: -------------------------------------------------------------------------------- 1 | # SVM 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sklearn.preprocessing import Imputer #预处理模块 7 | from sklearn.model_selection import train_test_split #生成数据模块 8 | from sklearn.metrics import classification_report #评估模块 9 | # 导入分类器模块 10 | from sklearn.neighbors import KNeighborsClassifier 11 | from sklearn.tree import DecisionTreeClassifier 12 | from sklearn.naive_bayes import GaussianNB 13 | 14 | # 数据处理,传入特征列表,和标签列表 15 | def load_datasets(feature_paths, label_paths): 16 | feature = np.ndarray(shape=(0,41)) # 列41,特征维度41 (想象成一个41维的列向量) 17 | label = np.ndarray(shape=(0,1)) # 列1,标签维度1 18 | for file in feature_paths: 19 | file = '~/Downloads/mooc课程数据/课程数据/分类/dataset/'+file 20 | df = pd.read_table(file, delimiter=',', na_values='?', header=None) 21 | imp = Imputer(missing_values='NaN', strategy='mean', axis=0) 22 | 
imp.fit(df) 23 | df = imp.transform(df) 24 | feature = np.concatenate((feature, df)) 25 | 26 | for file in label_paths: 27 | file = '~/Downloads/mooc课程数据/课程数据/分类/dataset/' + file 28 | df = pd.read_table(file, header=None) 29 | label = np.concatenate((label, df)) 30 | 31 | label = np.ravel(label) 32 | return feature, label 33 | 34 | 35 | if __name__ == '__main__': 36 | ''' 数据路径 ''' 37 | featurePaths = ['A/A.feature', 'B/B.feature', 'C/C.feature', 'D/D.feature', 'E/E.feature'] 38 | labelPaths = ['A/A.label', 'B/B.label', 'C/C.label', 'D/D.label', 'E/E.label'] 39 | ''' 读入数据 ''' 40 | x_train, y_train = load_datasets(featurePaths[:4], labelPaths[:4]) 41 | x_test, y_test = load_datasets(featurePaths[4:], labelPaths[4:]) 42 | x_train, x_, y_train, y_ = train_test_split(x_train, y_train, test_size=0.0) 43 | 44 | print('Start training knn') 45 | knn = KNeighborsClassifier().fit(x_train, y_train) 46 | print('Training done') 47 | answer_knn = knn.predict(x_test) 48 | print('Prediction done') 49 | 50 | print('Start training DT') 51 | dt = DecisionTreeClassifier().fit(x_train, y_train) 52 | print('Training done') 53 | answer_dt = dt.predict(x_test) 54 | print('Prediction done') 55 | 56 | print('Start training Bayes') 57 | gnb = GaussianNB().fit(x_train, y_train) 58 | print('Training done') 59 | answer_gnb = gnb.predict(x_test) 60 | print('Prediction done') 61 | 62 | print('\n\nThe classification report for knn:') 63 | print(classification_report(y_test, answer_knn)) 64 | print('\n\nThe classification report for DT:') 65 | print(classification_report(y_test, answer_dt)) 66 | print('\n\nThe classification report for Bayes:') 67 | print(classification_report(y_test, answer_gnb)) -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20161227/012170.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "012170", "comments": {"link": 
"http://coral.qq.com/1687671711"}, "date": "20161227", "contents": {"link": "https://news.qq.com/a/20161227/012170.htm", "title": ["\u7f8e\u56fd\u7537\u5b50\u6b32\u9886\u517b\u732b\u54aa \u610f\u5916\u4e0e\u8d70\u5931\u6570\u6708\u7231\u732b\u91cd\u9022"], "passage": "\u8d44\u6599\u56fe\uff1a\u732b\u54aa\u3002\u4e2d\u65b0\u7f5112\u670827\u65e5\u7535 \u636e\u5916\u5a92\u62a5\u9053\uff0c\u7f8e\u56fd\u4f5b\u7f57\u91cc\u8fbe\u5dde\u4e00\u540d\u7537\u5b50\u7684\u7231\u732b\u8d70\u5931\u6570\u6708\uff0c\u65e5\u524d\u8fd9\u540d\u7537\u5b50\u5728\u53cb\u4eba\u966a\u540c\u4e0b\u5230\u6d41\u6d6a\u52a8\u7269\u6536\u5bb9\u4e2d\u5fc3\uff0c\u6253\u7b97\u9886\u517b\u732b\u54aa\uff0c\u7ed3\u679c\u7adf\u7136\u610f\u5916\u4e0e\u7231\u732b\u56e2\u5706\u3002\u6770\u514b\u68ee\u7ef4\u5c14\u7684\u6d41\u6d6a\u52a8\u7269\u6536\u5bb9\u4e2d\u5fc3\u65e5\u524d\u5728\u793e\u4ea4\u7f51\u7ad9\u4e0a\u5206\u4eab\u540d\u53eb\u201c\u90a6\u90a6\u201d(Bon Bon)\u7684\u732b\u54aa\u4e0e\u4e3b\u4eba\u4e45\u522b\u91cd\u9022\u7684\u6545\u4e8b\uff0c\u83b7\u5f97\u7f51\u53cb\u70ed\u70c8\u56de\u54cd\u3002\u6536\u5bb9\u4e2d\u5fc3\u7684\u52a8\u7269\u534f\u4f1a\u8868\u793a\uff1a\u201c\u90a6\u90a6\u4ece\u4eca\u5e7410\u6708\u521d\u5c31\u5230\u6211\u4eec\u8fd9\u8fb9\u4e86\uff0c\u6211\u4eec\u4e0d\u77e5\u9053\u4e3a\u4ec0\u4e48\uff0c\u5bf9\u5b83\u6765\u8bf4\u4e00\u76f4\u5f88\u96be\u627e\u5230\u9886\u517b\u5bb6\u5ead\u3002\u539f\u6765\uff0c\u8fd9\u5f53\u4e2d\u6709\u4e2a\u975e\u5e38\u7279\u6b8a\u7684\u7406\u7531\u3002\u201d\u4e00\u540d\u5e74\u8f7b\u7537\u5b5021\u65e5\u5728\u53cb\u4eba\u966a\u540c\u4e0b\u6765\u5230\u8be5\u534f\u4f1a\uff0c\u6253\u7b97\u9886\u517b\u4e00\u53ea\u732b\u54aa\uff0c\u56e0\u4e3a\u4ed6\u5fc3\u7231\u7684\u732b\u54aa\u51e0\u4e2a\u6708\u4e4b\u524d\u8d70\u4e22\u4e86\uff0c\u4ed6\u60f3\u8981\u518d\u627e\u4e00\u53ea\u732b\u54aa\u6765\u966a\u4f34\u3002\u7ed3\u679c\uff0c\u8fd9\u540d\u7537\u5b50\u5728\u6536\u5bb9\u4e2d\u5fc3\u7684\u6240\u6709\u732b\u54aa\u5f53\u4e2d\uff0c\u53d1\u73b0\u4e86\u4e00\u53ea\u5bb3\u7
f9e\u7684\u6df1\u8272\u5c0f\u732b\uff0c\u770b\u8d77\u6765\u8ddf\u8d70\u4e22\u597d\u51e0\u4e2a\u6708\u7684\u7231\u732b\u957f\u5f97\u5f88\u50cf\uff0c\u7ed3\u679c\u67e5\u8bc1\u4e4b\u4e0b\uff0c\u53d1\u73b0\u88ab\u5de5\u4f5c\u4eba\u5458\u53d6\u540d\u4e3a\u201c\u90a6\u90a6\u201d\u7684\u8fd9\u53ea\u6bcd\u732b\uff0c\u539f\u6765\u5c31\u81ea\u5df1\u517b\u7684\u201c\u5bc6\u65af\u8482\u201d\u3002\u5de5\u4f5c\u4eba\u5458\u8868\u793a\uff0c\u201c\u90a6\u90a6\u201d\u8d70\u5931\u4e0d\u4e45\uff0c\u5c31\u88ab\u70ed\u5fc3\u6c11\u4f17\u6361\u5230\uff0c\u9001\u6765\u6536\u5bb9\u4e2d\u5fc3\uff0c\u201c\u73b0\u5728\u5b83\u7ec8\u4e8e\u53ef\u4ee5\u56de\u5bb6\uff0c\u56e2\u5706\u8fc7\u8282\uff0c\u56de\u5230\u771f\u6b63\u5c5e\u4e8e\u5b83\u7684\u5bb6\u3002\u201d"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HBI8IF0001875P.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HBI8IF0001875P", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8HBI8IF0001875P.html"}, "newsId": "D8HBI8IF0001875P", "contents": {"passage": "\n \uff08\u539f\u6807\u9898\uff1a\u4ea4\u901a\u8fd0\u8f93\u90e8\uff1a\u201c\u6851\u5409\u201d\u8f6e\u78b0\u649e\u71c3\u7206\u4e8b\u6545\u6551\u63f4\u5de5\u4f5c\u96be\u5ea6\u9ad8 \u6ca1\u6709\u5148\u4f8b\u53ef\u5faa\uff09\n
![]()
\n
\u4e2d\u9752\u5728\u7ebf\u5317\u4eac1\u670819\u65e5\u7535 \u4eca\u5929\u4e0b\u5348\uff0c\u4ea4\u901a\u8fd0\u8f93\u90e8\u53ec\u5f00\u201c\u6851\u5409\u201d\u8f6e\u78b0\u649e\u71c3\u7206\u4e8b\u6545\u65b0\u95fb\u53d1\u5e03\u4f1a\u3002
\u4e2d\u56fd\u6d77\u4e0a\u641c\u6551\u4e2d\u5fc3\u526f\u4e3b\u4efb\u3001\u4ea4\u901a\u8fd0\u8f93\u90e8\u5e94\u6025\u529e\u4e3b\u4efb\u667a\u5e7f\u8def\u8868\u793a\uff0c\u8fd9\u6b21\u5e94\u6025\u6551\u63f4\u5de5\u4f5c\u96be\u5ea6\u5f88\u9ad8\uff0c\u4e16\u754c\u822a\u8fd0\u53f2\u4e0a\u5c1a\u65e0\u6cb9\u8239\u8f7d\u8fd0\u201c\u51dd\u6790\u6cb9\u201d\u88ab\u649e\u5931\u706b\u7684\u4e8b\u6545\u53d1\u751f\uff0c\u201c\u5e94\u6025\u5904\u7f6e\u65e0\u5148\u4f8b\u53ef\u5faa\u3002\u201d
2018\u5e741\u67086\u65e5\u665a\uff0c\u5df4\u62ff\u9a6c\u7c4d\u6cb9\u8239\u201c\u6851\u5409\u201d\u8f6e\u4e0e\u4e2d\u56fd\u9999\u6e2f\u7c4d\u6563\u8d27\u8239\u201c\u957f\u5cf0\u6c34\u6676\u201d\u8f6e\u5728\u957f\u6c5f\u53e3\u4ee5\u4e1c\u7ea6160\u6d77\u91cc\u5904\u53d1\u751f\u78b0\u649e\u3002\u4e8b\u6545\u5bfc\u81f4\u201c\u6851\u5409\u201d\u8f6e\u8d27\u8239\u8d77\u706b\uff0c32\u540d\u8239\u5458\u5931\u8e2a\uff0c\u201c\u957f\u5cf0\u6c34\u6676\u201d\u8f6e\u53d7\u635f\u8d77\u706b\uff0c21\u540d\u8239\u5458\u5f03\u8239\u9003\u751f\u540e\u88ab\u9644\u8fd1\u6e14\u8239\u6551\u8d77\u3002
", "link": "http://news.163.com/18/0119/16/D8HBI8IF0001875P.html", "title": ["\u4ea4\u901a\u90e8:\"\u6851\u5409\"\u6cb9\u8f6e\u4e8b\u6545\u6551\u63f4\u96be\u5ea6\u9ad8 \u6ca1\u6709\u5148\u4f8b\u53ef\u5faa"]}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HJ2GAK000187VE.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HJ2GAK000187VE", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news2_bbs/D8HJ2GAK000187VE.html"}, "newsId": "D8HJ2GAK000187VE", "contents": {"passage": "\n \uff08\u539f\u6807\u9898\uff1a\u53a6\u822a\u5c31\u53f0\u6e7e\u9650\u5236\u4e24\u5cb8\u6625\u8282\u52a0\u73ed\u822a\u73ed\u53d1\u58f0\uff1a\u4e25\u91cd\u5f71\u54cd\u540c\u80de\u8fd4\u4e61\u8fc7\u5e74\uff09\n
@\u53a6\u95e8\u822a\u7a7a\u5b98\u65b9\u5fae\u535a1\u670819\u65e5\u6d88\u606f\uff0c\u6625\u8282\u662f\u4e2d\u534e\u6c11\u65cf\u6700\u91cd\u8981\u7684\u4f20\u7edf\u8282\u65e5\u3002\u4e3a\u4e86\u6ee1\u8db32018\u5e74\u6625\u8282\u671f\u95f4\u4f17\u591a\u53f0\u6e7e\u540c\u80de\u8fd4\u4e61\u8fc7\u5e74\u7684\u9700\u6c42\uff0c\u53a6\u95e8\u822a\u7a7a\u6309\u7167\u60ef\u4f8b\uff0c\u7279\u522b\u8c03\u6574\u8fd0\u529b\uff0c\u7533\u8bf7\u589e\u52a070\u73ed\u4e24\u5cb8\u6625\u8282\u52a0\u73ed\u673a\uff0c\u4e3b\u8981\u5305\u62ec\u4ece\u798f\u5dde\u3001\u53a6\u95e8\u3001\u676d\u5dde\u5f80\u8fd4\u53f0\u6e7e\u7684\u822a\u73ed\uff0c\u8ba9\u53f0\u6e7e\u540c\u80de\u53ef\u4ee5\u901a\u8fc7\u6700\u4fbf\u6377\u7684\u65b9\u5f0f\u5f80\u8fd4\u4e24\u5cb8\uff0c\u6b22\u5ea6\u65b0\u6625\u4f73\u8282\u3002\u76ee\u524d\u5df2\u6709\u8d85\u8fc71\u4e07\u540d\u65c5\u5ba2\u9884\u8ba2\u76f8\u5173\u822a\u73ed\u673a\u7968\uff0c\u9884\u8ba1\u6625\u8282\u671f\u95f4\u5c06\u6709\u8d85\u8fc72\u4e07\u540d\u65c5\u5ba2\u4e58\u5750\u53a6\u822a\u4e24\u5cb8\u52a0\u73ed\u822a\u73ed\u3002
\u76ee\u524d\uff0c\u53d7\u53f0\u6e7e\u65b9\u9762\u5e72\u9884\uff0c\u53a6\u822a\u6839\u636e\u4e24\u5cb8\u5e02\u573a\u9700\u6c42\u7533\u8bf7\u768470\u73ed\u4e24\u5cb8\u6625\u8282\u52a0\u73ed\u673a\u53ef\u80fd\u65e0\u6cd5\u6267\u884c\uff0c\u8fd9\u5c06\u4e25\u91cd\u5f71\u54cd\u53f0\u6e7e\u540c\u80de\u8fd4\u4e61\u8fc7\u5e74\u4e0e\u4eb2\u4eba\u56e2\u805a\u7684\u884c\u7a0b\u5b89\u6392\u3002\u6b64\u4e3e\u7ed9\u822a\u4f01\u9020\u6210\u7684\u7ecf\u6d4e\u635f\u5931\u4e8b\u5c0f\uff0c\u7ed9\u4e24\u5cb8\u6c11\u4f17\u5f80\u6765\u5e26\u6765\u7684\u6781\u5927\u4e0d\u4fbf\u4e8b\u5927\u3002
\u53a6\u822a\u81ea\u6210\u7acb\u4ee5\u6765\u4fbf\u4ee5\u201c\u670d\u52a1\u4e24\u5cb8\u201d\u4e3a\u4f7f\u547d\uff0c\u6210\u4e3a\u4e24\u5cb8\u76f4\u822a\u7684\u53c2\u4e0e\u8005\u3001\u89c1\u8bc1\u8005\u548c\u63a8\u8fdb\u8005\uff0c\u5728\u6d77\u5ce1\u4e24\u5cb8\u4e4b\u95f4\u67b6\u8d77\u4e86\u4fbf\u6377\u7684\u7a7a\u4e2d\u6865\u6881\u3002\u5728\u6b64\u5f3a\u70c8\u547c\u5401\u53f0\u6e7e\u6709\u5173\u90e8\u95e8\u80fd\u591f\u987a\u5e94\u6c11\u610f\uff0c\u6ee1\u8db3\u6c11\u4f17\u8feb\u5207\u9700\u6c42\uff0c\u4e3a\u4f17\u591a\u53f0\u6e7e\u540c\u80de\u8fd4\u4e61\u8fc7\u5e74\u63d0\u4f9b\u4fbf\u5229\u3002
\n
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/3/25 22:50
# @Author  : mazicwong
# @File    : 1.爬取相似url(最终).py
"""Collect URLs on a site's home page that look similar to a sample URL.

For every URL in an input list, the home page of the URL's host is
downloaded and searched with a regular expression derived from the sample
URL: lowercase letters and digits are generalised to character classes,
every other character has to match literally.
"""

import os
import re
import urllib.request

# Default locations used when main() is called without arguments.
DEFAULT_URL_LIST = r"E:\泰迪杯\C题样例数据\All_html 相似url\bbs_urls.txt"
DEFAULT_OUT_DIR = r"E:\泰迪杯\C题样例数据\All_html 相似url\爬取结果"

# Browser-like User-Agent sent with every request.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
}


def get_root_html(url, timeout=2):
    """Download *url* and return the raw response body as bytes.

    :param url: address to fetch.
    :param timeout: socket timeout in seconds passed to ``urlopen``.
    :raises OSError: on network failure (``URLError`` and socket timeouts
        are both ``OSError`` subclasses).
    """
    req = urllib.request.Request(url=url, headers=_HEADERS)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read()


def get_re(url):
    """Build a regular expression that matches URLs shaped like *url*.

    Each lowercase letter becomes ``[a-z]`` and each digit becomes ``\\d``;
    every other character (dots, slashes, ``?``, uppercase letters, ...) is
    escaped so that it only matches itself. The scheme of *url* is kept
    literally; a URL without a scheme is treated as ``http://``.

    :param url: sample URL, with or without a ``scheme://`` prefix.
    :return: the pattern as a ``str``.
    """
    scheme, sep, rest = url.partition('://')
    if not sep:
        # No scheme present: the whole input is host + path.
        scheme, rest = 'http', url
    parts = [re.escape(scheme), '://']
    for ch in rest:
        if 'a' <= ch <= 'z':
            parts.append('[a-z]')
        elif '0' <= ch <= '9':
            parts.append(r'\d')
        else:
            # Escape so regex metacharacters such as '.' or '?' stay literal.
            parts.append(re.escape(ch))
    return ''.join(parts)


def _root_url(url):
    """Return ``http://<host>`` for *url* (scheme and path are discarded)."""
    _, sep, rest = url.partition('//')
    host = (rest if sep else url).split('/', 1)[0]
    return 'http://' + host


def main(url_list_path=DEFAULT_URL_LIST, out_dir=DEFAULT_OUT_DIR):
    """Write the similar URLs found for every listed URL to ``out<N>.txt``.

    *N* is the 1-based line number of the URL inside *url_list_path*, so the
    output numbering stays aligned with the input even when a line is blank
    or its home page cannot be downloaded.

    :param url_list_path: text file with one sample URL per line.
    :param out_dir: existing directory that receives the ``out<N>.txt`` files.
    """
    with open(url_list_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    for cnt, line in enumerate(lines, start=1):
        # Strip the line terminator up front instead of trimming the pattern.
        url = line.strip()
        if not url:
            continue

        root_url = _root_url(url)
        try:
            root_html = get_root_html(root_url)
        except OSError as err:
            # One unreachable site must not abort the remaining URLs.
            print("skip %s: %s" % (root_url, err))
            continue

        # The page is bytes, so the pattern has to be bytes as well.
        pattern = re.compile(get_re(url).encode("utf-8"))
        matches = pattern.findall(root_html)
        print(len(matches))

        path = os.path.join(out_dir, "out%s.txt" % cnt)
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(
                m.decode("utf-8", errors="replace") + "\n" for m in matches
            )


if __name__ == "__main__":
    main()
"\u6e56\u5357\u5b81\u4e61\u5e02\u4eba\u6c11\u533b\u9662\u6d41\u6c99\u5206\u9662\u4e00\u5973\u533b\u751f1\u670818\u65e5\u8fb9\u6253\u70b9\u6ef4\u8fb9\u770b\u75c5\uff0c\u83b7\u8d5e\u201c\u5b81\u4e61\u6700\u7f8e\u533b\u751f\u201d\u3002\u5f53\u4e8b\u533b\u751f\u7a0b\u52291\u670819\u65e5\u544a\u8bc9\u6f8e\u6e43\u65b0\u95fb\uff08www.thepaper.cn\uff09\uff0c\u201c\u50cf\u8fd9\u79cd\u5c0f\u75c5\uff0c\u53ea\u8981\u4e0d\u5f71\u54cd\u6211\u4eec\u5de5\u4f5c\uff0c\u6211\u4eec\u57fa\u672c\u90fd\u4f1a\u575a\u6301\u4e0a\u73ed\u3002\u201d\u636e\u4e86\u89e3\uff0c\u7a0b\u5229\u662f\u5b81\u4e61\u5e02\u4eba\u6c11\u533b\u9662\u6d41\u6c99\u5206\u9662\u5987\u4ea7\u79d1\u4e3b\u4efb\uff0c\u5728\u8fd9\u91cc\u5df2\u7ecf\u5de5\u4f5c\u4e86\u4e03\u5e74\u30021\u670818\u65e5\uff0c\u56e0\u8eab\u4f53\u6709\u70b9\u4e0d\u8212\u9002\uff0c\u4f46\u53c8\u8f6e\u5230\u503c\u73ed\uff0c\u4e8e\u662f\u5979\u8fb9\u6253\u70b9\u6ef4\u8fb9\u770b\u75c5\u3002\u636e\u6e56\u5357\u7ecf\u89c6\u6b64\u524d\u62a5\u9053\uff0c\u7a0b\u5229\u5de6\u624b\u6253\u7740\u70b9\u6ef4\uff0c\u53f3\u624b\u62ff\u7740\u7b14\u5199\u5b57\uff0c\u5750\u5728\u529e\u516c\u684c\u524d\u7ed9\u75c5\u4eba\u770b\u75c5\u3002\u6b64\u5916\uff0c\u7a0b\u5229\u4f1a\u7528\u53f3\u624b\u4e3e\u7740\u70b9\u6ef4\u74f6\uff0c\u7136\u540e\u5230\u75c5\u623f\u53bb\u67e5\u623f\uff0c\u8be2\u95ee\u60a3\u8005\u60c5\u51b5\u3002\u4e00\u4f4d\u60a3\u8005\u8bf4\uff1a\u201c\u5979\u4e00\u76f4\u575a\u6301\u5728\u8fd9\u8fb9\uff0c\u4e3a\u6211\u4eec\u75c5\u4eba\u7740\u60f3\uff0c\u6211\u89c9\u5f97\u5979\u662f\u5b81\u4e61\u6700\u7f8e\u7684\u533b\u751f\u3002\u201d\u201c\u56e0\u4e3a\u8eab\u4f53\u6709\u70b9\u4e0d\u8212\u9002\uff0c\u6211\u5df2\u7ecf\u6253\u4e86\u56db\u5929\u7684\u70b9\u6ef4\u3002\u521a\u597d\u8fd9\u51e0\u5929\uff0c\u6211\u4eec\u79d1\u5ba4\u6bd4\u8f83\u5fd9\uff0c\u6709\u4e00\u4f4d\u4ea7\u540e\u5927\u51fa\u8840\u7684\u90fd\u5728\u6211\u4eec\u8fd9\u91cc\u62a2\u6551\uff0c\u7d2f\u8fd8\u662f\u6bd4\u8f83\u7d2f\u3002\u6211\u4eec\u5728\u57fa\u5c42\u4e0a\u73ed\uff0c\u4eba\u5458\u90fd\
u6bd4\u8f83\u7d27\u5f20\uff0c\u5206\u5de5\u4e5f\u4e0d\u90a3\u4e48\u7ec6\u5316\uff0c\u8981\u505a\u7684\u4e8b\u60c5\u5f88\u591a\uff0c\u50cf\u6211\u4eec\u8fd9\u79cd\u5c0f\u75c5\uff0c\u53ea\u8981\u4e0d\u5f71\u54cd\u5de5\u4f5c\uff0c\u6211\u4eec\u8fd8\u662f\u4f1a\u575a\u6301\u4e0a\u73ed\u7684\u3002\u201d\u7a0b\u5229\u8bf4\u3002\u5bf9\u4e8e\u83b7\u8d5e\u201c\u6700\u7f8e\u533b\u751f\u201d\u79f0\u53f7\uff0c\u7a0b\u5229\u8868\u793a\uff1a\u201c\u6700\u7f8e\u533b\u751f\u771f\u7684\u4e0d\u6562\u5f53\uff0c\u6bcf\u4e00\u4e2a\u804c\u4e1a\u90fd\u6709\u804c\u4e1a\u7684\u672c\u80fd\uff0c\u6211\u4eec\u4e34\u5e8a\u6709\u597d\u591a\u8fd9\u6837\u7684\u533b\u751f\uff0c\u575a\u6301\u4ee5\u75c5\u4eba\u4e3a\u672c\uff0c\u5162\u5162\u4e1a\u4e1a\uff0c\u606a\u5b88\u5c97\u4f4d\u3002\u201d"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/010551.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "010551", "comments": {"link": "http://coral.qq.com/2369832377"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/010551.htm", "title": ["\u5e7f\u897f\u5317\u6d77\u8054\u5408\u884c\u52a8\u961f\u6293\u83b747\u540dA\u7ea7\u4f20\u9500\u5934\u76ee \u51bb\u7ed3\u8d854\u5343\u4e07\u5143"], "passage": 
"\u6628\u5929\uff0819\u65e5\uff09\u51cc\u66683\u70b9\uff0c\u5728\u5e7f\u897f\u5317\u6d77\u5e02\uff0c\u7531\u516c\u5b89\u3001\u5de5\u5546\u3001\u57ce\u7ba1\u7b49\u90e8\u95e8680\u4f59\u540d\u6267\u6cd5\u4eba\u5458\u7ec4\u6210\u8054\u5408\u884c\u52a8\u961f\uff0c\u91cd\u70b9\u6e05\u67e5\u6d89\u5acc\u7ec4\u7ec7\u4f20\u9500\u6d3b\u52a8\u7684\u5934\u76ee\u548c\u4f20\u9500\u9aa8\u5e72\u5206\u5b50\u3001\u53c2\u52a0\u201c\u8d44\u672c\u8fd0\u4f5c\u201d\u3001\u201c\u4e00\u65e5\u6e38\u201d\u7b49\u4f20\u9500\u6d3b\u52a8\u7684\u6d89\u4f20\u4eba\u5458\uff0c\u6b64\u6b21\u4e13\u9879\u884c\u52a8\u5171\u6293\u83b7A\u7ea7\u53ca\u4ee5\u4e0a\u4f20\u9500\u5934\u76ee47\u540d\uff0c\u51bb\u7ed3\u8d44\u91d1\u7ea64200\u4e07\u5143\u3002\u5f53\u5929\u51cc\u6668\uff0c\u6267\u6cd5\u4eba\u5458\u8fdb\u5165\u5317\u6d77\u5e02\u590f\u65e5\u6d77\u6e7e\u5c0f\u533a\uff0c\u5bf9\u524d\u671f\u6478\u6392\u51fa\u7684\u6d89\u5acc\u4ece\u4e8b\u4f20\u9500\u6d3b\u52a8\u7684100\u591a\u4e2a\u623f\u95f4\u8fdb\u884c\u6e05\u67e5\u6574\u6cbb\u884c\u52a8\u3002\u6b64\u5916\uff0c\u6267\u6cd5\u4eba\u5458\u8fd8\u5206\u522b\u5bf9\u5317\u6d77\u5e02\u533a\u7684\u5317\u6d77\u5723\u7687\u5e7f\u573a\u3001\u6850\u6d0b\u65b0\u57ce\u4e24\u4e2a\u5c0f\u533a\u8fdb\u884c\u6e05\u67e5\u884c\u52a8\u3002\u5171\u6e05\u67e5\u51fa\u79df\u5c4b80\u591a\u95f4\uff0c\u67e5\u83b7\u6d89\u5acc\u4f20\u9500\u4eba\u5458100\u591a\u540d\uff0c\u4ee5\u53ca\u4e00\u6279\u6d89\u5acc\u4f20\u9500\u8fdd\u6cd5\u884c\u4e3a\u7684\u4e66\u7c4d\u548c\u7269\u54c1\u3002\u57fa\u672c\u6bcf\u4e00\u4e2a\u623f\u95f4\u6211\u4eec\u90fd\u6e05\u67e5\u51fa\u6d89\u4f20\u4eba\u5458\uff0c\u4e24\u4e2a\u5c0f\u533a\u4e00\u5171\u6e05\u67e5\u4e86100\u591a\u4e2a\u6d89\u4f20\u4eba\u5458\uff0c\u4e0b\u4e00\u6b65\u6211\u4eec\u6839\u636e\u72af\u7f6a\u7684\uff0c\u6d89\u53ca\u7ec4\u7ec7\u9886\u5bfc\u4f20\u9500\u7f6a\u7684 
\u4f9d\u6cd5\u6253\u51fb\uff0c\u6e05\u67e5\u6ca1\u6709\u6784\u6210\u72af\u7f6a\u7684\u6211\u4eec\u7ecf\u8fc7\u6559\u80b2\u3001\u8bad\u8beb\u7136\u540e\u505a\u5176\u4ed6\u76f8\u5e94\u7684\u5904\u7406\u3002\u636e\u4e86\u89e3\uff0c\u5728\u8fd9\u6b21\u4e13\u9879\u884c\u52a8\u4e2d\uff0c\u6267\u6cd5\u4eba\u5458\u9664\u4e86\u5bf9\u4f20\u9500\u4eba\u5458\u805a\u96c6\u8f83\u591a\u7684\u5c0f\u533a\u8fdb\u884c\u5730\u6bef\u5f0f\u6e05\u67e5\u5916\uff0c\u8fd8\u7ec4\u7ec7\u8b66\u529b\u5206\u522b\u5728\u5185\u8499\u53e4\u3001\u4e91\u5357\u3001\u5e7f\u897f\u540c\u65f6\u8fdb\u884c\u6293\u6355\u884c\u52a8\u3002\u622a\u81f3\u6628\u5929\uff0819\u65e5\uff09\u4e0a\u5348\uff0c\u5df2\u6293\u83b7A\u7ea7\u4ee5\u4e0a\u4f20\u9500\u5934\u76ee47\u540d\uff1b\u5317\u6d77\u5e02\u5171\u6e05\u67e5\u51fa\u79df\u5c4b212\u95f4\uff0c\u67e5\u5c01\u6d89\u4f20\u51fa\u79df\u5c4b148\u6237\uff0c\u67e5\u83b7\u6d89\u4f20\u4eba\u5458459\u540d\u4ee5\u53ca\u5927\u91cf\u624b\u673a\u3001\u7535\u8111\u3001\u4f20\u9500\u4e66\u7c4d\u7b49\u6d89\u6848\u7269\u54c1\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for crawl project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'crawl' 13 | 14 | SPIDER_MODULES = ['crawl.spiders'] 15 | NEWSPIDER_MODULE = 'crawl.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'crawl.middlewares.CrawlSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'crawl.middlewares.CrawlDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or 
disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'crawl.pipelines.CrawlPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tutotial project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'tutotial' 13 | 14 | SPIDER_MODULES = ['tutotial.spiders'] 15 | NEWSPIDER_MODULE = 'tutotial.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'oozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'tutotial.middlewares.TutotialSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'tutotial.middlewares.TutotialDownloaderMiddleware': 543, 57 | #} 58 | 59 
| # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'tutotial.pipelines.TutotialPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | FEED_URI = u'/home/mazic/pp/jian.csv' 92 | FEED_FORMAT = 'CSV' 93 | -------------------------------------------------------------------------------- /泰迪杯尝试/数据爬取(去标签).py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 15:53 4 | # @Author : mazicwong 5 | # @File : 数据爬取(去标签).py 6 | 7 | # 用正则表达式简单过滤html的标签 8 | import re 9 | 10 | 11 | def filter_tags(htmlstr): 12 | re_cdata = re.compile('//]*//\]\]>', re.I) # 匹配CDATA 13 | re_script = re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I) # Script 14 
| re_style = re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I) # style 15 | re_br = re.compile('\n \uff08\u539f\u6807\u9898\uff1a\u7537\u5b50\u6df1\u591c\u730e\u8273\u5f3a\u5978\u5987\u5973 \u62c5\u5fc3\u88ab\u544a\u53d1\u7528\u6c34\u6ce5\u7816\u7838\u5934\u706d\u53e3\uff09\n
\u6b63\u4e49\u7f511\u670819\u65e5\u7535 \u201c\u539f\u5ba1\u88ab\u544a\u4eba\u949f\u67d0\u67d0,\u4f60\u5bf9\u4e00\u5ba1\u5224\u51b3\u8ba4\u5b9a\u7684\u72af\u7f6a\u4e8b\u5b9e\u548c\u8bc1\u636e\u662f\u5426\u6709\u5f02\u8bae?\u201d\u8fd9\u662f\u4e00\u8d77\u7531\u6842\u6797\u5e02\u4eba\u6c11\u68c0\u5bdf\u9662\u4ee5\u5ba1\u5224\u76d1\u7763\u7a0b\u5e8f\u5411\u6842\u6797\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u63d0\u51fa\u6297\u8bc9\u7684\u66b4\u529b\u5211\u4e8b\u6848\u4ef6\u3002\u4e00\u5ba1\u6cd5\u9662\u4ee5\u5f3a\u5978\u7f6a\u5224\u5904\u949f\u67d0\u67d0\u6709\u671f\u5f92\u5211\u4e09\u5e74\u4e03\u4e2a\u6708\u3001\u4ee5\u6545\u610f\u6740\u4eba\u7f6a\u4ec5\u5224\u5904\u5176\u6709\u671f\u5f92\u5211\u516b\u5e74\u56db\u4e2a\u6708,\u51b3\u5b9a\u6267\u884c\u6709\u671f\u5f92\u5211\u5341\u4e00\u5e74\u516d\u4e2a\u6708\u3002
\u8bf4\u8d77\u8fd9\u8d77\u6848\u4ef6\u90a3\u5c31\u8981\u8ffd\u6eaf\u5230\u51e0\u5e74\u524d\u4e86\u30022015\u5e742\u670825\u65e5\u51cc\u6668,\u56db\u5904\u6e38\u8361\u51c6\u5907\u730e\u8273\u7684\u539f\u5ba1\u88ab\u544a\u4eba\u949f\u67d0\u67d0\u6ee1\u8138\u5931\u671b,\u9a7e\u9a76\u6469\u6258\u8f66\u6162\u60a0\u60a0\u5730\u5f80\u5bb6\u8d70,\u884c\u81f3\u8354\u6d66\u53bf\u67d0\u9547\u67d0\u8857,\u949f\u67d0\u67d0\u773c\u524d\u7a81\u7136\u4e00\u4eae,\u524d\u9762\u4ece\u9ebb\u5c06\u9986\u51fa\u95e8\u6b63\u72ec\u81ea\u6b65\u884c\u56de\u5bb6\u7684\u9ec4\u67d0\u67d0\u6b63\u9002\u5408\u4e0b\u624b\u554a!
\n
\u949f\u67d0\u67d0\u9042\u8d76\u4e0a\u524d\u4e3b\u52a8\u63d0\u51fa\u642d\u8f7d\u9ec4\u67d0\u67d0\u56de\u5bb6\u3002\u884c\u81f3\u504f\u50fb\u8def\u6bb5\u949f\u67d0\u67d0\u4fbf\u63d0\u51fa\u8981\u4e0e\u9ec4\u67d0\u67d0\u53d1\u751f\u6027\u5173\u7cfb,\u88ab\u62d2\u7edd\u540e\u949f\u67d0\u67d0\u76f4\u63a5\u5c06\u6b32\u9003\u8dd1\u7684\u9ec4\u67d0\u67d0\u6402\u62b1\u81f3\u8def\u8fb9\u575f\u5730,\u4e0d\u987e\u5bd2\u98ce\u51db\u51bd,\u5f3a\u884c\u5bf9\u9ec4\u67d0\u67d0\u5b9e\u65bd\u5978\u6deb\u3002\u4e8b\u6bd5,\u5fc3\u6ee1\u610f\u8db3\u7684\u949f\u67d0\u67d0\u62c5\u5fc3\u9ec4\u67d0\u67d0\u544a\u53d1,\u9042\u51b3\u5b9a\u706d\u53e3\u3002\u5728\u5c06\u9ec4\u67d0\u67d0\u6390\u6655\u540e,\u949f\u67d0\u67d0\u53cc\u624b\u4ece\u575f\u5806\u65c1\u642c\u8d77\u4e00\u5757\u6c34\u6ce5\u7816\u5f84\u76f4\u8fde\u7eed\u7838\u5411\u9ec4\u67d0\u67d0\u7684\u5934\u90e8\u2026\u2026
\u5341\u4f59\u5c0f\u65f6\u540e\u9ec4\u67d0\u67d0\u88ab\u4eba\u53d1\u73b0\u5e76\u83b7\u6551\u3002\u7ecf\u6cd5\u533b\u9274\u5b9a,\u88ab\u5bb3\u4eba\u9ec4\u67d0\u67d0\u7684\u4eba\u4f53\u635f\u4f24\u7a0b\u5ea6\u6784\u6210\u91cd\u4f24\u4e8c\u7ea7,\u5934\u9762\u90e8\u7684\u4eba\u4f53\u635f\u4f24\u6b8b\u75be\u7a0b\u5ea6\u5c5e\u516d\u7ea7\u6b8b\u75be\u3002
\u5ead\u540e\u4e00\u5468,\u6842\u6797\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u4f5c\u51fa\u7ec8\u5ba1\u5224\u51b3,\u7ef4\u6301\u539f\u5ba1\u6cd5\u9662\u5bf9\u949f\u67d0\u67d0\u5f3a\u5978\u7f6a\u7684\u91cf\u5211,\u5c06\u5176\u6545\u610f\u6740\u4eba\u7f6a\u7684\u91cf\u5211\u7531\u6709\u671f\u5f92\u5211\u516b\u5e74\u56db\u4e2a\u6708\u6539\u5224\u4e3a\u6709\u671f\u5f92\u5211\u5341\u4e94\u5e74,\u5265\u593a\u653f\u6cbb\u6743\u5229\u4e09\u5e74,\u51b3\u5b9a\u6267\u884c\u6709\u671f\u5f92\u5211\u5341\u4e03\u5e74,\u5265\u593a\u653f\u6cbb\u6743\u5229\u4e09\u5e74\u3002
"}, "cmtId": "D8HAH1VS0001875P"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20160721/BSH7V8QF00014JB6.json: -------------------------------------------------------------------------------- 1 | {"newsId": "BSH7V8QF00014JB6", "date": "20160721", "source": "netease", "comments": {"link": "http://comment.news.163.com/news3_bbs/BSH7V8QF00014JB6.html"}, "contents": {"title": ["\u8fbd\u5b81\u906d\u66b4\u96e8\u4fb5\u88ad\u81f4\u57ce\u5e02\u5185\u6d9d \u7d27\u6025\u8f6c\u79fb\u903e12\u4e07\u4eba"], "link": "http://news.163.com/16/0721/19/BSH7V8QF00014JB6.html", "passage": "\n \uff08\u539f\u6807\u9898\uff1a\u8fbd\u5b81\u906d\u9047\u66b4\u96e8\u4fb5\u88ad\u7d27\u6025\u8f6c\u79fb\u903e12\u4e07\u6c11\u4f17\uff09\n
\u4e2d\u65b0\u793e\u6c88\u96337\u670821\u65e5\u7535 2016\u5e74\u5165\u6c5b\u4ee5\u6765\u6700\u5f3a\u964d\u96e821\u65e5\u4fb5\u88ad\u8fbd\u5b81\uff0c\u9020\u6210\u519c\u7530\u53d7\u707e\u57ce\u5e02\u5185\u6d9d\uff0c\u8be5\u7701\u5df2\u7d27\u6025\u8f6c\u79fb\u903e12\u4e07\u6c11\u4f17\u3002
\u4ece7\u670820\u65e5\u665a\u5f00\u59cb\uff0c\u672c\u8f6e\u5927\u8303\u56f4\u66b4\u96e8\u5f00\u59cb\u5728\u8fbd\u5b81\u897f\u90e8\u5730\u533a\u8086\u8650\uff0c\u6cbf\u6d77\u90e8\u5206\u6cb3\u6d41\u53d1\u751f\u6d2a\u6c34\uff0c\u81f321\u65e5\u964d\u96e8\u8303\u56f4\u6269\u6563\u5230\u8fbd\u5b81\u5168\u5883\u3002
\u4e2d\u65b0\u793e\u8bb0\u800521\u65e5\u5728\u7701\u4f1a\u6c88\u9633\u770b\u5230\uff0c\u5929\u7a7a\u9634\u6c89\u72b9\u5982\u508d\u665a\uff0c\u5927\u96e8\u503e\u76c6\u800c\u4e0b\uff0c\u5728\u4e00\u4e9b\u79ef\u6c34\u4e25\u91cd\u7684\u8857\u8def\u4e0a\uff0c\u6d88\u9632\u4eba\u5458\u51fa\u52a8\u76ae\u5212\u8247\u8fd0\u8f7d\u53d7\u56f0\u6c11\u4f17\u3002\u5728\u846b\u82a6\u5c9b\u5e02\uff0c\u90e8\u5206\u5730\u533a\u964d\u96e8\u91cf\u7a81\u7834\u6709\u6c14\u8c61\u8bb0\u5f55\u4ee5\u6765\u7684\u5386\u53f2\u6781\u503c\uff0c\u4e0d\u65ad\u6709\u8f66\u8f86\u5728\u79ef\u6c34\u91cc\u629b\u951a\uff0c\u5f53\u5730\u8fb9\u9632\u5b98\u5175\u8fde\u591c\u8f6c\u79fb\u4e8688\u540d\u8f96\u533a\u6c11\u4f17\u3002
\u636e\u8fbd\u5b81\u7701\u9632\u6c5b\u6297\u65f1\u6307\u6325\u90e8\u4ecb\u7ecd\uff0c\u622a\u81f3\u76ee\u524d\uff0c\u6c14\u8c61\u90e8\u95e8\u5df2\u63a5\u8fde\u53d1\u5e03\u66b4\u96e8\u7ea2\u8272\u9884\u8b667\u4e2a\uff0c\u66b4\u96e8\u6a59\u8272\u9884\u8b6616\u4e2a\uff0c\u5168\u7701\u6700\u5927\u964d\u6c34\u91cf\u51fa\u73b0\u5728\u846b\u82a6\u5c9b\u5e02\u7ee5\u4e2d\u53bf\uff0c\u8fbe\u5230396\u6beb\u7c73\u3002
\n
\u53d7\u5f3a\u964d\u96e8\u5f71\u54cd\uff0c\u622a\u81f37\u670821\u65e515\u65f6\u8bb8\uff0c\u8fbd\u5b81\u5168\u7701\u8d85\u6c5b\u9650\u6c34\u4f4d\u8fd0\u884c\u7684\u6c34\u5e93\u670930\u5ea7\uff0c\u5176\u4e2d\u5927\u4e2d\u578b\u6c34\u5e933\u5ea7\u300221\u65e5\uff0c\u8fbd\u5b8130\u5ea7\u5927\u578b\u6c34\u5e93\u603b\u84c4\u6c34\u91cf\u4e3a33.41\u4ebf\u7acb\u65b9\u7c73\uff0c\u6bd42015\u5e74\u540c\u671f\u591a5.76\u4ebf\u7acb\u65b9\u7c73\u3002
\u76ee\u524d\uff0c\u8fbd\u5b81\u846b\u82a6\u5c9b\u5e02\u670925\u4e2a\u4e61\u9547\u53d7\u707e\uff0c\u5012\u584c\u623f\u5c4b28\u95f4\uff0c\u519c\u4f5c\u7269\u53d7\u707e\u9762\u79ef39.2\u4e07\u4ea9\uff0c\u635f\u6bc1\u5824\u96320.8\u516c\u91cc\uff0c\u76f4\u63a5\u7ecf\u6d4e\u635f\u59311900\u4e07\u5143\u4eba\u6c11\u5e01\u3002\u5176\u4ed6\u5730\u533a\u707e\u60c5\u6b63\u5728\u8fdb\u4e00\u6b65\u6838\u5b9e\u4e2d\u3002\u672c\u8f6e\u66b4\u96e8\u8fbd\u5b81\u5171\u8f6c\u79fb12\u4e2a\u5e02\u7684\u6c11\u4f1712.59\u4e07\u4eba\uff0c\u6682\u65f6\u6ca1\u6709\u6536\u5230\u4eba\u5458\u4f24\u4ea1\u62a5\u544a\u3002
\u7a81\u5982\u5176\u6765\u7684\u66b4\u96e8\u4ea6\u4f7f\u4ea4\u901a\u51fa\u884c\u53d7\u5230\u4e25\u91cd\u5f71\u54cd\uff0c\u8fbd\u5b81\u5883\u518516\u6761\u9ad8\u901f\u516c\u8def\u5c01\u95ed\u6216\u9650\u884c\uff1b39\u8d9f\u65c5\u5ba2\u5217\u8f66\u4e34\u65f6\u505c\u8fd0\uff1b\u6cbf\u6d77\u6e2f\u53e3\u53d7\u5927\u98ce\u5f71\u54cd\u90e8\u5206\u73ed\u6b21\u505c\u822a\u3002
\u6c14\u8c61\u90e8\u95e8\u9884\u8ba1\uff0c22\u65e5\u8fbd\u4e1c\u5730\u533a\u7684\u672c\u6eaa\u3001\u4e39\u4e1c\u7b49\u5730\u8fd8\u5c06\u7ee7\u7eed\u906d\u9047\u66b4\u96e8\u3002
"}, "cmtId": "BSH7V8QF00014JB6"} -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for jnuxshc project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'jnuxshc' 13 | 14 | SPIDER_MODULES = ['jnuxshc.spiders'] 15 | NEWSPIDER_MODULE = 'jnuxshc.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'oozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | #CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | #DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | #CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | #COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | #TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | #DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | #} 45 | 46 | # Enable or disable spider middlewares 47 | # See 
https://doc.scrapy.org/en/latest/topics/spider-middleware.html 48 | #SPIDER_MIDDLEWARES = { 49 | # 'jnuxshc.middlewares.JnuxshcSpiderMiddleware': 543, 50 | #} 51 | 52 | # Enable or disable downloader middlewares 53 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 54 | #DOWNLOADER_MIDDLEWARES = { 55 | # 'jnuxshc.middlewares.JnuxshcDownloaderMiddleware': 543, 56 | #} 57 | 58 | # Enable or disable extensions 59 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 60 | #EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | #} 63 | 64 | # Configure item pipelines 65 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 66 | #ITEM_PIPELINES = { 67 | # 'jnuxshc.pipelines.JnuxshcPipeline': 300, 68 | #} 69 | 70 | # Enable and configure the AutoThrottle extension (disabled by default) 71 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 72 | #AUTOTHROTTLE_ENABLED = True 73 | # The initial download delay 74 | #AUTOTHROTTLE_START_DELAY = 5 75 | # The maximum download delay to be set in case of high latencies 76 | #AUTOTHROTTLE_MAX_DELAY = 60 77 | # The average number of requests Scrapy should be sending in parallel to 78 | # each remote server 79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 80 | # Enable showing throttling stats for every response received: 81 | #AUTOTHROTTLE_DEBUG = False 82 | 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | #HTTPCACHE_ENABLED = True 86 | #HTTPCACHE_EXPIRATION_SECS = 0 87 | #HTTPCACHE_DIR = 'httpcache' 88 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 90 | FEED_URI = u'./jnu.csv' 91 | FEED_FORMAT = 'CSV' 92 | 93 | FEED_EXPORTERS = { 94 | 'csv': 'jnuxshc.spiders.csv_item_exporter.MyProjectCsvItemExporter', 95 | } #jnuxshc为工程名 96 | FIELDS_TO_EXPORT = [ 97 | 'time', 98 | 
'title', 99 | 'intro' 100 | ] 101 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20160418/023091.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "023091", "comments": {"link": "http://coral.qq.com/1373761671"}, "date": "20160418", "contents": {"link": "https://news.qq.com/a/20160418/023091.htm", "title": ["\u5df4\u897f\u4f17\u9662\u5f39\u52be\u603b\u7edf\u6848\u83b7\u901a\u8fc7 \u7f57\u585e\u592b\u653f\u515a\u627f\u8ba4\u843d\u8d25"], "passage": "\n\n\n\n\n\n\n\n\n\r\n\r\n\r\n\r\n \r\n\r\n\r\n\u4e2d\u65b0\u7f514\u670818\u65e5\u7535 \u7efc\u5408\u5916\u5a92\u62a5\u9053\uff0c\u5df4\u897f\u4f17\u8bae\u966217\u65e5\u9488\u5bf9\u662f\u5426\u5f39\u52be\u603b\u7edf\u7f57\u585e\u592b\u8fdb\u884c\u8868\u51b3\uff0c\u5230\u76ee\u524d\u4e3a\u6b62\uff0c513\u540d\u8bae\u5458\u4e2d\u5df2\u6709\u81f3\u5c11342\u540d\u8bae\u5458\u5bf9\u5f39\u52be\u603b\u7edf\u7f57\u585e\u592b\u6295\u4e86\u8d5e\u6210\u7968\uff0c\u8fd9\u610f\u5473\u7740\u5f39\u52be\u6848\u5728\u4f17\u9662\u83b7\u5f97\u901a\u8fc7\uff0c\u5f39\u52be\u603b\u7edf\u7a0b\u5e8f\u5c06\u7ee7\u7eed\u3002\u5f39\u52be\u62a5\u544a\u5c06\u9012\u4ea4\u7ed9\u53c2\u8bae\u9662\u505a\u51fa\u6700\u7ec8\u8868\u51b3\u3002\u800c\u7f57\u585e\u592b\u6240\u5c5e\u653f\u515a\u8868\u793a\u5927\u52bf\u5df2\u53bb\uff0c\u65e0\u6cd5\u907f\u514d\u603b\u7edf\u906d\u5f39\u52be\u3002\u62a5\u9053\u79f0\uff0c\u5df4\u897f\u6267\u653f\u515a\u52b3\u5de5\u515a\u515a\u56e2\u9886\u8896\u5b63\u9a6c\u745e\u65af\u4e5f\u8868\u793a\uff0c\u5bf9\u4f17\u8bae\u9662\u5f39\u52be\u7f57\u585e\u592b\u7684\u8868\u51b3\u627f\u8ba4\u5931\u8d25\u3002\u4ed6\u5728\u4f17\u9662\u53d7\u8bbf\u8bf4\uff1a\u201c\u73b0\u5728\u8981\u5728\u53c2\u9662\u7eed\u6218\u4e86\u3002\u201d\u62a5\u9053\u6307\u51fa\uff0c\u6839\u636e\u5df4\u897f\u6cd5\u5f8b\uff0c\u4e3b\u5f20\u5f39\u52be\u4e00\u65b9\u5fc5\u987b\u5728\u6b64\u8f6e\u8868\u51b3\u4e2d\u83b7\u5f97\u4e0
9\u5206\u4e4b\u4e8c\u7684\u6295\u7968\uff0c\u5373\u5728513\u5f20\u6295\u7968\u4e2d\u4e89\u53d6\u5230342\u7968\uff0c\u624d\u80fd\u5c06\u52a8\u8bae\u63d0\u4ea4\u5230\u53c2\u8bae\u9662\uff0c\u5e76\u7531\u53c2\u8bae\u9662\u51b3\u5b9a\u603b\u7edf\u662f\u5426\u4ece\u4e8b\u4e86\u975e\u6cd5\u884c\u4e3a\u3002\u5f39\u52be\u6848\u5728\u4f17\u9662\u901a\u8fc7\u540e\uff0c\u53c2\u8bae\u9662\u5c06\u5bf9\u5176\u8fdb\u884c\u9996\u8f6e\u8868\u51b3\uff0c\u65f6\u95f4\u53ef\u80fd\u57285\u6708\u3002\u5982\u679c\u53c2\u8bae\u9662\u5728\u9996\u8f6e\u8868\u51b3\u4e2d\u83b7\u5f97\u7b80\u5355\u591a\u6570\u652f\u6301\uff0c\u7f57\u585e\u592b\u987b\u79bb\u804c180\u5929\uff0c\u5176\u95f4\u603b\u7edf\u4e00\u804c\u7531\u526f\u603b\u7edf\u4ee3\u7406\u3002\u53c2\u8bae\u9662\u4e4b\u540e\u5c06\u542c\u53d6\u8bc1\u636e\uff0c\u518d\u8fdb\u884c\u7b2c\u4e8c\u8f6e\u8868\u51b3\uff0c\u5982\u679c2/3\u4ee5\u4e0a\u7684\u8bae\u5458\u652f\u6301\u5f39\u52be\uff0c\u5219\u7f57\u585e\u592b\u4e0b\u53f0\uff0c\u526f\u603b\u7edf\u7279\u6885\u5c14\u63a5\u4efb\uff1b\u5982\u679c\u53c2\u8bae\u9662\u652f\u6301\u5f39\u52be\u7684\u8bae\u5458\u4e0d\u52302/3\uff0c\u7f57\u585e\u592b\u6062\u590d\u603b\u7edf\u804c\u4f4d\u3002\u62a5\u9053\u79f0\uff0c\u56e0\u4e3a\u5df4\u897f\u53c2\u8bae\u9662\u548c\u4f17\u8bae\u9662\u7684\u6784\u6210\u6781\u4e3a\u76f8\u4f3c\uff0c\u6240\u4ee5\u53c2\u8bae\u9662\u53ef\u80fd\u5f97\u51fa\u4e0e\u4f17\u8bae\u9662\u76f8\u540c\u7684\u7ed3\u8bba\u3002\u5982\u679c\u7f57\u585e\u592b\u6700\u7ec8\u88ab\u5f39\u52be\u4e0b\u53f0\uff0c\u7279\u6885\u5c14\u5c06\u63a5\u4efb\u603b\u7edf\u804c\u4f4d\uff0c\u4f46\u662f\u56e0\u4e3a\u7279\u6885\u5c14\u4e5f\u5377\u5165\u8d2a\u8150\u6848\u4ef6\u4e2d\uff0c\u7f57\u585e\u592b\u7684\u652f\u6301\u8005\u5df2\u7ecf\u5f00\u59cb\u5bf9\u4ed6\u8fdb\u884c\u5f39\u52be\u884c\u52a8\u3002\u8fd9\u4e5f\u5c31\u610f\u5473\u7740\uff0c\u5728\u4eca\u5e748\u67085\u65e5\u81f321\u65e5\u5df4\u897f\u9996\u6b21\u4e3e\u884c\u590f\u5b63\u5965\u8fd0\u4f1a\u65f6\uff0c\u5176\u653f\u5c40\u4ecd\u7136\u5728\u6df7\u4e7
1\u4e4b\u4e2d\u3002\u636e\u6089\uff0c\u4ece\u5df4\u897f\u5f53\u5730\u65f6\u95f44\u670815\u65e5\u65e9\u4e0a\u5f00\u59cb\u4e00\u76f4\u523017\u65e5\u6e05\u6668\uff0c\u6709120\u540d\u8bae\u5458\u53c2\u52a0\u4e86\u5173\u4e8e\u662f\u5426\u5f39\u52be\u7f57\u585e\u592b\u7684\u8fa9\u8bba\uff0c\u8fa9\u8bba\u65f6\u95f4\u8d85\u8fc743\u4e2a\u5c0f\u65f6\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20161227/011065.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "011065", "comments": {"link": "http://coral.qq.com/1687646782"}, "date": "20161227", "contents": {"link": "https://news.qq.com/a/20161227/011065.htm", "title": ["\u5c0f\u4f19\u51fa\u5dee\u90d1\u5dde\u9047\u96fe\u973e\u8bc9\u653f\u5e9c\u88ab\u9a73\uff1a\u5e94\u5148\u7533\u8bf7\u653f\u5e9c\u8d54\u507f\u53e3\u7f69\u94b1"], "passage": "\u56e0\u51fa\u5dee\u90d1\u5dde\u53d1\u73b0\u5f53\u5730\u96fe\u973e\u4e25\u91cd\uff0c\u65e5\u524d\uff0c\u6cb3\u5357\u7701\u65b0\u4e61\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u9a73\u56de\u539f\u544a\u8d77\u8bc9\uff0c\u7406\u7531\u662f\u5176\u8d77\u8bc9\u524d\u5e76\u672a\u5411\u90d1\u5dde\u5e02\u653f\u5e9c\u63d0\u51fa\u8fc7\u8d54\u507f\u7533\u8bf7\u3002\u6cb3\u5357\u7701\u65b0\u4e61\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u884c\u653f\u8d54\u507f\u88c1\u5b9a\u4e66\u3002\u6f8e\u6e43\u65b0\u95fb\uff08www.thepaper.cn\uff09\u83b7\u5f97\u7684\u88c1\u5b9a\u4e66\u663e\u793a\uff0c\u65b0\u4e61\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u8ba4\u4e3a\uff0c\u4f9d\u636e\u300a\u4e2d\u534e\u4eba\u6c11\u5171\u548c\u56fd\u56fd\u5bb6\u8d54\u507f\u6cd5\u300b\u7b2c\u4e5d\u6761\u7b2c\u4e8c\u6b3e\u89c4\u5b9a\uff0c\u6b64\u5916\uff0c\u4f9d\u636e\u300a\u6700\u9ad8\u4eba\u6c11\u6cd5\u9662\u5173\u4e8e\u5ba1\u7406\u884c\u653f\u8d54\u507f\u6848\u4ef6\u82e5\u5e72\u95ee\u9898\u7684\u89c4\u5b9a\u300b\u7b2c\u56db\u6761\u7b2c\u4e8c\u6b3e\uff0c\u65b0\u4e61\u5e02\u4e2d\u7ea7\u4eb
a\u6c11\u6cd5\u9662\u8ba4\u4e3a\uff0c\u539f\u544a\u5728\u63d0\u8d77\u8bc9\u8bbc\u524d\uff0c\u5176\u8d54\u507f\u8bf7\u6c42\u5c1a\u672a\u7ecf\u8fc7\u90d1\u5dde\u5e02\u4eba\u6c11\u653f\u5e9c\u5148\u884c\u5904\u7406\u3002\u56e0\u6b64\uff0c\u6cd5\u9662\u5e94\u5f53\u9a73\u56de\u539f\u544a\u8d77\u8bc9\u300212\u670826\u65e5\u665a\uff0c\u5b59\u6d2a\u5f6c\u544a\u8bc9\u6f8e\u6e43\u65b0\u95fb\uff0c\u4ed6\u4e8e26\u65e5\u5f53\u5929\u6536\u5230\u4e86\u8be5\u88c1\u5b9a\u4e66\u3002\u5b59\u6d2a\u5f6c\u8bf4\uff0c\u8be5\u88c1\u5b9a\u5728\u4ed6\u7684\u610f\u6599\u4e4b\u4e2d\u3002\u201c\u73b0\u5728\u8fd8\u6ca1\u51b3\u5b9a\u8981\u4e0d\u8981\u4e0a\u8bc9\uff0c\u4f46\u662f\u54a8\u8be2\u4e86\u5f8b\u5e08\u4e5f\u8bf4\u4e0a\u8bc9\u4e5f\u6ca1\u6709\u610f\u4e49\uff0c\u4f30\u8ba1\u4e0d\u4f1a\u7ee7\u7eed\uff08\u4e0a\u8bc9\uff09\u4e86\u201d\u3002\u6f8e\u6e43\u65b0\u95fb\u6ce8\u610f\u5230\uff0c\u300a\u8d54\u507f\u6cd5\u300b\u89c4\u5b9a\uff0c\u8d54\u507f\u4e49\u52a1\u673a\u5173\u53ef\u4ee5\u5728\u4e24\u4e2a\u6708\u5185\u505a\u51fa\u662f\u5426\u8d54\u507f\u7684\u51b3\u5b9a\u3002\u4ed6\u8bf4\uff0c\u5728\u5411\u5e02\u653f\u5e9c\u63d0\u51fa\u8d54\u507f\u7533\u8bf7\u4e4b\u540e\uff0c\u81ea\u5df1\u53c8\u5411\u6cd5\u9662\u9012\u4ea4\u4e86\u53e6\u5916\u4e00\u4efd\u8bc9\u8bbc\u72b6\uff0c\u8981\u6c42\u786e\u8ba4\u90d1\u5dde\u5e02\u653f\u5e9c\u6cbb\u973e\u4e0d\u4f5c\u4e3a\uff0c\u672a\u4e25\u683c\u5c65\u884c\u5927\u6c14\u6c61\u67d3\u9632\u6cbb\u6cd5\u5b9a\u804c\u8d23\uff0c\u201c\u73b0\u5728\u4e3b\u8981\u770b\u8fd9\u4e2a\u8bc9\u8bbc\u80fd\u5426\u7acb\u6848\u4e86\u3002\u201d\u6f8e\u6e43\u65b0\u95fb\u6b64\u524d\u62a5\u9053\uff0c11\u670820\u65e5\uff0c\u5b59\u6d2a\u5f6c\u5728\u90d1\u5dde\u51fa\u5dee\u65f6\uff0c\u5728\u8be5\u5e02\u5730\u6807\u5efa\u7b51\u4e8c\u4e03\u5854\u9644\u8fd1\u611f\u89c9\u201c\u7279\u522b\u545b\u201d\uff0c\u4ed6\u4fbf\u4e70\u4e86\u4e00\u526f\u4ef7\u503c32\u5143\u7684\u9632\u973e\u53e3\u7f69\u3002\u5f53\u5929\u90d1\u5dde\u5e02AQI\u4e3a253\uff0c\u5c5e\u4e8e\u91cd\u5ea6\u6c61\u67d3\u3002\u5f53\u665a\
uff0c\u5b59\u6d2a\u5f6c\u62df\u51fa\u4e00\u4efd\u8bc9\u8bbc\u72b6\uff0c\u79f0\u4f9d\u636e\u300a\u73af\u5883\u4fdd\u62a4\u6cd5\u300b\u53ca\u300a\u5927\u6c14\u6c61\u67d3\u9632\u6cbb\u6cd5\u300b\u89c4\u5b9a\uff0c\u90d1\u5dde\u5e02\u653f\u5e9c\u5e94\u5bf9\u672c\u884c\u653f\u533a\u57df\u7684\u73af\u5883\u8d28\u91cf\u8d1f\u8d23\u3002\u5b59\u6d2a\u5f6c\u8bf7\u6c42\u4f9d\u6cd5\u5224\u4ee4\u88ab\u544a\u8d54\u507f\u572811\u670820\u65e5\u90d1\u5dde\u96fe\u973e\u671f\u95f4\u7684\u53e3\u7f69\u8d2d\u4e70\u8d39\u7528\uff0c\u5e76\u5224\u4ee4\u88ab\u544a\u627f\u62c5\u672c\u6848\u8bc9\u8bbc\u8d39\u3002\u65b0\u4e61\u5e02\u4e2d\u7ea7\u4eba\u6c11\u6cd5\u9662\u66fe\u4e8e11\u670825\u65e5\u7ec4\u6210\u4e86\u5408\u8bae\u5ead\uff0c\u53d7\u7406\u6b64\u6848\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class CrawlSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 
28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class CrawlDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 
74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /crawl/暨南大学新闻爬虫/jnuxshc/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class JnuxshcSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class JnuxshcDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 
74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /机器学习入门/label_propagation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 11:28 4 | # @Author : mazicwong 5 | # @File : label_propagation.py 6 | 7 | import time 8 | import numpy as np 9 | 10 | 11 | # return k neighbors index 12 | def navie_knn(dataSet, query, k): 13 | numSamples = dataSet.shape[0] 14 | 15 | ## step 1: calculate Euclidean distance 16 | diff = np.tile(query, (numSamples, 1)) - dataSet 17 | squaredDiff = diff ** 2 18 | squaredDist = np.sum(squaredDiff, axis=1) # sum is performed by row 19 | 20 | ## step 2: sort the distance 21 | sortedDistIndices = np.argsort(squaredDist) 22 | if k > len(sortedDistIndices): 23 | k = len(sortedDistIndices) 24 | 25 | return sortedDistIndices[0:k] 26 | 27 | 28 | 
# build a big graph (normalized weight matrix) 29 | def buildGraph(MatX, kernel_type, rbf_sigma=None, knn_num_neighbors=None): 30 | num_samples = MatX.shape[0] 31 | affinity_matrix = np.zeros((num_samples, num_samples), np.float32) 32 | if kernel_type == 'rbf': 33 | if rbf_sigma == None: 34 | raise ValueError('You should input a sigma of rbf kernel!') 35 | for i in range(num_samples): 36 | row_sum = 0.0 37 | for j in range(num_samples): 38 | diff = MatX[i, :] - MatX[j, :] 39 | affinity_matrix[i][j] = np.exp(sum(diff ** 2) / (-2.0 * rbf_sigma ** 2)) 40 | row_sum += affinity_matrix[i][j] 41 | affinity_matrix[i][:] /= row_sum 42 | elif kernel_type == 'knn': 43 | if knn_num_neighbors == None: 44 | raise ValueError('You should input a k of knn kernel!') 45 | for i in range(num_samples): 46 | k_neighbors = navie_knn(MatX, MatX[i, :], knn_num_neighbors) 47 | affinity_matrix[i][k_neighbors] = 1.0 / knn_num_neighbors 48 | else: 49 | raise NameError('Not support kernel type! You can use knn or rbf!') 50 | 51 | return affinity_matrix 52 | 53 | 54 | # label propagation 55 | def labelPropagation(Mat_Label, Mat_Unlabel, labels, kernel_type='rbf', rbf_sigma=1.5, \ 56 | knn_num_neighbors=10, max_iter=500, tol=1e-3): 57 | # initialize 58 | num_label_samples = Mat_Label.shape[0] 59 | num_unlabel_samples = Mat_Unlabel.shape[0] 60 | num_samples = num_label_samples + num_unlabel_samples 61 | labels_list = np.unique(labels) 62 | num_classes = len(labels_list) 63 | 64 | MatX = np.vstack((Mat_Label, Mat_Unlabel)) 65 | clamp_data_label = np.zeros((num_label_samples, num_classes), np.float32) 66 | for i in range(num_label_samples): 67 | clamp_data_label[i][labels[i]] = 1.0 68 | 69 | label_function = np.zeros((num_samples, num_classes), np.float32) 70 | label_function[0: num_label_samples] = clamp_data_label 71 | label_function[num_label_samples: num_samples] = -1 72 | 73 | # graph construction 74 | affinity_matrix = buildGraph(MatX, kernel_type, rbf_sigma, knn_num_neighbors) 75 | 76 | # 
start to propagation 77 | iter = 0; 78 | pre_label_function = np.zeros((num_samples, num_classes), np.float32) 79 | changed = np.abs(pre_label_function - label_function).sum() 80 | while iter < max_iter and changed > tol: 81 | if iter % 1 == 0: 82 | print 83 | "---> Iteration %d/%d, changed: %f" % (iter, max_iter, changed) 84 | pre_label_function = label_function 85 | iter += 1 86 | 87 | # propagation 88 | label_function = np.dot(affinity_matrix, label_function) 89 | 90 | # clamp 91 | label_function[0: num_label_samples] = clamp_data_label 92 | 93 | # check converge 94 | changed = np.abs(pre_label_function - label_function).sum() 95 | 96 | # get terminate label of unlabeled data 97 | unlabel_data_labels = np.zeros(num_unlabel_samples) 98 | for i in range(num_unlabel_samples): 99 | unlabel_data_labels[i] = np.argmax(label_function[i + num_label_samples]) 100 | 101 | return unlabel_data_labels 102 | -------------------------------------------------------------------------------- /crawl/简书首页爬虫/tutotial/tutotial/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TutotialSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 
26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class TutotialDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 
74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20171129/013590.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "013590", "comments": {"link": "http://coral.qq.com/2259249504"}, "date": "20171129", "contents": {"link": "https://news.qq.com/a/20171129/013590.htm", "title": ["\u8054\u901a\u7545\u6e38\u51b0\u6fc0\u51cc\u5957\u9910\uff0c\u8bed\u97f3\u3001\u6d41\u91cf\u5168\u56fd\u7545\u723d\u4f7f\u7528"], "passage": 
"\u4e2d\u56fd\u8054\u901a\u6b63\u5f0f\u63a8\u51fa\u7545\u723d\u51b0\u6fc0\u51cc\u5957\u9910\uff0c\u542b\u8d85\u5927\u6d41\u91cf\u3001\u8d85\u591a\u8bed\u97f3\uff0c\u53ef\u5728\u5168\u56fd\u8303\u56f4\u5185\u7545\u723d\u4f7f\u7528\u3002\u4e0d\u9650\u6d41\u91cf\u3001\u4e0d\u9650\u8bed\u97f3\u7684\u51b0\u6fc0\u51cc\u5957\u9910\uff0c\u5c06\u7ed9\u7528\u6237\u5e26\u6765\u590f\u5929\u4eab\u53d7\u51b0\u6fc0\u51cc\u4e00\u6837\u7684\u7545\u723d\u611f\uff0c\u662f\u4e00\u6b3e\u5f70\u663e\u8054\u901a\u4e2a\u6027\u7684\u4ea7\u54c1\uff0c\u4f5c\u4e3a\u4e1a\u754c\u9996\u521b\uff0c\u51b0\u6fc0\u51cc\u5957\u9910\u4ea7\u54c1\u663e\u793a\u4e86\u4e2d\u56fd\u8054\u901a\u4e00\u76f4\u4ee5\u6765\u4fdd\u6301\u7740\u7684\u6d3b\u529b\u4e0e\u521b\u65b0\u3002\u6d41\u91cf\u4e0d\u9650\u91cf\uff0c\u7d27\u8ddf\u5f53\u4e0b\u5e74\u8f7b\u4eba\u7231\u8ffd\u5267\u3001\u7231\u76f4\u64ad\u7684\u6d88\u8d39\u4e60\u60ef\uff0c\u540c\u65f6\u6ee1\u8db3\u7ecf\u5e38\u51fa\u5dee\u3001\u65c5\u6e38\u7b49\u5546\u65c5\u4eba\u58eb\u7684\u9700\u6c42\u3002\u901a\u8bdd\u4e0d\u9650\u91cf\uff0c\u8ba9\u7528\u6237\u4e0e\u5bb6\u4eba\u670b\u53cb\u8fdb\u884c\u901a\u8bdd\u65f6\uff0c\u4e0d\u518d\u957f\u8bdd\u77ed\u8bf4\uff0c\u5b9e\u73b0\u771f\u6b63\u610f\u4e49\u4e0a\u7684\u7545\u723d\u804a\u5929\uff0c\u5168\u56fd\u901a\u7528\uff0c\u65e0\u6f2b\u6e38\u3001\u957f\u9014\u8d39\u7528\u4ea7\u751f\u3002\u73b0\u767b\u5f55\u8054\u901a\u7f51\u4e0a\u8425\u4e1a\u5385\uff0c\u5373\u53ef\u9996\u670899\u5143\u4eab\u53d7\u4e0d\u9650\u91cf\u7684\u51b0\u6fc0\u51cc\u5957\u9910\uff1b\u9884\u5b5899\u5143\u9001100\u5143\uff0c\u6708\u8d39\u6c38\u4e455\u6298\uff08\u539f\u4ef7398\uff0c\u73b0\u4ec5\u9700\u6708\u8d39199\uff09\uff1b\u4ec5\u9650\u8054\u901a\u7f51\u4e0a\u5546\u57ce\u529e\u7406\u7528\u6237\u3002\u751f\u65e5\u53f7\u3001\u60c5\u4fa3\u53f7\u7b49\u968f\u610f\u9009\uff0c\u8ba9\u4f60\u7684\u624b\u673a\u53f7\u4e0d\u518d\u662f\u51b7\u51b0\u51b0\u7684\u4e00\u7ec4\u6570\u5b57\u3002\u4e2d\u56fd\u8054\u901a\u4ee5\u7528\u6237\u5229\u76ca\u4e3a\u6838\u5fc3\uff0
c\u5df2\u5b8c\u6210\u4e00\u7cfb\u5217\u521b\u65b0\u52a8\u4f5c\uff0c\u6b64\u524d\uff0c\u8054\u5408\u4e92\u8054\u7f51\u516c\u53f8\u63a8\u51fa\u4e86\u8682\u8681\u5b9d\u5361\u3001\u817e\u8baf\u738b\u5361\u7b49\u521b\u65b0\u4ea7\u54c1\uff0c\u6b64\u6b21\uff0c\u63a8\u51fa\u7684\u5168\u56fd\u7545\u723d\u51b0\u6fc0\u51cc\u5957\u9910\uff0c \u4e5f\u662f\u54cd\u5e94\u56fd\u5bb6\u63d0\u901f\u964d\u8d39\u653f\u7b56\uff0c\u8df5\u884c\u201c\u6d41\u91cf\u653e\u5fc3\u7528\u201d\u7684\u53c8\u4e00\u529b\u4e3e\u3002\u672a\u6765\uff0c\u4e2d\u56fd\u8054\u901a\u5c06\u628a\u51b0\u6fc0\u51cc\u5957\u9910\u4f5c\u4e3a\u4ea7\u54c1\u4f18\u5316\u7684\u6807\u6746\uff0c\u4ee5\u96f6\u6346\u7ed1\u3001\u6d41\u91cf\u8d85\u591a\u3001\u64cd\u4f5c\u7b80\u5355\u3001\u65b9\u4fbf\u7528\u6237\u4f7f\u7528\u7b49\u4e3a\u4ea7\u54c1\u4f18\u5316\u7684\u539f\u5219\uff0c\u63a8\u51fa\u66f4\u591a\u201c\u7c7b\u51b0\u6fc0\u51cc\u5957\u9910\u201d\u4ea7\u54c1\uff0c\u5728\u8bed\u97f3\u3001\u6d41\u91cf\u4eab\u53d7\u8d85\u7ea7\u989d\u5ea6\u7684\u57fa\u7840\u4e0a\uff0c\u5b9e\u73b0\u7ec8\u7aef\u5957\u9910\u4e0d\u6346\u7ed1\u3001\u6863\u4f4d\u968f\u610f\u66f4\u6362\u3001\u5957\u9910\u6863\u4f4d\u7cbe\u7b80\u3001\u65b0\u8001\u7528\u6237\u4f18\u60e0\u540c\u4eab\u7b49\u7279\u70b9\u7684\u4ea7\u54c1\u4f18\u5316\uff0c\u4e3a\u7528\u6237\u5e26\u6765\u66f4\u52a0\u653e\u5fc3\u7684\u4f7f\u7528\u4f53\u9a8c\uff0c\u5e76\u4ece\u591a\u4e2a\u5c42\u9762\u4e30\u5bcc\u8054\u901a\u201c\u6c834G+\u201d\u6781\u901f\u7f51\u7edc\u7684\u5320\u5fc3\u610f\u4e49\u3002\u51b0\u6fc0\u51cc\u5957\u9910\u5df2\u5728\u5168\u56fd\u8303\u56f4\u5185\u9646\u7eed\u4e0a\u5e02\u53d1\u552e\uff0c\u8be6\u8be210010\u6216\u54a8\u8be2\u5f53\u5730\u8425\u4e1a\u5385\u3002http://www.10010.com/goodsdetail/111711031180.html\uff08\u6ce8\uff1a\u6b64\u6587\u5c5e\u4e8e\u817e\u8baf\u7f51\u767b\u8f7d\u7684\u5546\u4e1a\u4fe1\u606f\uff0c\u6587\u7ae0\u5185\u5bb9\u4e0d\u4ee3\u8868\u672c\u7f51\u89c2\u70b9\uff0c\u4ec5\u4f9b\u53c2\u8003\uff09"}} 
-------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HIR5JP0001875P.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HIR5JP0001875P", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news_shehui7_bbs/D8HIR5JP0001875P.html"}, "newsId": "D8HIR5JP0001875P", "contents": {"passage": "\n \uff08\u539f\u6807\u9898\uff1a\u7537\u5b50\u5192\u5145\u6559\u80b2\u5c40\u957f\u642d\u8baa\u5973\u751f\u79f0\u5176\u53ef\u5b89\u6392\u5de5\u4f5c \u804c\u6821\u5973\u5b69\u88ab\u9a974000\u5143\uff09\n
![]()
\u5c01\u9762\u65b0\u95fb\u8baf 1\u670818\u65e5\uff0c\u7ef5\u9633\u67d0\u804c\u682117\u5c81\u5973\u751f\u8def\u8fc7\u9752\u5e74\u5e7f\u573a\u65f6\uff0c\u4e00\u7537\u5b50\u642d\u8baa\u81ea\u79f0\u662f\u6559\u80b2\u5c40\u526f\u5c40\u957f\uff0c\u53ef\u4ee5\u4e3a\u5176\u5b89\u6392\u5de5\u4f5c\uff0c\u9a97\u5f97\u5973\u5b69\u4fe1\u4efb\u3002\u4ea4\u8c08\u540e\u8be5\u7537\u5b50\u9a6c\u4e0a\u53c2\u52a0\u8001\u5c40\u957f\u751f\u65e5\u5bb4\uff0c\u9a97\u5f97\u5973\u5b694000\u5143\u751f\u6d3b\u8d39\u3002
18\u65e5\u4e0b\u53483\u70b9\uff0c\u7279\u5de1\u8b66\u652f\u961f\u5de1\u903b\u4e00\u5927\u961f\u63a5\u5230\u62a5\u8b66\u79f0\uff0c\u5728\u9752\u5e74\u5e7f\u573a\u6709\u4e00\u5973\u5b69\u88ab\u9a97\u3002\u6c11\u8b66\u8d76\u5230\u73b0\u573a\u4e86\u89e3\u5230\u5973\u5b69\u59d3\u656c\uff0c\u4eca\u5e7417\u5c81\uff0c\u7ef5\u9633\u67d0\u804c\u6821\u5b66\u751f\uff0c\u5973\u5b69\u54ed\u8bc9\u5979\u88ab\u4e00\u4e2a\u81ea\u79f0\u6559\u80b2\u5c40\u526f\u5c40\u957f\u7684\u9a97\u5b50\u9a97\u8d70\u4e864000\u5143\u3002
\u201c\u4eca\u5929\u4e0b\u5348\u5979\u8def\u8fc7\u9752\u5e74\u5e7f\u573a\u65f6\uff0c\u4e00\u4e2d\u5e74\u7537\u5b50\u548c\u5979\u642d\u8baa\uff0c\u8bf4\u5979\u5f88\u50cf\u540c\u4e8b\u7684\u5973\u513f\uff0c\u8fd8\u8868\u626c\u5979\u957f\u5f97\u6f02\u4eae\uff0c\u7537\u5b50\u53c8\u95ee\u5c0f\u656c\u591a\u5927\u4e86\uff0c\u662f\u5b66\u751f\u5417\uff1f\u5728\u90a3\u4e2a\u5b66\u6821\u4e0a\u5b66\uff1f\u201d\u5c0f\u656c\u544a\u8bc9\u8b66\u65b9\uff0c\u5979\u6ca1\u6709\u9632\u5907\uff0c\u90fd\u4e00\u4e00\u56de\u7b54\uff0c\u63a5\u7740\u7537\u5b50\u8bf4\u81ea\u5df1\u662f\u6559\u80b2\u5c40\u7684\u674e\u526f\u5c40\u957f\uff0c\u7b49\u5c0f\u656c\u6bd5\u4e1a\u4e86\u53ef\u4ee5\u5e2e\u52a9\u5979\u5b89\u6392\u5de5\u4f5c\u3002
\n
\u542c\u8bf4\u53ef\u4ee5\u5b89\u6392\u5de5\u4f5c\uff0c\u5c0f\u656c\u89c9\u5f97\u81ea\u5df1\u9047\u5230\u8d35\u4eba\u4e86\uff0c\u5f7c\u6b64\u76f8\u8c08\u751a\u6b22\u3002\u6b64\u65f6\u8fd9\u540d\u674e\u526f\u5c40\u957f\u8bf4\uff0c\u4ed6\u4e0a\u5348\u521a\u5f00\u5b8c\u4f1a\u8fd9\u4f1a\u8981\u53bb\u53c2\u52a0\u8001\u5c40\u957f\u7684\u751f\u65e5\u5bb4\uff0c\u7531\u4e8e\u6ca1\u5e26\u5361\u6ca1\u6cd5\u53d6\u94b1\uff0c\u8bf7\u5c0f\u656c\u5e2e\u4ed6\u5148\u62ff\u70b9\u94b1\u3002\u201c\u4ed6\u95ee\u6211\u6709\u591a\u5c11\u94b1\uff0c\u6b63\u597d\u8eab\u4e0a\u67094000\u5143\u751f\u6d3b\u8d39\u3002\u201d\u6beb\u65e0\u9632\u5907\u7684\u5973\u5b69\u76f8\u4fe1\u4e86\u526f\u5c40\u957f\u6682\u65f6\u501f\u7528\u4f1a\u8fd8\u94b1\u7684\u8bf4\u6cd5\uff0c\u5c064000\u5143\u94b1\u5168\u90e8\u62ff\u7ed9\u4e86\u4ed6\uff0c\u770b\u7740\u5f88\u5feb\u6d88\u5931\u5728\u4eba\u7fa4\u4e2d\u7684\u526f\u5c40\u957f\uff0c\u5c0f\u656c\u624d\u5f00\u59cb\u6000\u7591\uff0c\u8d8a\u60f3\u8d8a\u4e0d\u5bf9\u52b2\uff0c\u4e8e\u662f\u7acb\u5373\u62a5\u8b66\uff0c\u76ee\u524d\u8b66\u65b9\u5df2\u5c55\u5f00\u8fdb\u4e00\u6b65\u8c03\u67e5\u3002
\u8b66\u65b9\u63d0\u9192\u5e02\u6c11\uff0c\u9a97\u5b50\u4f1a\u7279\u610f\u7784\u51c6\u90a3\u4e9b\u6d89\u4e8b\u4e0d\u6df1\uff0c\u5584\u826f\u7684\u5c0f\u5973\u5b69\u884c\u9a97\uff0c\u5b66\u6821\u548c\u5bb6\u957f\u8981\u591a\u52a0\u5f3a\u8fd9\u65b9\u9762\u7684\u6559\u80b2\uff0c\u5c0f\u5b69\u81ea\u5df1\u4e5f\u6700\u597d\u4e0d\u8981\u56de\u5e94\u964c\u751f\u4eba\u4e3b\u52a8\u642d\u8baa\u3002
", "link": "http://news.163.com/18/0119/18/D8HIR5JP0001875P.html", "title": ["\u7537\u5b50\u5192\u5145\u6559\u80b2\u5c40\u957f\u81ea\u79f0\u53ef\u5b89\u6392\u5de5\u4f5c \u5973\u5b69\u88ab\u9a974000\u5143"]}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180116/D897H80K0001899O.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D897H80K0001899O", "date": "20180116", "source": "netease", "comments": {"link": "http://comment.news.163.com/news2_bbs/D897H80K0001899O.html"}, "contents": {"title": ["\u4e60\u8fd1\u5e73\u5e94\u7ea6\u540c\u7f8e\u56fd\u603b\u7edf\u7279\u6717\u666e\u901a\u7535\u8bdd"], "link": "http://news.163.com/18/0116/12/D897H80K0001899O.html", "passage": "\n \uff08\u539f\u6807\u9898\uff1a\u4e60\u8fd1\u5e73\u5e94\u7ea6\u540c\u7f8e\u56fd\u603b\u7edf\u7279\u6717\u666e\u901a\u7535\u8bdd\uff09\n
\u592e\u89c6\u65b0\u95fb\u5ba2\u6237\u7aef1\u670816\u65e5\u6d88\u606f\uff0c\u56fd\u5bb6\u4e3b\u5e2d\u4e60\u8fd1\u5e7316\u65e5\u5e94\u7ea6\u540c\u7f8e\u56fd\u603b\u7edf\u7279\u6717\u666e\u901a\u7535\u8bdd\u3002
\u4e60\u8fd1\u5e73\u6307\u51fa\uff0c\u8fc7\u53bb\u7684\u4e00\u5e74\uff0c\u4e2d\u7f8e\u5173\u7cfb\u603b\u4f53\u4fdd\u6301\u7a33\u5b9a\u5e76\u53d6\u5f97\u91cd\u8981\u8fdb\u5c55\u3002\u4fdd\u6301\u4e2d\u7f8e\u5173\u7cfb\u5065\u5eb7\u7a33\u5b9a\u53d1\u5c55\uff0c\u7b26\u5408\u4e24\u56fd\u548c\u4e24\u56fd\u4eba\u6c11\u5229\u76ca\uff0c\u4e5f\u662f\u56fd\u9645\u793e\u4f1a\u5171\u540c\u671f\u5f85\u3002\u53cc\u65b9\u8981\u4fdd\u6301\u9ad8\u5c42\u53ca\u5404\u7ea7\u522b\u4ea4\u5f80\uff0c\u5145\u5206\u53d1\u63254\u4e2a\u9ad8\u7ea7\u522b\u5bf9\u8bdd\u673a\u5236\u4f5c\u7528\u5e76\u9002\u65f6\u4e3e\u529e\u7b2c\u4e8c\u8f6e\u5bf9\u8bdd\u3002\u4e2d\u7f8e\u7ecf\u8d38\u5408\u4f5c\u7ed9\u4e24\u56fd\u4eba\u6c11\u5e26\u6765\u8bb8\u591a\u5b9e\u5b9e\u5728\u5728\u7684\u5229\u76ca\u3002\u53cc\u65b9\u5e94\u8be5\u91c7\u53d6\u5efa\u8bbe\u6027\u65b9\u5f0f\uff0c\u901a\u8fc7\u5bf9\u5f7c\u6b64\u5f00\u653e\u5e02\u573a\u3001\u505a\u5927\u5408\u4f5c\u86cb\u7cd5\uff0c\u59a5\u5584\u89e3\u51b3\u53cc\u65b9\u5173\u5207\u7684\u7ecf\u8d38\u95ee\u9898\u3002\u8981\u79ef\u6781\u63a8\u8fdb\u4e24\u519b\u3001\u6267\u6cd5\u3001\u7981\u6bd2\u3001\u4eba\u6587\u3001\u5730\u65b9\u7b49\u5408\u4f5c\uff0c\u5c31\u91cd\u5927\u56fd\u9645\u548c\u5730\u533a\u95ee\u9898\u4fdd\u6301\u5bc6\u5207\u6c9f\u901a\u534f\u5546\u3002\u53cc\u65b9\u8981\u76f8\u5411\u800c\u884c\u3001\u76f8\u4e92\u5c0a\u91cd\u3001\u805a\u7126\u5408\u4f5c\uff0c\u4ee5\u5efa\u8bbe\u6027\u65b9\u5f0f\u5904\u7406\u654f\u611f\u95ee\u9898\uff0c\u5c0a\u91cd\u5f7c\u6b64\u6838\u5fc3\u5229\u76ca\u548c\u91cd\u5927\u5173\u5207\uff0c\u7ef4\u62a4\u4e2d\u7f8e\u5173\u7cfb\u5065\u5eb7\u7a33\u5b9a\u53d1\u5c55\u52bf\u5934\u3002
\n
\u7279\u6717\u666e\u8868\u793a\uff0c\u7f8e\u65b9\u9ad8\u5ea6\u91cd\u89c6\u5bf9\u534e\u5173\u7cfb\u548c\u7f8e\u4e2d\u5408\u4f5c\uff0c\u613f\u540c\u4e2d\u65b9\u4e00\u9053\uff0c\u52a0\u5f3a\u9ad8\u5c42\u53ca\u5404\u7ea7\u522b\u4ea4\u5f80\uff0c\u62d3\u5c55\u52a1\u5b9e\u9886\u57df\u5408\u4f5c\uff0c\u5904\u7406\u597d\u4e24\u56fd\u7ecf\u8d38\u4e2d\u7684\u95ee\u9898\uff0c\u63a8\u52a8\u53cc\u8fb9\u5173\u7cfb\u53d6\u5f97\u66f4\u5927\u53d1\u5c55\u3002
\u4e60\u8fd1\u5e73\u5e94\u8be2\u4ecb\u7ecd\u4e86\u5bf9\u5f53\u524d\u671d\u9c9c\u534a\u5c9b\u5c40\u52bf\u7684\u770b\u6cd5\uff0c\u6307\u51fa\u671d\u9c9c\u534a\u5c9b\u5f62\u52bf\u51fa\u73b0\u4e00\u4e9b\u79ef\u6781\u53d8\u5316\u3002\u5404\u65b9\u5e94\u8be5\u5171\u540c\u52aa\u529b\u628a\u6765\u4e4b\u4e0d\u6613\u7684\u7f13\u548c\u52bf\u5934\u5ef6\u7eed\u4e0b\u53bb\uff0c\u4e3a\u91cd\u542f\u5bf9\u8bdd\u8c08\u5224\u521b\u9020\u6761\u4ef6\u3002\u5b9e\u73b0\u671d\u9c9c\u534a\u5c9b\u65e0\u6838\u5316\uff0c\u7ef4\u62a4\u671d\u9c9c\u534a\u5c9b\u548c\u5e73\u7a33\u5b9a\u7b26\u5408\u5404\u65b9\u5171\u540c\u5229\u76ca\uff0c\u7ef4\u62a4\u56fd\u9645\u793e\u4f1a\u5728\u8fd9\u4e2a\u95ee\u9898\u4e0a\u7684\u56e2\u7ed3\u5341\u5206\u91cd\u8981\u3002\u4e2d\u65b9\u613f\u7ee7\u7eed\u540c\u5305\u62ec\u7f8e\u65b9\u5728\u5185\u7684\u56fd\u9645\u793e\u4f1a\u4e00\u9053\uff0c\u5bc6\u5207\u6c9f\u901a\u3001\u76f8\u4e92\u4fe1\u4efb\u3001\u76f8\u4e92\u5c0a\u91cd\u3001\u52a0\u5f3a\u5408\u4f5c\uff0c\u63a8\u52a8\u671d\u9c9c\u534a\u5c9b\u95ee\u9898\u671d\u7740\u59a5\u5584\u89e3\u51b3\u7684\u65b9\u5411\u4e0d\u65ad\u53d6\u5f97\u8fdb\u5c55\u3002
\u7279\u6717\u666e\u8868\u793a\uff0c\u7f8e\u65b9\u91cd\u89c6\u4e2d\u65b9\u5728\u671d\u9c9c\u534a\u5c9b\u95ee\u9898\u4e0a\u7684\u91cd\u8981\u4f5c\u7528\uff0c\u613f\u7ee7\u7eed\u52a0\u5f3a\u540c\u4e2d\u65b9\u7684\u6c9f\u901a\u534f\u8c03\u3002
"}, "cmtId": "D897H80K0001899O"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/006769.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "006769", "comments": {"link": "http://coral.qq.com/2369397397"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/006769.htm", "title": ["\u7fa4\u4f17\u53cd\u6620\u996e\u6c34\u95ee\u9898\u88ab\u603c\u201c\u4f60\u4e0d\u662f\u4eba\u6c11\u201d \u5f53\u4e8b\u793e\u533a\u4e66\u8bb0\u88ab\u514d\u804c"], "passage": "\u5468\u65ed \u622a\u5c4f\u56fe2018\u5e741\u670819\u65e5\u665a8\u65f6\u8bb8\uff0c\u6210\u90fd\u5e02\u6e29\u6c5f\u533a\u6c38\u5b81\u9547\u4eba\u6c11\u653f\u5e9c\u5b98\u65b9\u5fae\u535a\u53d1\u5e03\u6d88\u606f\u79f0\uff1a\u7ecf\u6838\u5b9e\uff0c\u5929\u738b\u793e\u533a\u515a\u603b\u652f\u4e66\u8bb0\u5468\u65ed\u5728\u63a5\u5f85\u7fa4\u4f17\u8fc7\u7a0b\u4e2d\u6001\u5ea6\u751f\u786c\uff0c\u8a00\u8bed\u4e0d\u5f53\uff0c\u9020\u6210\u8d1f\u9762\u5f71\u54cd\uff0c\u6709\u635f\u57fa\u5c42\u515a\u5458\u5e72\u90e8\u5f62\u8c61\u30021\u670819\u65e5\uff0c\u7ecf\u9547\u515a\u59d4\u7814\u7a76\uff0c\u51b3\u5b9a\u514d\u53bb\u5468\u65ed\u5929\u738b\u793e\u533a\u515a\u603b\u652f\u4e66\u8bb0\u804c\u52a1\u3002\u4e00\u6bb5\u88ab\u66dd\u5149\u7684\u89c6\u9891\u663e\u793a\uff0c\u8fd1\u65e5\uff0c\u5728\u6210\u90fd\u5e02\u6c38\u5b81\u9547\u4eba\u6c11\u653f\u5e9c\u503c\u73ed\u5ba4\u5185\uff0c\u6709\u7fa4\u4f17\u53cd\u6620\u996e\u7528\u6c34\u76f8\u5173\u95ee\u9898\uff0c\u5929\u738b\u793e\u533a\u515a\u603b\u652f\u4e66\u8bb0\u5468\u65ed\u5728\u63a5\u5f85\u8fc7\u7a0b\u4e2d\u5bf9\u7fa4\u4f17\u79f0\uff0c\u201c\u4e3a\u4eba\u6c11\u670d\u52a1\u4e0d\u662f\u4e3a\u516c\u6c11\u670d\u52a1\uff0c\u4f60\u4e0d\u662f\u4eba\u6c11\u201d\uff0c\u5f15\u53d1\u5e7f\u6cdb\u8206\u8bba\u5173\u6ce8\u3002\u89c6\u9891\u4e2d\uff0c\u5468\u65ed\u75285\u5206\u949f\u7ed9\u6765\u8bbf\u7fa4\u4f17\u8bb2\u89e3\u201c\u516c
\u6c11\u201d\u4e0e\u201c\u4eba\u6c11\u201d\u7684\u533a\u522b\uff0c\u4e0d\u65f6\u7fd8\u7740\u4e8c\u90ce\u817f\uff0c\u6001\u5ea6\u968f\u610f\uff0c\u5e76\u79f0\u201c\u4f60\u76d1\u7763\u4e0d\u5230\u6211\u201d\u30021\u670819\u65e5\u665a\uff0c\u5f53\u4e8b\u4eba\u9648\u5973\u58eb\u544a\u8bc9\u6f8e\u6e43\u65b0\u95fb\u8bb0\u8005\uff0c\u7531\u4e8e\u5de5\u7a0b\u65bd\u5de5\uff0c\u5979\u6240\u5c45\u4f4f\u7684\u5730\u65b9\u51e0\u5e74\u524d\u5730\u4e0b\u6c34\u67af\u7aed\uff0c\u540e\u7531\u793e\u533a\u534f\u8c03\u9001\u6c34\u89e3\u51b3\u65e5\u5e38\u7528\u6c34\u30022017\u5e7412\u670831\u65e5\uff0c\u5979\u8ba1\u5212\u5f53\u65e5\u5728\u5bb6\u4e3e\u529e\u751f\u65e5\u5bb4\u5e2d\uff0c\u5e76\u63d0\u524d\u4e24\u5929\u5411\u793e\u533a\u63d0\u51fa\u7528\u6c34\u7533\u8bf7\uff0c\u4f4612\u670830\u65e5\u4e2d\u5348\uff0c\u996e\u6c34\u4ecd\u6ca1\u6709\u9001\u5230\u3002\u201c31\u53f7\u65e9\u4e0a5\u70b9\u53a8\u5e08\u5c31\u8981\u8fc7\u6765\uff0c\u6ca1\u529e\u6cd5\u53ea\u80fd\u53c8\u8dd1\u8fc7\u53bb\u53cd\u6620\u60c5\u51b5\u3002\u201d\u9648\u5973\u58eb\u8bf4\uff0c\u5979\u548c\u5bb6\u4eba\u5148\u5230\u5929\u738b\u793e\u533a\uff0c\u540e\u53c8\u5230\u6e29\u6c5f\u533a\u653f\u5e9c\uff0c\u4e00\u76f4\u7b49\u523031\u65e5\u51cc\u6668\uff0c\u88ab\u544a\u77e5\u793e\u533a\u5c06\u5b89\u6392\u4eba\u5904\u7406\uff0c\u8ba9\u5979\u4eec\u5230\u6c38\u5b81\u9547\u4eba\u6c11\u653f\u5e9c\u53bb\u7b49\u5f85\u3002\u5230\u8fbe\u6c38\u5b81\u9547\u4eba\u6c11\u653f\u5e9c\u540e\uff0c\u5929\u738b\u793e\u533a\u515a\u603b\u652f\u4e66\u8bb0\u5468\u65ed\u63a5\u5f85\u4e86\u5979\u4eec\u3002\u201c\u4e0d\u662f\u6765\u89e3\u51b3\u95ee\u9898\uff0c\u4e00\u5f00\u59cb\u5c31\u7ed9\u6211\u4eec\u2018\u666e\u6cd5\u2019\uff0c\u8bf4\u6211\u4eec\u4e0d\u662f\u4eba\u6c11\u3002\u201d\u9648\u5973\u58eb\u79f0\uff0c\u996e\u6c34\u6700\u7ec8\u6ca1\u6709\u9001\u6765\uff0c\u5979\u53ea\u597d\u8ba9\u4eb2\u4eba\u5e2e\u5fd9\u81ea\u5df1\u8fd0\u6c34\u8fc7\u6765\uff0c\u53c8\u4e70\u4e86\u4e9b\u6876\u88c5\u6c34\u56de\u6765\u3002\u56e0\u4e3a\u6c34\u4e0d\u591f\u7528\uff0c
\u539f\u8ba1\u5212\u8bf7\u5ba220\u684c\uff0c\u6700\u540e\u53ea\u529e\u4e8613\u684c\u3002\u89c6\u9891\u66dd\u5149\u540e\uff0c\u5f15\u53d1\u5e7f\u6cdb\u70ed\u8bae\u3002 \u9648\u5973\u58eb\u544a\u8bc9\u6f8e\u6e43\u65b0\u95fb\u8bb0\u8005\uff0c\u5f53\u5730\u653f\u5e9c\u4e5f\u76f8\u5f53\u91cd\u89c6\uff0c19\u65e5\u4e0b\u53482\u65f6\u8bb8\uff0c\u6e29\u6c5f\u533a\u7eaa\u59d4\u76d1\u5bdf\u5c40\u7684\u5de5\u4f5c\u4eba\u5458\u8054\u7cfb\u5979\uff0c\u5c31\u4e8b\u60c5\u7684\u7ecf\u8fc7\u8fdb\u884c\u4e86\u8be2\u95ee\uff0c\u5e76\u505a\u4e86\u7b14\u5f55\uff0c\u8be2\u95ee\u6301\u7eed\u4e863\u4e2a\u591a\u5c0f\u65f6\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/010301.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "010301", "comments": {"link": "http://coral.qq.com/2369810132"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/010301.htm", "title": ["\u7f8e\u8230\u8fdb\u5165\u4e2d\u56fd\u5357\u6d77\u9ec4\u5ca9\u5c9b\u9644\u8fd1\u9886\u6d77 \u5916\u4ea4\u90e8\u3001\u56fd\u9632\u90e8\u5f3a\u786c\u8868\u6001"], "passage": 
"\u65b0\u534e\u793e\u5317\u4eac1\u670820\u65e5\u6d88\u606f\uff0c\u5916\u4ea4\u90e8\u53d1\u8a00\u4eba\u9646\u6177\u5f53\u65e5\u5c31\u7f8e\u56fd\u4e00\u8258\u5bfc\u5f39\u9a71\u9010\u8230\u8fdb\u5165\u9ec4\u5ca9\u5c9b12\u6d77\u91cc\u5185\u6d77\u57df\u7b54\u8bb0\u8005\u95ee\u65f6\u8868\u793a\uff0c\u4e2d\u56fd\u6d77\u519b\u4f9d\u6cd5\u5bf9\u7f8e\u8230\u8fdb\u884c\u4e86\u8bc6\u522b\u67e5\u8bc1\uff0c\u4e88\u4ee5\u8b66\u544a\u9a71\u79bb\u3002\u6709\u8bb0\u8005\u95ee\uff1a\u636e\u4e86\u89e3\uff0c1\u670817\u65e5\u665a\uff0c\u7f8e\u56fd\u201c\u970d\u73c0\u201d\u53f7\u5bfc\u5f39\u9a71\u9010\u8230\u4ece\u9ec4\u5ca9\u5c9b\u897f\u5357\u4fa7\u8fdb\u5165\u8be5\u5c9b12\u6d77\u91cc\u8303\u56f4\u3002\u4e2d\u65b9\u5bf9\u6b64\u6709\u4f55\u8bc4\u8bba\uff1f\u9646\u6177\u8bf4\uff0c1\u670817\u65e5\u665a\uff0c\u7f8e\u56fd\u201c\u970d\u73c0\u201d\u53f7\u5bfc\u5f39\u9a71\u9010\u8230\u672a\u7ecf\u4e2d\u56fd\u653f\u5e9c\u5141\u8bb8\uff0c\u64c5\u81ea\u8fdb\u5165\u4e2d\u56fd\u9ec4\u5ca9\u5c9b12\u6d77\u91cc\u5185\u6d77\u57df\u3002\u4e2d\u56fd\u6d77\u519b\u4f9d\u6cd5\u5bf9\u7f8e\u8230\u8fdb\u884c\u4e86\u8bc6\u522b\u67e5\u8bc1\uff0c\u4e88\u4ee5\u8b66\u544a\u9a71\u79bb\u3002\u9646\u6177\u8868\u793a\uff0c\u7f8e\u65b9\u519b\u8230\u6709\u5173\u884c\u4e3a\u635f\u5bb3\u4e2d\u56fd\u7684\u4e3b\u6743\u548c\u5b89\u5168\u5229\u76ca\uff0c\u5bf9\u4e2d\u65b9\u5728\u6709\u5173\u6d77\u57df\u5f00\u5c55\u6b63\u5e38\u516c\u52a1\u6d3b\u52a8\u7684\u8239\u53ea\u548c\u4eba\u5458\u5b89\u5168\u9020\u6210\u4e25\u91cd\u5a01\u80c1\uff0c\u8fdd\u80cc\u56fd\u9645\u5173\u7cfb\u57fa\u672c\u51c6\u5219\u3002\u4e2d\u65b9\u5bf9\u6b64\u8868\u793a\u5f3a\u70c8\u4e0d\u6ee1\uff0c\u5c06\u91c7\u53d6\u5fc5\u8981\u63aa\u65bd\uff0c\u575a\u5b9a\u7ef4\u62a4\u4e2d\u56fd\u4e3b\u6743\u3002\u9646\u6177\u8868\u793a\uff0c\u4e2d\u56fd\u5bf9\u9ec4\u5ca9\u5c9b\u53ca\u5176\u9644\u8fd1\u6d77\u57df\u62e5\u6709\u65e0\u53ef\u4e89\u8fa9\u7684\u4e3b\u6743\u3002\u4e2d\u65b9\u4e00\u5411\u5c0a\u91cd\u548c\u7ef4\u62a4\u5404\u56fd\u4f9d\u636e\u56fd\u9645\u6cd5\u5728\u53
57\u6d77\u4eab\u6709\u7684\u822a\u884c\u548c\u98de\u8d8a\u81ea\u7531\uff0c\u4f46\u575a\u51b3\u53cd\u5bf9\u4efb\u4f55\u56fd\u5bb6\u4ee5\u822a\u884c\u548c\u98de\u8d8a\u81ea\u7531\u4e3a\u540d\uff0c\u635f\u5bb3\u4e2d\u56fd\u7684\u4e3b\u6743\u548c\u5b89\u5168\u5229\u76ca\u3002\u201c\u6211\u4eec\u5f3a\u70c8\u6566\u4fc3\u7f8e\u65b9\u7acb\u5373\u7ea0\u6b63\u9519\u8bef\uff0c\u505c\u6b62\u6b64\u7c7b\u6311\u8845\u884c\u4e3a\uff0c\u4ee5\u514d\u635f\u5bb3\u4e2d\u7f8e\u5173\u7cfb\u548c\u5730\u533a\u548c\u5e73\u7a33\u5b9a\u3002\u201d\u56fd\u9632\u90e8\u7f511\u670820\u65e5\u6d88\u606f\uff0c1\u670817\u65e5\uff0c\u7f8e\u56fd\u6d77\u519b\u201c\u970d\u73c0\u201d\u53f7\u5bfc\u5f39\u9a71\u9010\u8230\u64c5\u81ea\u8fdb\u5165\u4e2d\u56fd\u9ec4\u5ca9\u5c9b\u90bb\u8fd1\u6d77\u57df\uff0c\u4e2d\u56fd\u6d77\u519b\u201c\u9ec4\u5c71\u201d\u53f7\u5bfc\u5f39\u62a4\u536b\u8230\u5f53\u5373\u884c\u52a8\uff0c\u5bf9\u7f8e\u8230\u8fdb\u884c\u8bc6\u522b\u67e5\u8bc1\uff0c\u5e76\u4e88\u4ee5\u8b66\u544a\u9a71\u79bb\u3002\u5f53\u524d\uff0c\u5728\u4e2d\u56fd\u548c\u4e1c\u76df\u56fd\u5bb6\u7684\u5171\u540c\u52aa\u529b\u4e0b\uff0c\u5357\u6d77\u5c40\u52bf\u4e0d\u65ad\u8d8b\u7a33\u5411\u597d\u3002\u5728\u6b64\u5f62\u52bf\u4e0b\uff0c\u7f8e\u65b9\u4e00\u518d\u6d3e\u9063\u519b\u8230\u975e\u6cd5\u8fdb\u5165\u4e2d\u56fd\u5357\u6d77\u5c9b\u7901\u90bb\u8fd1\u6d77\u57df\uff0c\u5371\u53ca\u53cc\u65b9\u8230\u673a\u548c\u4eba\u5458\u5b89\u5168\uff0c\u5a01\u80c1\u4e2d\u56fd\u4e3b\u6743\u548c\u5b89\u5168\uff0c\u7834\u574f\u5730\u533a\u548c\u5e73\u7a33\u5b9a\uff0c\u4e0e\u4e24\u56fd\u4e24\u519b\u5173\u7cfb\u7a33\u5b9a\u53d1\u5c55\u7684\u52bf\u5934\u80cc\u9053\u800c\u9a70\u3002\u6211\u4eec\u5e0c\u671b\u7f8e\u65b9\u5c0a\u91cd\u4e2d\u65b9\u4e3b\u6743\uff0c\u5c0a\u91cd\u57df\u5185\u56fd\u5bb6\u7684\u52aa\u529b\uff0c\u4e0d\u8981\u65e0\u4e8b\u751f\u975e\uff0c\u5174\u98ce\u4f5c\u6d6a\u3002\u4e2d\u56fd\u519b\u961f\u5c06\u7ee7\u7eed\u5c65\u884c\u9632\u536b\u804c\u8d23\uff0c\u52a0\u5927\u6d77\u7a7a\u5de1\u903b\u8b66\u6212\u529b\u5ea6\uff0c
\u575a\u5b9a\u634d\u536b\u56fd\u5bb6\u7684\u4e3b\u6743\u548c\u5b89\u5168\uff0c\u575a\u5b9a\u7ef4\u62a4\u5730\u533a\u548c\u5e73\u7a33\u5b9a\u3002"}} -------------------------------------------------------------------------------- /机器学习入门/标签传播算法(LP).py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/3/25 11:28 4 | # @Author : mazicwong 5 | # @File : 标签传播算法(LP).py 6 | import time 7 | import math 8 | import numpy as np 9 | from label_propagation import labelPropagation 10 | 11 | 12 | # show 13 | def show(Mat_Label, labels, Mat_Unlabel, unlabel_data_labels): 14 | import matplotlib.pyplot as plt 15 | 16 | for i in range(Mat_Label.shape[0]): 17 | if int(labels[i]) == 0: 18 | plt.plot(Mat_Label[i, 0], Mat_Label[i, 1], 'Dr') 19 | elif int(labels[i]) == 1: 20 | plt.plot(Mat_Label[i, 0], Mat_Label[i, 1], 'Db') 21 | else: 22 | plt.plot(Mat_Label[i, 0], Mat_Label[i, 1], 'Dy') 23 | 24 | for i in range(Mat_Unlabel.shape[0]): 25 | if int(unlabel_data_labels[i]) == 0: 26 | plt.plot(Mat_Unlabel[i, 0], Mat_Unlabel[i, 1], 'or') 27 | elif int(unlabel_data_labels[i]) == 1: 28 | plt.plot(Mat_Unlabel[i, 0], Mat_Unlabel[i, 1], 'ob') 29 | else: 30 | plt.plot(Mat_Unlabel[i, 0], Mat_Unlabel[i, 1], 'oy') 31 | 32 | plt.xlabel('X1'); 33 | plt.ylabel('X2') 34 | plt.xlim(0.0, 12.) 35 | plt.ylim(0.0, 12.) 
36 | plt.show() 37 | 38 | 39 | def loadCircleData(num_data): 40 | center = np.array([5.0, 5.0]) 41 | radiu_inner = 2 42 | radiu_outer = 4 43 | num_inner = num_data / 3 44 | num_outer = num_data - num_inner 45 | 46 | data = [] 47 | theta = 0.0 48 | for i in range(int(num_inner)): 49 | pho = (theta % 360) * math.pi / 180 50 | tmp = np.zeros(2, np.float32) 51 | tmp[0] = radiu_inner * math.cos(pho) + np.random.rand(1) + center[0] 52 | tmp[1] = radiu_inner * math.sin(pho) + np.random.rand(1) + center[1] 53 | data.append(tmp) 54 | theta += 2 55 | 56 | theta = 0.0 57 | for i in range(int(num_outer)): 58 | pho = (theta % 360) * math.pi / 180 59 | tmp = np.zeros(2, np.float32) 60 | tmp[0] = radiu_outer * math.cos(pho) + np.random.rand(1) + center[0] 61 | tmp[1] = radiu_outer * math.sin(pho) + np.random.rand(1) + center[1] 62 | data.append(tmp) 63 | theta += 1 64 | 65 | Mat_Label = np.zeros((2, 2), np.float32) 66 | Mat_Label[0] = center + np.array([-radiu_inner + 0.5, 0]) 67 | Mat_Label[1] = center + np.array([-radiu_outer + 0.5, 0]) 68 | labels = [0, 1] 69 | Mat_Unlabel = np.vstack(data) 70 | return Mat_Label, labels, Mat_Unlabel 71 | 72 | 73 | def loadBandData(num_unlabel_samples): 74 | # Mat_Label = np.array([[5.0, 2.], [5.0, 8.0]]) 75 | # labels = [0, 1] 76 | # Mat_Unlabel = np.array([[5.1, 2.], [5.0, 8.1]]) 77 | 78 | Mat_Label = np.array([[5.0, 2.], [5.0, 8.0]]) 79 | labels = [0, 1] 80 | num_dim = Mat_Label.shape[1] 81 | Mat_Unlabel = np.zeros((num_unlabel_samples, num_dim), np.float32) 82 | Mat_Unlabel[:num_unlabel_samples / 2, :] = (np.random.rand(num_unlabel_samples / 2, num_dim) - 0.5) * np.array( 83 | [3, 1]) + Mat_Label[0] 84 | Mat_Unlabel[num_unlabel_samples / 2: num_unlabel_samples, :] = (np.random.rand(num_unlabel_samples / 2, 85 | num_dim) - 0.5) * np.array([3, 1]) + \ 86 | Mat_Label[1] 87 | return Mat_Label, labels, Mat_Unlabel 88 | 89 | 90 | # main function 91 | if __name__ == "__main__": 92 | num_unlabel_samples = 800 93 | # Mat_Label, labels, Mat_Unlabel = 
loadBandData(num_unlabel_samples) 94 | Mat_Label, labels, Mat_Unlabel = loadCircleData(num_unlabel_samples) 95 | 96 | ## Notice: when use 'rbf' as our kernel, the choice of hyper parameter 'sigma' is very import! It should be 97 | ## chose according to your dataset, specific the distance of two data points. I think it should ensure that 98 | ## each point has about 10 knn or w_i,j is large enough. It also influence the speed of converge. So, may be 99 | ## 'knn' kernel is better! 100 | # unlabel_data_labels = labelPropagation(Mat_Label, Mat_Unlabel, labels, kernel_type = 'rbf', rbf_sigma = 0.2) 101 | unlabel_data_labels = labelPropagation(Mat_Label, Mat_Unlabel, labels, kernel_type='knn', knn_num_neighbors=10, 102 | max_iter=400) 103 | show(Mat_Label, labels, Mat_Unlabel, unlabel_data_labels) -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8HJ6VRF0001875O.json: -------------------------------------------------------------------------------- 1 | {"source": "netease", "cmtId": "D8HJ6VRF0001875O", "date": "20180119", "comments": {"link": "http://comment.news.163.com/news_guoji2_bbs/D8HJ6VRF0001875O.html"}, "newsId": "D8HJ6VRF0001875O", "contents": {"passage": "\n \uff08\u539f\u6807\u9898\uff1a\u5370\u5a92\u79f0\u4e2d\u56fd\u5728\u6d1e\u6717\u9644\u8fd1\u4fee\u5efa\u5e9e\u5927\u519b\u4e8b\u8bbe\u65bd \u4e2d\u65b9\u56de\u5e94\uff09\n
\u6d77\u5916\u7f511\u670819\u65e5\u7535\u00a0\u5916\u4ea4\u90e8\u53d1\u8a00\u4eba\u9646\u617719\u65e5\u4e3b\u6301\u4f8b\u884c\u8bb0\u8005\u4f1a\uff0c\u5c31\u8fd1\u671f\u70ed\u70b9\u8fdb\u884c\u56de\u5e94\u3002\u76f8\u5173\u5185\u5bb9\u5982\u4e0b\uff1a
\u95ee\uff1a\u5a92\u4f53\u62ab\u9732\u7684\u6700\u65b0\u536b\u661f\u56fe\u50cf\u663e\u793a\uff0c\u4e2d\u56fd\u6b63\u5728\u8ddd\u6d1e\u6717\u5bf9\u5cd9\u53d1\u751f\u5730\u5f88\u8fd1\u7684\u5730\u65b9\u4fee\u5efa\u5e9e\u5927\u7684\u519b\u4e8b\u8bbe\u65bd\u3002\u5370\u5ea6\u5916\u4ea4\u90e8\u53d1\u8868\u58f0\u660e\u91cd\u7533\u8be5\u8bbe\u65bd\u5e76\u4e0d\u5728\u5bf9\u5cd9\u5730\u533a\u3002\u4f46\u8fd9\u5728\u5370\u5ea6\u653f\u515a\u4e2d\u5f15\u53d1\u4e86\u62c5\u5fe7\u3002\u5370\u5ea6\u5916\u4ea4\u90e8\u8fd8\u79f0\u201c\u6b64\u524d\u5bf9\u5cd9\u5730\u70b9\u7684\u73b0\u72b6\u5e76\u672a\u53d1\u751f\u6539\u53d8\u201d\u3002\u4e2d\u65b9\u5bf9\u6709\u5173\u62a5\u9053\u6709\u4f55\u8bc4\u8bba\uff1f
\u7b54\uff1a\u6211\u521a\u521a\u6ce8\u610f\u5230\u6709\u5173\u62a5\u9053\uff0c\u4e0d\u4e86\u89e3\u5177\u4f53\u60c5\u51b5\uff0c\u4e5f\u4e0d\u6e05\u695a\u4f60\u6240\u8bf4\u7684\u536b\u661f\u56fe\u50cf\u6765\u6e90\u3002
\u76f8\u4fe1\u4f60\u975e\u5e38\u6e05\u695a\u4e2d\u65b9\u5728\u6d1e\u6717\u95ee\u9898\u4e0a\u7684\u7acb\u573a\u3002\u6d1e\u6717\u5730\u533a\u5386\u6765\u5c5e\u4e8e\u4e2d\u56fd\uff0c\u4e00\u76f4\u5728\u4e2d\u56fd\u6709\u6548\u7ba1\u8f96\u4e4b\u4e0b\uff0c\u4e0d\u5b58\u5728\u4e89\u8bae\u3002\u4e3a\u4e86\u5b88\u8fb9\u9700\u8981\u548c\u6539\u5584\u5f53\u5730\u519b\u6c11\u7684\u751f\u4ea7\u751f\u6d3b\u6761\u4ef6\uff0c\u4e2d\u65b9\u957f\u671f\u4ee5\u6765\u4e00\u76f4\u5728\u6d1e\u6717\u5730\u533a\u8fdb\u884c\u5305\u62ec\u9053\u8def\u5728\u5185\u7684\u57fa\u7840\u8bbe\u65bd\u5efa\u8bbe\uff0c\u8fd9\u662f\u4e2d\u65b9\u5728\u81ea\u5df1\u9886\u571f\u4e0a\u7684\u4e3b\u6743\u884c\u4e3a\uff0c\u5b8c\u5168\u6b63\u5f53\u5408\u6cd5\u3002\u6b63\u5982\u4e2d\u65b9\u4e0d\u4f1a\u5bf9\u5370\u65b9\u5728\u5370\u5ea6\u9886\u571f\u4e0a\u7684\u5efa\u8bbe\u6d3b\u52a8\u54c1\u5934\u8bba\u8db3\u4e00\u6837\uff0c\u5176\u4ed6\u56fd\u5bb6\u5bf9\u4e2d\u56fd\u5728\u81ea\u5df1\u9886\u571f\u4e0a\u7684\u4e3b\u6743\u884c\u4e3a\u54c1\u5934\u8bba\u8db3\u4e5f\u662f\u4e0d\u5408\u9002\u7684\u3002
\n
\u95ee\uff1a\u8003\u8651\u5230\u8fd9\u4e2a\u62a5\u9053\u8868\u8fbe\u4e86\u5bf9\u6d1e\u6717\u5730\u533a\u518d\u6b21\u53d1\u751f\u5bf9\u5cd9\u7684\u62c5\u5fe7\u3002\u53bb\u5e74\u7684\u5bf9\u5cd9\u4e8b\u4ef6\u5e94\u8be5\u5df2\u7ecf\u5f97\u5230\u4e86\u89e3\u51b3\uff0c\u4f60\u8ba4\u4e3a\u4f1a\u518d\u6b21\u53d1\u751f\u7c7b\u4f3c\u4e8b\u4ef6\u5417\uff1f
\u7b54\uff1a\u6709\u5173\u5370\u5ea6\u8fb9\u9632\u90e8\u961f\u8d8a\u754c\u9020\u6210\u7684\u6d1e\u6717\u5bf9\u5cd9\u4e8b\u4ef6\uff0c\u524d\u4e24\u5929\u6211\u5df2\u7ecf\u8bf4\u8fc7\uff0c\u5370\u5ea6\u519b\u65b9\u7684\u9ad8\u5b98\u4e5f\u627f\u8ba4\u662f\u5370\u5ea6\u519b\u961f\u8d8a\u754c\u4e86\u3002\u8fd9\u4e00\u4e8b\u4ef6\u4f7f\u4e2d\u5370\u53cc\u8fb9\u5173\u7cfb\u7ecf\u53d7\u4e86\u4e25\u5cfb\u8003\u9a8c\u3002\u6211\u4eec\u5e0c\u671b\u5370\u5ea6\u519b\u65b9\u80fd\u591f\u6c72\u53d6\u6559\u8bad\uff0c\u907f\u514d\u7c7b\u4f3c\u4e8b\u60c5\u518d\u6b21\u53d1\u751f\u3002\u4e2d\u5370\u4e24\u56fd\u9886\u5bfc\u4eba\u5728\u53bb\u5e749\u6708\u91d1\u7816\u56fd\u5bb6\u9886\u5bfc\u4eba\u53a6\u95e8\u4f1a\u6664\u671f\u95f4\uff0c\u5df2\u7ecf\u5c31\u5982\u4f55\u5728\u65b0\u5f62\u52bf\u4e0b\u8fdb\u4e00\u6b65\u6539\u5584\u548c\u53d1\u5c55\u4e2d\u5370\u5173\u7cfb\u8fbe\u6210\u4e86\u91cd\u8981\u5171\u8bc6\u3002\u6211\u4eec\u5e0c\u671b\u5370\u5ea6\u6709\u5173\u65b9\u9762\u80fd\u5207\u5b9e\u9075\u7167\u4e24\u56fd\u9886\u5bfc\u4eba\u8fbe\u6210\u7684\u91cd\u8981\u5171\u8bc6\uff0c\u540c\u4e2d\u65b9\u76f8\u5411\u800c\u884c\uff0c\u5171\u540c\u7ef4\u62a4\u8fb9\u5883\u5730\u533a\u7684\u548c\u5e73\u7a33\u5b9a\uff0c\u5171\u540c\u81f4\u529b\u4e8e\u4e2d\u5370\u5173\u7cfb\u7684\u6539\u5584\u53d1\u5c55\u3002
", "link": "http://news.163.com/18/0119/18/D8HJ6VRF0001875O.html", "title": ["\u5370\u5a92\u79f0\u4e2d\u56fd\u5728\u6d1e\u6717\u9644\u8fd1\u4fee\u5efa\u5e9e\u5927\u519b\u4e8b\u8bbe\u65bd \u4e2d\u65b9\u56de\u5e94"]}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/009612.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "009612", "comments": {"link": "http://coral.qq.com/2369744788"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/009612.htm", "title": ["\u592e\u884c\u53d1\u5e03\u6539\u8fdb\u4e2a\u4eba\u94f6\u884c\u8d26\u6237\u5206\u7c7b\u7ba1\u7406\u901a\u77e5 \u2161\u3001\u2162\u7c7b\u6237\u5f00\u6237\u5c06\u66f4\u4fbf\u6377"], "passage": "\u592e\u5e7f\u7f51\u5317\u4eac1\u670820\u65e5\u6d88\u606f\uff08\u8bb0\u8005\u67f4\u534e\uff09\u636e\u4e2d\u56fd\u4e4b\u58f0\u300a\u592e\u5e7f\u65b0\u95fb\u300b\u62a5\u9053\uff0c\u6628\u5929\uff0819\u65e5\uff09\u665a\u95f4\uff0c\u592e\u884c\u5b98\u7f51\u53d1\u5e03\u300a\u5173\u4e8e\u6539\u8fdb\u4e2a\u4eba\u94f6\u884c\u8d26\u6237\u5206\u7c7b\u7ba1\u7406\u6709\u5173\u4e8b\u9879\u7684\u901a\u77e5\u300b\uff0c\u5ba3\u5e03\u8fdb\u4e00\u6b65\u53d1\u6325\u2162\u7c7b\u6237\u5728\u5c0f\u989d\u652f\u4ed8\u9886\u57df\u7684\u4f5c\u7528\uff0c\u63a8\u52a8\u2161\u3001\u2162\u7c7b\u6237\u6210\u4e3a\u4e2a\u4eba\u529e\u7406\u7f51\u4e0a\u652f\u4ed8\u3001\u79fb\u52a8\u652f\u4ed8\u7b49\u5c0f\u989d\u6d88\u8d39\u4e1a\u52a1\u7684\u4e3b\u8981\u6e20\u9053\u3002\u6839\u636e\u300a\u901a\u77e5\u300b\u548c\u7b54\u8bb0\u8005\u95ee\u7684\u89e3\u91ca\uff0c\u4e00\u662f\u5f00\u6237\u6e20\u9053\u591a\u6837\u3002\u300a\u901a\u77e5\u300b\u8981\u6c42\u56fd\u6709\u5546\u4e1a\u94f6\u884c\u3001\u80a1\u4efd\u5236\u5546\u4e1a\u94f6\u884c\u7b49\u5e94\u4e8e2018\u5e746\u6708\u5e95\u524d\u5b9e\u73b0\u672c\u94f6\u884c\u67dc\u9762\u548c\u7f51\u4e0a\u94f6\u884c\u3001\u624b\u673a\u94f6\u884c\u3001\u76f4\u9500
\u94f6\u884c\u3001\u8fdc\u7a0b\u89c6\u9891\u67dc\u5458\u673a\u548c\u667a\u80fd\u67dc\u5458\u673a\u7b49\u7535\u5b50\u6e20\u9053\u529e\u7406\u4e2a\u4eba\u2161\u3001\u2162\u7c7b\u6237\u5f00\u7acb\u7b49\u4e1a\u52a1\uff0c\u5176\u4ed6\u94f6\u884c\u5219\u5e94\u57282018\u5e74\u5e95\u524d\u5b9e\u73b0\u3002\u4e8c\u662f\u5f00\u6237\u624b\u7eed\u7b80\u5316\u3002\u300a\u901a\u77e5\u300b\u660e\u786e\u4e00\u5b9a\u524d\u63d0\u4e0b\u5f00\u7acb\u2161\u3001\u2162\u7c7b\u6237\u65f6\u65e0\u9700\u4e2a\u4eba\u586b\u5199\u8eab\u4efd\u4fe1\u606f\u3001\u51fa\u793a\u8eab\u4efd\u8bc1\u4ef6\u7b49\uff0c\u5728\u6709\u6548\u843d\u5b9e\u8d26\u6237\u5b9e\u540d\u5236\u8981\u6c42\u7684\u540c\u65f6\uff0c\u5927\u5e45\u63d0\u5347\u5f00\u6237\u4f53\u9a8c\u3002\u5176\u6b21\uff0c\u5728\u8d26\u6237\u4f7f\u7528\u65b9\u9762\uff0c\u5728\u6ee1\u8db3\u53cd\u6d17\u94b1\u3001\u53cd\u8bc8\u9a97\u8981\u6c42\u7684\u524d\u63d0\u4e0b\uff0c\u653e\u5bbd\u2162\u7c7b\u6237\u7684\u4f7f\u7528\u9650\u5236\u3002\u4e00\u662f\u975e\u9762\u5bf9\u9762\u7ebf\u4e0a\u5f00\u7acb\u2162\u7c7b\u6237\u80fd\u591f\u63a5\u53d7\u975e\u7ed1\u5b9a\u8d26\u6237\u5165\u91d1\uff0c\u4ee5\u6ee1\u8db3\u4e2a\u4eba\u4e4b\u95f4\u5c0f\u989d\u6536\u4ed8\u6b3e\u3001\u53d1\u653e\u7ea2\u5305\u3001\u4e0e\u4e2a\u4eba\u652f\u4ed8\u8d26\u6237\u5bf9\u63a5\u3001\u94f6\u884c\u6216\u5546\u6237\u5c0f\u989d\u8fd4\u73b0\u5956\u52b1\u7b49\u573a\u666f\u9700\u6c42\u3002\u4e8c\u662f\u2162\u7c7b\u6237\u8d26\u6237\u4f59\u989d\u4ece1000\u5143\u63d0\u5347\u4e3a2000\u5143\u3002\u4e09\u662f\u5141\u8bb8\u94f6\u884c\u5411\u2162\u7c7b\u6237\u53d1\u653e\u672c\u884c\u5c0f\u989d\u6d88\u8d39\u8d37\u6b3e\u5e76\u901a\u8fc7\u2162\u7c7b\u6237\u8fd8\u6b3e\uff0c\u9f13\u52b1\u94f6\u884c\u57fa\u4e8e\u2162\u7c7b\u6237\u63d0\u4f9b\u66f4\u591a\u5143\u5316\u7684\u4ea7\u54c1\u8bbe\u8ba1\u548c\u529f\u80fd\u7ec4\u5408\u3002\u592e\u884c\u8868\u793a\uff0c\u300a\u901a\u77e5\u300b\u91c7\u53d6\u4e86\u591a\u79cd\u5b89\u5168\u9632\u8303\u63aa\u65bd\u3002\u4e00\u662f\u5c06\u2162\u7c7b\u6237\u6d88\u8d39\u548c\u
7f34\u8d39\u652f\u4ed8\u3001\u975e\u7ed1\u5b9a\u8d26\u6237\u8d44\u91d1\u8f6c\u51fa\u7b49\u51fa\u91d1\u7684\u65e5\u7d2f\u8ba1\u9650\u989d\u4ece\u539f5000\u5143\u4e0b\u8c03\u81f32000\u5143\uff0c\u5e74\u7d2f\u8ba1\u9650\u989d\u4ece\u539f10\u4e07\u5143\u4e0b\u8c03\u4e3a5\u4e07\u5143\uff0c\u901a\u8fc7\u63a7\u5236\u2162\u7c7b\u6237\u652f\u51fa\u989d\u5ea6\uff0c\u786e\u4fdd\u98ce\u9669\u76f8\u5bf9\u53ef\u63a7\u3002\u4e8c\u662f\u89c4\u5b9a\u975e\u9762\u5bf9\u9762\u7ebf\u4e0a\u5f00\u7acb\u7684\u2162\u7c7b\u6237\u901a\u8fc7\u7ed1\u5b9a\u8d26\u6237\u5165\u91d1\u540e\uff0c\u624d\u53ef\u63a5\u53d7\u975e\u7ed1\u5b9a\u8d26\u6237\u5165\u91d1\uff0c\u9632\u8303\u4e0d\u6cd5\u5206\u5b50\u901a\u8fc7\u83b7\u53d6\u4ed6\u4eba\u8eab\u4efd\u4fe1\u606f\u548c\u94f6\u884c\u8d26\u6237\u4fe1\u606f\u540e\u5192\u540d\u5f00\u7acb\u3002\u4e09\u662f\u89c4\u5b9a\u540c\u4e00\u5bb6\u94f6\u884c\u901a\u8fc7\u7ebf\u4e0a\u4e3a\u540c\u4e00\u4e2a\u4eba\u53ea\u80fd\u5f00\u7acb\u4e00\u4e2a\u5141\u8bb8\u975e\u7ed1\u5b9a\u8d26\u6237\u5165\u91d1\u7684\u2162\u7c7b\u6237\uff0c\u9632\u6b62\u4e0d\u6cd5\u5206\u5b50\u901a\u8fc7\u5f00\u7acb\u591a\u4e2a\u6b64\u7c7b\u8d26\u6237\u53d8\u76f8\u6269\u5927\u2162\u7c7b\u6237\u7684\u8f6c\u8d26\u9650\u989d\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20171009/039986.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "039986", "comments": {"link": "http://coral.qq.com/2166352744"}, "date": "20171009", "contents": {"link": "https://news.qq.com/a/20171009/039986.htm", "title": ["\u8bbe\u8ba1\u7f8e\u5b66\uff0c\u8ba9\u6b27\u7c73\u8304\u6d77\u9a6cAqua Terra\u8155\u8868\u7115\u7136\u4e00\u65b0"], "passage": "[]\u6b27\u7c73\u8304\u63a8\u51fa\u6d77\u9a6c\u7cfb\u5217Aqua 
Terra\u5168\u65b0\u8868\u6b3e\uff0c\u5728\u5907\u53d7\u6b22\u8fce\u7684\u8bbe\u8ba1\u57fa\u7840\u4e0a\uff0c\u878d\u5165\u5de7\u5999\u9769\u65b0\uff0c\u8d4b\u4e88\u65f6\u8ba1\u5168\u65b0\u5916\u89c2\u3002\u6b27\u7c73\u8304\u5168\u65b0\u53d1\u5e03\u7684\u6d77\u9a6c\u7cfb\u5217 Aqua Terra \u81f3\u81fb\u5929\u6587\u53f0\u8868\uff0c\u7b80\u7ea6\u3001\u5927\u6c14\uff0c\u5448\u73b0\u5e73\u8861\u4e4b\u7f8e\u3002\u8868\u6b3e\u5728\u5907\u53d7\u6b22\u8fce\u7684\u8bbe\u8ba1\u5143\u7d20\u4e2d\u878d\u5165\u8bf8\u591a\u5de7\u5999\u9769\u65b0\uff0c\u8d4b\u4e88\u65f6\u8ba1\u5168\u65b0\u5916\u89c2\uff0c\u901a\u8fc7\u81f3\u81fb\u5929\u6587\u53f0\u8ba4\u8bc1\uff0c\u521b\u9020\u7684\u4f18\u96c5\u5353\u8d8a\u9b45\u529b\u65f6\u8ba1\uff0c\u4ee4\u4eba\u96be\u4ee5\u6297\u62d2\u3002\u6b27\u7c73\u8304\u6d77\u9a6c\u7cfb\u5217 Aqua Terra \u8155\u8868\u914d\u5907\u4e09\u89d2\u5f62\u5c0f\u65f6\u523b\u5ea6\uff0c\u98ce\u683c\u5927\u6c14\u7eaf\u7cb9\uff0c\u540c\u65f6\u62e5\u6709\u5f88\u9ad8\u7684\u6613\u8bfb\u6027\u3002\u8fd9\u6b21\uff0c\u6b27\u7c73\u8304\u5c06\u8868\u76d8\u8bbe\u8ba1\u518d\u7b80\u5316\uff0c\u5728\u4fdd\u7559\u7ecf\u5178\u7684\u5f27\u5f62\u8868\u8033\u7684\u57fa\u7840\u4e0a\u5bf9\u8868\u58f3\u8fdb\u884c\u4e86\u91cd\u65b0\u8bbe\u8ba1\uff0c\u4e3a\u8868\u80cc\u589e\u6dfb\u6ce2\u7eb9\u8fb9\u7f18\uff0c\u4ee4\u6574\u679a\u8155\u8868\u5c55\u73b0\u5bf9\u79f0\u4e4b\u7f8e\u3002\u539f\u672c\u8868\u76d8\u4e0a\u7684\u9632\u6c34\u7cfb\u6570\u5b57\u6837\u88ab\u8f6c\u79fb\u81f3\u8868\u80cc\uff0c\u65e5\u671f\u7a97\u53e3\u4e5f\u75313\u70b9\u4f4d\u7f6e\u8c03\u6574\u81f36\u70b9\u4f4d\u7f6e\uff0c\u7528\u4ee5\u81f4\u656c1952\u5e74\u63a8\u51fa\u7684\u9996\u6b3e\u5e26\u6709\u65e5\u671f\u7a97\u663e\u793a\u7684\u6b27\u7c73\u8304\u8155\u8868\uff0c\u8d2f\u5f7b\u5bf9\u79f0\u7b80\u7ea6\u7684\u8bbe\u8ba1\u7f8e\u5b66\u3002\u67da\u6728\u7eb9\u7406\u8868\u76d8\u582a\u79f0\u6b27\u7c73\u8304\u6d77\u9a6c\u7cfb\u5217 Aqua Terra 
\u8155\u8868\u6700\u4e3a\u663e\u8457\u7684\u7279\u5f81\uff0c\u5176\u8bbe\u8ba1\u7075\u611f\u6765\u6e90\u4e8e\u6e38\u8247\u4e0a\u7684\u67da\u6728\u7532\u677f\u30022017\u5e74\uff0c\u6b27\u7c73\u8304\u5c06\u6807\u5fd7\u6027\u7684\u5782\u76f4\u7eb9\u7406\u53d8\u4e3a\u6c34\u5e73\u7eb9\u7406\uff0c\u4ee4\u6574\u679a\u8155\u8868\u66f4\u663e\u7cbe\u81f4\uff0c\u7115\u53d1\u5d2d\u65b0\u9b45\u529b\u3002\u8bbe\u8ba1\u7b80\u6d01\u53c8\u4e0d\u5931\u7cbe\u81f4\uff0c\u5448\u73b0\u4e86\u4e0e\u6d77\u6d0b\u76f8\u5951\u5408\u7684\u4f11\u95f2\u751f\u6d3b\u65b9\u5f0f\u3002\u6b27\u7c73\u8304\u5728\u6b64\u6b3e\u8868\u5e26\u8bbe\u8ba1\u4e0a\u4e5f\u5320\u5fc3\u72ec\u8fd0\uff0c\u90e8\u5206\u8868\u6b3e\u642d\u914d\u6a61\u80f6\u8868\u5e26\uff0c\u521b\u9020\u6027\u5730\u901a\u8fc7\u7cbe\u94a2\u6216Sedna\u00ae 18K\u91d1\u94fe\u8282\u5c06\u8868\u5e26\u4e0e\u8868\u58f3\u76f8\u8fde\uff0c\u4ee4\u8155\u8868\u62e5\u6709\u8212\u9002\u79f0\u624b\u7684\u4f69\u5e26\u611f\u53d7\uff0c\u66f4\u52a0\u5bcc\u6709\u8fd0\u52a8\u6c14\u606f\u3002\u91d1\u5c5e\u8868\u94fe\u5219\u8fd0\u7528\u4e86\u6b27\u7c73\u8304\u4e13\u5229\u7684\u94fe\u9488\u8868\u94fe\u4e0e\u66f4\u4e3a\u575a\u56fa\u7684\u94fe\u8282\uff0c\u6574\u4f53\u7f8e\u611f\u500d\u589e\u3002\u6b27\u7c73\u8304\u6d77\u9a6c\u7cfb\u5217 Aqua Terra \u81f3\u81fb\u5929\u6587\u53f0\u8868\u7cfb\u5217\u62e5\u670941mm\u548c38mm\u4e24\u79cd\u8868\u58f3\u5c3a\u5bf8\u3002\u8155\u8868\u8868\u58f3\u91c7\u7528\u7cbe\u94a2\u3001Sedna\u00ae 18K\u91d1\u6216\u7cbe\u94a2\u4e0eSedna\u00ae 
18K\u91d1\u6df7\u5408\u6253\u9020\u800c\u6210\uff0c\u5177\u6709\u4e30\u5bcc\u7684\u8868\u6b3e\u53ef\u4f9b\u9009\u62e9\u3002\u540c\u65f6\u8155\u8868\u8fd8\u53ef\u642d\u914d\u7cbe\u94a2\u8868\u94fe\u3001\u76ae\u9769\u8868\u5e26\u6216\u9020\u578b\u7cbe\u81f4\u7684\u6a61\u80f6\u8868\u5e26\u3002\u591a\u79cd\u4e0d\u540c\u8868\u6b3e\uff0c\u642d\u914d\u7537\u58eb\u72ec\u4e00\u65e0\u4e8c\u7684\u98ce\u683c\u3002\u9646\u5730\u4e0e\u6d77\u6d0b\u3001\u4f20\u627f\u4e0e\u521b\u65b0\u3001\u5de5\u4f5c\u4e0e\u4f11\u95f2\uff0c\u8fd9\u5c31\u662f\u6b27\u7c73\u8304\u6d77\u9a6c\u7cfb\u5217 Aqua Terra \u81f3\u81fb\u5929\u6587\u53f0\u8868\u6240\u878d\u5408\u7684\u72ec\u7279\u9b45\u529b\uff0c\u4ee4\u5176\u6210\u4e3a\u5de5\u4f5c\u751f\u6d3b\uff0c\u65f6\u5c1a\u642d\u914d\u4e2d\u7684\u81f3\u81fb\u4e4b\u9009\u3002\u8bf7\u70b9\u51fb\u94fe\u63a5\uff0c\u4e86\u89e3\u66f4\u591a\u4ea7\u54c1\u4fe1\u606f\u3002\uff08\u6ce8\uff1a\u6b64\u6587\u5c5e\u4e8e\u767b\u8f7d\u7684\u5546\u4e1a\u4fe1\u606f\uff0c\u6587\u7ae0\u5185\u5bb9\u4e0d\u4ee3\u8868\u672c\u7f51\u89c2\u70b9\uff0c\u4ec5\u4f9b\u53c2\u8003\uff09"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/tencent/20180120/004124.json: -------------------------------------------------------------------------------- 1 | {"source": "tencent", "newsId": "004124", "comments": {"link": "http://coral.qq.com/2369229201"}, "date": "20180120", "contents": {"link": "https://news.qq.com/a/20180120/004124.htm", "title": ["\u5927\u96fe\u9ec4\u8272\u9884\u8b66\uff018\u7701\u6709\u6d53\u96fe \u5c40\u5730\u80fd\u89c1\u5ea6\u4e0d\u8db350\u7c73"], "passage": "\u4e2d\u65b0\u7f511\u670820\u65e5\u7535 
\u636e\u4e2d\u592e\u6c14\u8c61\u53f0\u7f51\u7ad9\u6d88\u606f\uff0c\u4e2d\u592e\u6c14\u8c61\u53f01\u670820\u65e506\u65f6\u7ee7\u7eed\u53d1\u5e03\u5927\u96fe\u9ec4\u8272\u9884\u8b66\uff0c\u9884\u8ba1\uff0c20\u65e5\u65e9\u6668\u81f3\u4e0a\u5348\uff0c\u5c71\u4e1c\u5317\u90e8\u548c\u5357\u90e8\u3001\u6cb3\u5357\u4e1c\u90e8\u3001\u6e56\u5317\u4e2d\u4e1c\u90e8\u3001\u6c5f\u82cf\u5927\u90e8\u3001\u5b89\u5fbd\u5927\u90e8\u3001\u6d59\u6c5f\u5317\u90e8\u3001\u91cd\u5e86\u4e2d\u90e8\u3001\u8d35\u5dde\u5317\u90e8\u548c\u4e2d\u90e8\u7b49\u5730\u6709\u5927\u96fe\uff0c\u5176\u4e2d\u6c5f\u82cf\u4e2d\u5317\u90e8\u3001\u5b89\u5fbd\u4e2d\u5317\u90e8\u3001\u6cb3\u5357\u4e1c\u90e8\u3001\u6e56\u5317\u4e2d\u90e8\u7b49\u5730\u7684\u90e8\u5206\u5730\u533a\u6709\u80fd\u89c1\u5ea6\u4f4e\u4e8e500\u7c73\u7684\u6d53\u96fe\uff0c\u5c40\u5730\u6709\u80fd\u89c1\u5ea6\u4e0d\u8db350\u7c73\u7684\u7279\u5f3a\u6d53\u96fe\u3002\u6b64\u5916\uff0c20\u81f321\u65e5\u6cb3\u5317\u5357\u90e8\u3001\u6cb3\u5357\u3001\u5c71\u4e1c\u4e2d\u897f\u90e8\u3001\u6c5f\u82cf\u3001\u5b89\u5fbd\u7b49\u5730\u91cd\u6c61\u67d3\u5929\u6c14\u7ef4\u6301\uff0c\u5176\u4e2d21\u65e5\u53d7\u504f\u4e1c\u8def\u51b7\u7a7a\u6c14\u5f71\u54cd\uff0c\u6cb3\u5317\u4e1c\u90e8\u7b49\u5730\u91cd\u6c61\u67d3\u5929\u6c14\u7565\u6709\u51cf\u5f31\u300222\u65e5\u591c\u95f4\u8d77\uff0c\u53d7\u8f83\u5f3a\u51b7\u7a7a\u6c14\u5f71\u54cd\uff0c\u533a\u57df\u91cd\u6c61\u67d3\u5929\u6c14\u81ea\u5317\u5411\u5357\u9010\u6e10\u51cf\u5f31\u6d88\u6563\u300220\u65e5\uff0c\u53d7\u51b7\u7a7a\u6c14\u5f71\u54cd\uff0c\u5185\u8499\u53e4\u4e1c\u5317\u90e8\u3001\u4e1c\u5317\u5730\u533a\u7b49\u5730\u6709\u5927\u98ce\u964d\u6e29\u5929\u6c14\uff0c\u964d\u6e29\u5e45\u5ea6\u57284~6\u2103\uff0c\u5c40\u5730\u53ef\u8fbe8\u2103\u4ee5\u4e0a\uff0c\u5e76\u4f34\u67094~6\u7ea7\u98ce\u300222\u65e5\u8d77\uff0c\u65b0\u4e00\u80a1\u51b7\u7a7a\u6c14\u5c06\u5f71\u54cd\u6211\u56fd\u4e2d\u4e1c\u90e8\u5730\u533a\uff0c\u957f\u6c5f\u4e2d\u4e0b\u6e38\u53ca\u5176\u4ee5\u5317\u5730\u533a\u67094~6\u7ea7\u50
4f\u5317\u98ce\uff0c\u4e2d\u4e1c\u90e8\u5927\u90e8\u5730\u533a\u6c14\u6e29\u5c06\u4e0b\u964d4~8\u2103\uff0c\u5185\u8499\u53e4\u3001\u4e1c\u5317\u5730\u533a\u4e1c\u90e8\u5c40\u5730\u964d\u6e2910\u2103\u4ee5\u4e0a\u3002\u672a\u6765\u4e09\u5929\u9884\u62a5\u65b9\u9762\uff0c20\u65e508\u65f6\u81f321\u65e508\u65f6\uff0c\u65b0\u7586\u4f0a\u7281\u6cb3\u8c37\u548c\u5929\u5c71\u5730\u533a\u3001\u7518\u8083\u897f\u90e8\u3001\u6cb3\u5317\u5317\u90e8\u3001\u5185\u8499\u53e4\u4e2d\u90e8\u504f\u5357\u5730\u533a\u7b49\u5730\u6709\u5c0f\u5230\u4e2d\u96ea\u6216\u9635\u96ea\uff1b\u897f\u5357\u5730\u533a\u4e1c\u90e8\u3001\u6c5f\u6c49\u5357\u90e8\u3001\u6c5f\u5357\u897f\u90e8\u548c\u5317\u90e8\u3001\u534e\u5357\u897f\u90e8\u7b49\u5730\u6709\u5c0f\u5230\u4e2d\u96e8\u3002\u5185\u8499\u53e4\u897f\u90e8\u3001\u9ed1\u9f99\u6c5f\u5317\u90e8\u7b49\u5730\u67094~6\u7ea7\u98ce\u300221\u65e508\u65f6\u81f322\u65e508\u65f6\uff0c\u5357\u65b9\u964d\u6c34\u8303\u56f4\u6269\u5927\u3002\u534e\u5317\u5317\u90e8\u3001\u5185\u8499\u53e4\u4e2d\u90e8\u504f\u5357\u3001\u5c71\u4e1c\u4e1c\u5317\u90e8\u7b49\u5730\u6709\u5c0f\u96ea\u6216\u96e8\u5939\u96ea\uff0c\u5176\u4e2d\uff0c\u6cb3\u5317\u5317\u90e8\u3001\u5c71\u4e1c\u534a\u5c9b\u5c40\u5730\u6709\u4e2d\u96ea\uff1b\u6c5f\u6dee\u4e1c\u90e8\u548c\u5357\u90e8\u3001\u6e56\u5317\u5357\u90e8\u3001\u6c5f\u5357\u3001\u897f\u5357\u5730\u533a\u4e1c\u5357\u90e8\u3001\u534e\u5357\u897f\u90e8\u548c\u5317\u90e8\u7b49\u5730\u6709\u5c0f\u5230\u4e2d\u96e8\u3002\u5185\u8499\u53e4\u4e2d\u897f\u90e8\u3001\u8fbd\u4e1c\u534a\u5c9b\u3001\u5c71\u4e1c\u534a\u5c9b\u7b49\u5730\u67094~6\u7ea7\u98ce\u300222\u65e508\u65f6\u81f323\u65e508\u65f6\uff0c\u6cb3\u5317\u4e2d\u90e8\u3001\u5c71\u4e1c\u5317\u90e8\u3001\u4e1c\u5317\u5730\u533a\u4e1c\u5357\u90e8\u7b49\u5730\u6709\u5c0f\u96ea\u6216\u96e8\u5939\u96ea\uff1b\u897f\u5357\u5730\u533a\u4e1c\u90e8\u3001\u6e56\u5317\u897f\u90e8\u3001\u6e56\u5357\u897f\u90e8\u548c\u5357\u90e8\u3001\u5e7f\u897f\u7b49\u5730\u6709\u5c0f\u96e8\u3002\u5185\u8499\u53e4\
u5927\u90e8\u3001\u534e\u5317\u3001\u8fbd\u5b81\u3001\u9ec4\u6dee\u4e1c\u90e8\u7b49\u5730\u67094~6\u7ea7\u98ce\u3002\u4e1c\u6d77\u5927\u90e8\u3001\u5357\u6d77\u4e1c\u5317\u90e8\u5c06\u67096~8\u7ea7\u3001\u9635\u98ce9\u7ea7\u5927\u98ce\u3002\u9632\u5fa1\u6307\u5357\uff1a1\u3001\u7531\u4e8e\u80fd\u89c1\u5ea6\u8f83\u4f4e\uff0c\u9a7e\u9a76\u4eba\u5458\u5e94\u63a7\u5236\u901f\u5ea6\uff0c\u786e\u4fdd\u5b89\u5168\uff1b2\u3001\u673a\u573a\u3001\u9ad8\u901f\u516c\u8def\u3001\u8f6e\u6e21\u7801\u5934\u91c7\u53d6\u63aa\u65bd\uff0c\u4fdd\u4ea4\u901a\u5b89\u5168\u3002"}} -------------------------------------------------------------------------------- /crawl/news/news_crawl/docs/netease/20180119/D8GOCKJU0001899N.json: -------------------------------------------------------------------------------- 1 | {"newsId": "D8GOCKJU0001899N", "date": "20180119", "source": "netease", "comments": {"link": "http://comment.news.163.com/news2_bbs/D8GOCKJU0001899N.html"}, "contents": {"title": ["\u5168\u9762\u4e24\u5b69\u7b2c\u4e8c\u5e74\uff1a\u51fa\u751f\u4eba\u53e3\u603b\u91cf\u548c\u51fa\u751f\u7387\u53cc\u53cc\u4e0b\u964d"], "link": "http://news.163.com/18/0119/10/D8GOCKJU0001899N.html", "passage": "\n \uff08\u539f\u6807\u9898\uff1a\u5168\u9762\u4e24\u5b69\u7b2c\u4e8c\u5e74\uff1a\u51fa\u751f\u4eba\u53e3\u603b\u91cf\u548c\u51fa\u751f\u7387\u53cc\u53cc\u4e0b\u964d\uff09\n
2017\u5e74\u5168\u56fd\u51fa\u751f\u4eba\u53e3\u6570\u91cf\u548c\u4eba\u53e3\u51fa\u751f\u7387\u53cc\u53cc\u4e0b\u964d\u3002\u56fd\u5bb6\u7edf\u8ba1\u5c4018\u65e5\u516c\u5e03\u6570\u636e\u663e\u793a\uff0c2017\u5e74\u5168\u5e74\u5171\u51fa\u751f\u4eba\u53e31723\u4e07\u4eba\uff0c\u6bd42016\u5e74\u51cf\u5c1163\u4e07\u4eba\u3002\u540c\u65f6\u8001\u9f84\u5316\u7a0b\u5ea6\u7ee7\u7eed\u52a0\u5927\uff0c60\u5c81\u4ee5\u4e0a\u53ca65\u5c81\u4ee5\u4e0a\u8001\u4eba\u5360\u603b\u4eba\u53e3\u7684\u6bd4\u91cd\u90fd\u6709\u660e\u663e\u4e0a\u5347\u3002
\u51fa\u751f\u4eba\u6570\u51cf\u5c11
\u53bb\u5e74\u662f\u5168\u9762\u4e24\u5b69\u653f\u7b56\u5b9e\u65bd\u7684\u7b2c\u4e8c\u5e74\u3002\u6839\u636e\u6b64\u524d\u6709\u5173\u65b9\u9762\u7684\u5224\u65ad\uff0c\u5168\u9762\u4e24\u5b69\u7684\u653f\u7b56\u6548\u679c\u4f53\u73b0\u6709\u6ede\u540e\u6027\uff0c\u5e94\u8be5\u57282017\u5e74\u4e4b\u540e\u9010\u6b65\u663e\u73b0\uff0c\u56e0\u6b642017\u5e74\u51fa\u751f\u4eba\u53e3\u6570\u91cf\u4f1a\u660e\u663e\u9ad8\u4e8e2016\u5e74\u3002\u4f46\u4ece\u56fd\u5bb6\u7edf\u8ba1\u5c40\u516c\u5e03\u7684\u6570\u636e\u6765\u770b\uff0c2017\u5e74\u51fa\u751f\u4eba\u53e3\u6bd42016\u5e74\u76841786\u4e07\u4eba\u51cf\u5c11\u4e8663\u4e07\u4eba\u3002
\u4eba\u53e3\u51fa\u751f\u7387\u4e5f\u540c\u6837\u51fa\u73b0\u4e86\u4e0b\u964d\u3002\u53bb\u5e74\u5168\u56fd\u4eba\u53e3\u51fa\u751f\u7387\u4e3a12.43\u2030\uff0c2016\u5e74\u8fd9\u4e00\u6570\u636e\u4e3a12.95\u2030\u3002
\u4e2d\u56fd\u793e\u4f1a\u79d1\u5b66\u9662\u4eba\u53e3\u4e0e\u52b3\u52a8\u7ecf\u6d4e\u7814\u7a76\u6240\u4eba\u53e3\u7edf\u8ba1\u5ba4\u4e3b\u4efb\u738b\u5e7f\u5dde\u8868\u793a\uff0c2017\u5e74\u51fa\u751f\u4eba\u53e3\u6570\u91cf\u6bd42016\u5e74\u8fd8\u8981\u5c11\uff0c\u8fd9\u4e3b\u8981\u662f\u56e0\u4e3a\u4e00\u5b69\u51fa\u751f\u6570\u91cf\u4e0b\u964d\u5e45\u5ea6\u5f88\u5927\uff0c\u5982\u679c\u4e0d\u662f\u5168\u9762\u4e24\u5b69\u653f\u7b56\uff0c\u51fa\u751f\u89c4\u6a21\u4e0b\u964d\u5e45\u5ea6\u4f1a\u66f4\u5927\u3002
\u957f\u671f\u5173\u6ce8\u751f\u80b2\u610f\u613f\u4e0e\u751f\u80b2\u884c\u4e3a\u7814\u7a76\u7684\u793e\u79d1\u9662\u4eba\u53e3\u4e0e\u52b3\u52a8\u7ecf\u6d4e\u7814\u7a76\u6240\u7814\u7a76\u5458\u90d1\u771f\u771f\u8868\u793a\uff0c2017\u5e74\u51fa\u73b0\u51fa\u751f\u4eba\u53e3\u7684\u4e0b\u964d\u8bf4\u660e\uff0c\u4e2a\u4eba\u751f\u80b2\u610f\u613f\u548c\u751f\u80b2\u884c\u4e3a\u53d7\u5230\u5f88\u591a\u590d\u6742\u56e0\u7d20\u7684\u5f71\u54cd\uff0c\u5305\u62ec\u7ecf\u6d4e\u80fd\u529b\u3001\u5e74\u9f84\u3001\u751f\u80b2\u504f\u597d\u7b49\u7b49\uff0c\u653f\u7b56\u5bf9\u751f\u80b2\u884c\u4e3a\u7684\u5f71\u54cd\u5e76\u6ca1\u6709\u539f\u6765\u9884\u60f3\u5f97\u5927\u3002
\u8001\u9f84\u5316\u52a0\u901f
\u6839\u636e\u56fd\u5bb6\u7edf\u8ba1\u5c40\u6570\u636e\u663e\u793a\uff0c\u4e2d\u56fd\u4eba\u53e3\u7684\u8001\u9f84\u5316\u7a0b\u5ea6\u6b63\u5728\u52a0\u901f\u52a0\u6df1\u30022017\u5e74\uff0c\u5168\u56fd\u4eba\u53e3\u4e2d60\u5468\u5c81\u53ca\u4ee5\u4e0a\u4eba\u53e324090\u4e07\u4eba\uff0c\u5360\u603b\u4eba\u53e3\u768417.3%\uff0c\u5176\u4e2d65\u5468\u5c81\u53ca\u4ee5\u4e0a\u4eba\u53e315831\u4e07\u4eba\uff0c\u5360\u603b\u4eba\u53e3\u768411.4%\u300260\u5468\u5c81\u4ee5\u4e0a\u4eba\u53e3\u548c65\u5468\u5c81\u4ee5\u4e0a\u4eba\u53e3\u90fd\u6bd4\u4e0a\u5e74\u589e\u52a0\u4e860.6\u4e2a\u767e\u5206\u70b9\u3002
\u52b3\u52a8\u5e74\u9f84\u4eba\u53e3\u5360\u603b\u4eba\u53e3\u6bd4\u91cd\u6301\u7eed\u964d\u4f4e\uff0c\u53bb\u5e7416\u81f359\u5468\u5c81\u7684\u52b3\u52a8\u5e74\u9f84\u4eba\u53e3\u4e3a90199\u4e07\u4eba\uff0c\u5360\u603b\u4eba\u53e3\u7684\u6bd4\u91cd\u4e3a64.9%\u30022016\u5e74\uff0c\u5168\u56fd\u52b3\u52a8\u5e74\u9f84\u4eba\u53e3\u6570\u91cf\u4e3a90747\u4e07\u4eba\uff0c\u5360\u603b\u4eba\u53e3\u7684\u6bd4\u91cd\u4e3a65.6%\u3002
\u7edf\u8ba1\u663e\u793a\uff0c\u53bb\u5e74\u4e2d\u56fd\u7684\u57ce\u9547\u5316\u901f\u5ea6\u5728\u6301\u7eed\u63d0\u9ad8\u3002\u57ce\u9547\u5e38\u4f4f\u4eba\u53e381347\u4e07\u4eba\uff0c\u6bd4\u4e0a\u5e74\u672b\u589e\u52a02049\u4e07\u4eba;\u4e61\u6751\u5e38\u4f4f\u4eba\u53e357661\u4e07\u4eba\uff0c\u51cf\u5c111312\u4e07\u4eba;\u57ce\u9547\u4eba\u53e3\u5360\u603b\u4eba\u53e3\u6bd4\u91cd(\u57ce\u9547\u5316\u7387)\u4e3a58.52%\uff0c\u6bd4\u4e0a\u5e74\u672b\u63d0\u9ad81.17\u4e2a\u767e\u5206\u70b9\u3002
"}, "cmtId": "D8GOCKJU0001899N"} -------------------------------------------------------------------------------- /crawl/news/news_crawl/crawl/spiders/newsspider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import scrapy 5 | import re 6 | from scrapy.selector import Selector 7 | from crawl.items import NeteaseItem,TencentItem,SinaItem 8 | from scrapy.http import Request 9 | from urllib.request import urlopen 10 | from crawl.maziclib.news_fun import ListCombiner 11 | 12 | 13 | class NeteaseNewsSpider(scrapy.Spider): 14 | name = 'netease_news_spider' #最后要调用的名字 15 | start_urls = ['http://news.163.com'] 16 | allowed_domains = ['news.163.com'] 17 | 18 | url_pattern = r'(http://news\.163\.com)/(\d{2})/(\d{4})/(\d+)/(\w+)\.html' 19 | 20 | def parse(self, response): # response即网页数据 21 | pat = re.compile(self.url_pattern) 22 | next_urls = re.findall(pat, str(response.body)) 23 | 24 | ###debug 25 | #article = next_urls[0][0]+'/'+next_urls[0][1]+'/'+next_urls[0][2]+'/'+next_urls[0][3]+'/'+next_urls[0][4]+'.html' 26 | #yield Request(article, callback=self.parse_news) 27 | ###debug 28 | 29 | for next_url in next_urls: 30 | article = next_url[0]+'/'+next_url[1]+'/'+next_url[2]+'/'+next_url[3]+'/'+next_url[4]+'.html' 31 | yield Request(article,callback=self.parse_news) 32 | 33 | def parse_news(self, response): 34 | item = NeteaseItem() 35 | selector = Selector(response) 36 | pattern = re.match(self.url_pattern, response.url) 37 | 38 | 39 | source = 'netease' 40 | date = '20'+pattern.group(2)+pattern.group(3) 41 | newsId = pattern.group(5) 42 | cmtId = pattern.group(5) 43 | 44 | productKey = re.findall(re.compile(r'"productKey" : "(\w+)"'), str(response.body))[0] 45 | comments_api = 'http://comment.news.163.com/api/v1/products/' + productKey + '/threads/' + newsId 46 | boardId = re.findall(r'"boardId":"(\w+)"',str(urlopen(comments_api).read()))[0] 47 | comments = 
('http://comment.news.163.com/'+boardId+'/'+newsId+'.html') 48 | 49 | item['source'] = 'netease' 50 | item['date'] = date 51 | item['newsId'] = newsId 52 | item['cmtId'] = cmtId 53 | #item['boardId'] = boardId 54 | item['comments'] = {'link' : comments} 55 | item['contents'] = {'link' : str(response.url), 'title' : u'', 'passage' : u''} 56 | item['contents']['title'] = selector.xpath('//*[@id="epContentLeft"]/h1/text()').extract() 57 | item['contents']['passage'] = ListCombiner(selector.xpath('//*[@id="endText"]/p').extract()) 58 | yield item 59 | 60 | 61 | 62 | 63 | class TencentNewsSpider(scrapy.Spider): 64 | name = 'tencent_news_spider' #最后要调用的名字 65 | start_urls = ['http://news.qq.com'] 66 | allowed_domains = ['news.qq.com'] 67 | 68 | #https://news.qq.com/a/20180120/000738.htm 69 | url_pattern = r'http://(\w+)\.qq\.com/a/(\d{8})/(\d+)\.htm' 70 | 71 | def parse(self, response): # response即网页数据 72 | pat = re.compile(self.url_pattern) 73 | next_urls = re.findall(pat, str(response.body)) 74 | 75 | ### debug 76 | #article = 'http://'+next_urls[0][0]+'.qq.com/a/'+next_urls[0][1]+'/'+next_urls[0][2]+'.htm' 77 | #print(article) 78 | #yield Request(article,callback=self.parse_news) 79 | ### debug 80 | 81 | for next_url in next_urls: 82 | article = 'http://'+next_url[0]+'.qq.com/a/'+next_url[1]+'/'+next_url[2]+'.htm' 83 | yield Request(article,callback=self.parse_news) 84 | 85 | 86 | def parse_news(self, response): 87 | item = TencentItem() 88 | selector = Selector(response) 89 | url_pattern2 = r'(\w+)://(\w+)\.qq\.com/a/(\d{8})/(\d+)\.htm' 90 | pattern = re.match(url_pattern2, str(response.url)) 91 | 92 | source = 'tencent' 93 | date = pattern.group(3) 94 | newsId = pattern.group(4) 95 | cmtId = re.findall(re.compile(r'cmt_id = (\d+);'), str(response.body))[0] 96 | comments = 'http://coral.qq.com/' + cmtId 97 | 98 | 99 | item['source'] = source 100 | item['date'] = date 101 | item['newsId'] = newsId 102 | item['comments'] = {'link' : comments} 103 | item['contents'] = 
{'link' : str(response.url), 'title' : u'', 'passage' : u''} 104 | item['contents']['title'] = selector.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()').extract() 105 | item['contents']['passage'] = ListCombiner(selector.xpath('//*[@id="Cnt-Main-Article-QQ"]/p/text()').extract()) #这里要不要留下那些