├── 02_pdfmerger.py ├── LICENSE ├── .gitignore ├── tomysql.js ├── README.md ├── 04_juejinxiaoce.py ├── 01_downloader.py ├── geek.js ├── 03_geeksplider.py └── 05_weibosplider.py /02_pdfmerger.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 麦晓杰 lavna 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /tomysql.js: -------------------------------------------------------------------------------- 1 | const path = require('path') 2 | const fs = require('fs') 3 | const mysql = require('mysql') 4 | 5 | // database connection config 6 | const dbconfig = { 7 | host: 'your mysql host', 8 | port: 'your mysql port', 9 | user: 'your mysql username', 10 | password: 'your mysql password', 11 | database: 'your mysql database' 12 | } 13 | const pool = mysql.createPool(dbconfig); 14 | 15 | // read the json file 16 | const file = path.join(__dirname, './articles154.json') 17 | // name of the target table 18 | const tableName = 'article' 19 | fs.readFile(file, 'utf-8', function(err, data) { 20 | const json = JSON.parse(data) 21 | pool.getConnection(function(err, connection) { 22 | // sql -- adapt the logic below to your own needs 23 | var sql = `INSERT INTO article (article_title, audio_time, audio_size, pid, audio_url, audio_download_url, mdhtml, ctime, id, article_cover) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`; 24 | for (let i = 0; i < json.length; i++) { 25 | console.log(json[i].article_title) 26 | var values = [ 27 | json[i].article_title, 28 | json[i].audio_time, 29 | json[i].audio_size, 30 | json[i].pid, 31 | json[i].audio_url, 32 | json[i].audio_download_url, 33 | json[i].article_content, 34 | json[i].ctime, 35 | json[i].id, 36 | json[i].article_cover 37 | ] 38 | connection.query(sql, values, function(error, results, fields) { 39 | // When done with the connection, release it. 40 | // Handle error after the release. 41 | if (error) throw error; 42 | // Don't use the connection here, it has been returned to the pool. 43 | }); 44 | } 45 | pool.end(); 46 | }) 47 | }) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # json2mysql 5 | 6 | This update adds a script that imports the data of a json file into MySQL. It is written in Node.js. 7 | 8 | The corresponding file is tomysql.js -- feel free to dig into it if you are interested. 9 | 10 | 11 | --- 12 | 13 | # pythonCollection (a collection of Python scripts) 14 | 15 | **This project has only been tested under Python 2.7; other versions are not maintained -- adapt as needed** 16 | 17 | - File downloader 18 | - PDF merger 19 | - Geek Time column downloader 20 | - Juejin booklet downloader 21 | - Sina Weibo spider 22 | 23 | ## File downloader (downloader.py) 24 | 25 | >Reads file URLs from a database or from a file and downloads them locally 26 | 27 | Dependencies: 28 | 29 | - requests 30 | - progressbar 31 | - MySQLdb (not needed if you don't read the URLs from a database) 32 | 33 | 34 | ## PDF merger (pdfmerger.py) 35 | 36 | >Merges PDF files 37 | 38 | The code is still being cleaned up.. (a rough sketch of the idea is included below)
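Until 02_pdfmerger.py is filled in, here is a minimal sketch of the intended behaviour. It assumes the PyPDF2 package (`pip install PyPDF2`), which is not currently listed as a dependency of this repo, and the paths are examples only:

```python
# -*- coding: utf-8 -*-
# Minimal PDF merge sketch (assumes PyPDF2; paths below are examples only).
import glob
from PyPDF2 import PdfFileMerger

def merge_pdfs(src_dir, out_path):
    merger = PdfFileMerger()
    for pdf in sorted(glob.glob(src_dir + '/*.pdf')):
        print('appending ' + pdf)
        merger.append(pdf)      # add each source document in name order
    with open(out_path, 'wb') as fp:
        merger.write(fp)        # write the combined document
    merger.close()

if __name__ == '__main__':
    merge_pdfs('./pdfs', './merged.pdf')
```

Sorting the glob keeps the page order deterministic; adjust it if you need a custom order.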
39 | 40 | ## Geek Time column downloader (geeksplider.py) 41 | 42 | >Downloads a Geek Time column you have purchased as a json file 43 | 44 | **Update 2019-5-30** 45 | 46 | The Geek Time site recently added rate limiting and some timestamp checks on its requests. 47 | 48 | The corresponding Python file has not been updated for this; with a proxy pool and some pacing it should work again (a small sketch sits just before 03_geeksplider.py below). 49 | 50 | This time a new trick is used instead: run a JS script in the browser console, send the requests from there, and save the result as a json file. 51 | 52 | The corresponding JS file is geek.js -- take a look if you are interested, I find it quite a fun approach. 53 | 54 | --end 55 | --- 56 | 57 | Dependencies: 58 | 59 | - requests 60 | 61 | The output format is: 62 | ``` 63 | { 64 | "article_title": "", 65 | "audio_time": "", 66 | "ctime": 1521993600, 67 | "audio_size": 4375851, 68 | "pid": 76, 69 | "audio_url": "", 70 | "mdhtml": "", 71 | "audio_download_url": "", 72 | "id": 4969, 73 | "article_cover": "" 74 | } 75 | ``` 76 | 77 | Feel free to adapt the code to your own needs. 78 | 79 | ## Juejin booklet downloader (juejinxiaoce.py) 80 | 81 | >Downloads a Juejin booklet you have purchased as json 82 | 83 | Dependencies: 84 | 85 | - requests 86 | 87 | The output format is: 88 | 89 | ``` 90 | { 91 | "article_title": "", 92 | "pid": "", 93 | "mdhtml": "", 94 | "mdtext": "", 95 | "id": "", 96 | "createdAt": "" 97 | } 98 | ``` 99 | 100 | ## Sina Weibo spider (weibosplider.py) 101 | 102 | >Crawls the Sina Weibo posts of a given user 103 | 104 | Dependencies: 105 | 106 | - requests 107 | - MySQLdb (not needed if you don't save to a database) 108 | - logging 109 | 110 | This script stores the data directly in a database (a sketch of the table layout sits just before 05_weibosplider.py below); you can change part of the logic to store it somewhere else. 111 | 112 | **Use flexibly -- the code is for reference only** 113 | 114 | # issue 115 | 116 | If you run into problems, open an issue on this project: [issues](https://github.com/maixiaojie/pythonCollection/issues) 117 | 118 | 119 | # PR 120 | 121 | If you also have a handy little tool to share, please open a new pull request 122 | 123 | 124 | -------------------------------------------------------------------------------- /04_juejinxiaoce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2019-01-16 15:43:36 4 | # @Author : maixiaojie (tracywyj@gmail.com) 5 | # @Link : https://yk.mcust.cn 6 | # @Version : $Id$ 7 | import requests 8 | import json 9 | import codecs 10 | 11 | cookies = "" 12 | bookid = "" 13 | token = "" 14 | uid = "" 15 | client_id = "" 16 | 17 | class Tools(object): 18 | def __init__(self, cookies, bookid, token, uid, client_id): 19 | super(Tools, self).__init__() 20 | self.cookies = cookies 21 | self.token = token 22 | self.bookid = bookid 23 | self.uid = uid 24 | self.client_id = client_id 25 | self.headers = { 26 | 'Cookie': self.cookies, 27 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36 DID:3441301122:DID', 28 | 'Referer': 'https://juejin.im', 29 | 'Origin': 'https://juejin.im', 30 | 'Host': 'xiaoce-cache-api-ms.juejin.im', 31 | 'Content-Type': 'application/json' 32 | } 33 | def getall(self): 34 | headers = self.headers 35 | url = "https://xiaoce-cache-api-ms.juejin.im/v1/get" 36 | payload = {'uid': '', 'client_id': '1548407371349', 'token': self.token, 'src': 'web', 'id': self.bookid} 37 | r = requests.get(url, params=payload, headers=headers) 38 | res = r.json() 39 | code = res.get('m') 40 | if(code == 'ok'): 41 | section = res.get('d').get('section') 42 | return section 43 | else: 44 | return [] 45 | 46 | # fetch the data of a single section 47 | def get_article_detail(self, sectionid): 48 | headers = self.headers 49 | url = 'https://xiaoce-cache-api-ms.juejin.im/v1/getSection' 50 | payload = {'uid': self.uid, 'client_id': self.client_id, 'token': self.token, 'src': 'web', 'sectionId': sectionid} 51 | r = response = requests.get(url, params=payload, headers=headers) 52 | res = r.json() 53 | print res 54 | print r.url 55 | code = res.get('m') 56 | if(code == 'ok'): 57 | data = res.get('d') 58 | section_dict = {} 59 | section_dict['id'] = data.get('id') 60 | section_dict['pid'] = data.get('metaId') 61 | 
section_dict['article_title'] = data.get('title') 62 | section_dict['mdhtml'] = data.get('html') 63 | section_dict['mdtext'] = data.get('content') 64 | section_dict['createdAt'] = data.get('createdAt') 65 | with codecs.open('book-'+str(self.bookid)+".json", "a+", "utf-8") as fp: 66 | fp.write(json.dumps(section_dict,indent=4, ensure_ascii=False)+',\r\n') 67 | 68 | def main(self): 69 | ids = self.getall() 70 | if len(ids) > 0: 71 | for id in ids: 72 | self.get_article_detail(id) 73 | 74 | tool = Tools(cookies, bookid, token, uid, client_id) 75 | tool.main() 76 | # tool.get_article_detail('5bdd0d83f265da615f76ba57') 77 | 78 | -------------------------------------------------------------------------------- /01_downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2019-02-19 15:39:03 (Lantern Festival) 4 | # @Author : maixiaojie (tracywyj@gmail.com) 5 | # @Link : https://maixiaojie.github.io 6 | # @Version : 1.0.0 7 | # 8 | ############ File downloader, python2.7 ######################## 9 | # 10 | # This script has only been tested under python2.7 (used to download 8000+ videos); adapt it yourself for other versions 11 | # Other Python versions are not maintained 12 | # 13 | # Requires the requests, progressbar and MySQLdb libraries 14 | # 15 | # `pip install requests` 16 | # `pip install progressbar` 17 | # MySQLdb -- not needed if you don't read the URLs from a database 18 | 19 | import requests 20 | import urllib 21 | import os 22 | import time 23 | import progressbar 24 | import MySQLdb 25 | 26 | # derive the file name from the url; adjust the expression below to your own needs 27 | def getName(path): 28 | return path.split('/')[-1] 29 | 30 | # download one file 31 | def download(url, dirpath): 32 | # set whatever request headers your files need; adjust as needed 33 | headers = { 34 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36", 35 | "Referer": "http://www.maiziedu.com/" 36 | } 37 | filename = getName(url) 38 | path = dirpath + '/' + filename 39 | r = requests.get(url, headers=headers, stream=True) 40 | chunk_size = 1024 41 | content_size = int(r.headers['content-length']) 42 | if r.status_code == 200: 43 | print '[File Total Size]: %0.2f Mb' % (content_size / 1024.0 / 1024) 44 | print '[File Name]: ' + filename 45 | with open(path, "wb") as file: 46 | widgets = ['Progress: ', progressbar.Percentage(), ' ',progressbar.Bar(marker='#', left='[', right=']'),' ', progressbar.ETA(), ' ', progressbar.FileTransferSpeed()] 47 | pbar = progressbar.ProgressBar(widgets=widgets, maxval=content_size).start() 48 | for data in r.iter_content(chunk_size=chunk_size): 49 | if data: 50 | file.write(data) 51 | file.flush() 52 | pbar.update(pbar.currval + len(data)) 53 | pbar.finish() 54 | 55 | def main(): 56 | # all downloaded files go into this folder; any valid name will do 57 | # the folder is created first if it does not exist yet 58 | path = "./video" 59 | isExists = os.path.exists(path) 60 | if not isExists: 61 | os.makedirs(path) 62 | 63 | # here the file urls are read from a database 64 | # you can also read them from a plain text file instead (see the sketch after this script) 65 | db = MySQLdb.connect('ip:port', 'user', 'password', 'database', charset='utf8') 66 | cursor = db.cursor() 67 | cursor.execute("SELECT video_url from lesson_detail limit 100 offset 0 ") 68 | rs = cursor.fetchall() 69 | cursor.close() 70 | db.close() 71 | 72 | # iterate over the urls; skip any file that already exists locally 73 | for i, url in enumerate(rs): 74 | fileurl = ''.join(url) 75 | filename = getName(fileurl) 76 | filepath = path+'/'+filename 77 | isFileExists = os.path.exists(filepath) 78 | if isFileExists: 79 | print '[msg]: ' + filename + ' has been downloaded...' 80 | else: 81 | print 'downloading [' + str(i) + '] ' + fileurl 82 | download(fileurl, path) 83 | 84 | 85 | main() --------------------------------------------------------------------------------
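The script above pulls its URLs from MySQL; as the comment in main() notes, a plain text file works just as well. A small hedged sketch, assuming a file called urls.txt with one URL per line (the file name is only an example):

```python
# Sketch: feed download() from a text file instead of MySQL.
# Assumes ./urls.txt with one URL per line (the name is arbitrary).
def read_urls(listfile):
    urls = []
    with open(listfile) as fp:
        for line in fp:
            line = line.strip()
            if line and not line.startswith('#'):  # skip blank lines and comments
                urls.append(line)
    return urls

# Inside main(), the MySQLdb block could then be replaced with:
# rs = [(u,) for u in read_urls('./urls.txt')]
# which keeps the rest of the loop (''.join(url)) unchanged.
```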
/geek.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Runs in the browser console 3 | * @type {Array} 4 | */ 5 | 6 | // the article ids to fetch; the Geek Time site rate-limits, so the ids are split into two batches. You could also use proxies etc. 7 | var ids = [77345, 77749, 77804, 78158, 78168, 78884, 79319, 79539, 80011, 80021, 80042, 80240, 80260, 80311, 81730, 82397, 82711, 82764, 83302, 83719, 83860, 84365, 84633, 85031, 85341, 85745, 86117]; 8 | var ids2 = [86400, 86823, 87179, 87234, 87808, 88275, 88538, 88827, 89151, 89491, 89832, 90148, 90485, 90998, 91325, 91644, 92227, 92663, 93110, 93216, 93289, 93777, 94156, 94644, 94979, 95469, 95833, 96269, 96809, 97144]; 9 | // the data collected at the end 10 | var rs = [] 11 | /** 12 | * init: inject FileSaver.js into the document and kick off the requests 13 | * @return {[type]} [description] 14 | */ 15 | function init() { 16 | var src = 'https://cdn.bootcss.com/FileSaver.js/2014-11-29/FileSaver.js'; 17 | var script = document.createElement('script'); 18 | script.src = src; 19 | var heads = document.getElementsByTagName("head"); 20 | if (heads.length) 21 | heads[0].appendChild(script); 22 | else 23 | document.documentElement.appendChild(script); 24 | script.onload = function() { 25 | console.log('script loaded') 26 | start() 27 | } 28 | } 29 | // save the collected data as a json file 30 | function downloadJson(data) { 31 | var blob = new Blob([JSON.stringify(data)], { type: "" }); 32 | saveAs(blob, "data.json"); 33 | } 34 | 35 | // fetch one article -- just an ajax request 36 | function fetch(id) { 37 | var data = JSON.stringify({ 38 | "include_neighbors": "false", 39 | "id": id 40 | }); 41 | var xhr = new XMLHttpRequest(); 42 | xhr.withCredentials = true; 43 | xhr.addEventListener("readystatechange", function() { 44 | if (this.readyState === 4) { 45 | var res = JSON.parse(this.responseText); 46 | if (res.code == 0) { 47 | var data = res.data; 48 | var item = { 49 | id: data.id, 50 | pid: data.cid, 51 | article_content: data.article_content, 52 | article_cover: data.article_cover, 53 | article_ctime: data.article_ctime, 54 | article_title: data.article_title, 55 | audio_download_url: data.audio_download_url, 56 | audio_size: data.audio_size, 57 | audio_time: data.audio_time, 58 | audio_url: data.audio_url 59 | } 60 | rs.push(item); 61 | // if this is the last id, download the file 62 | if (id == ids2[ids2.length - 1]) { 63 | downloadJson(rs) 64 | } 65 | } 66 | } 67 | }); 68 | 69 | xhr.open("POST", "https://time.geekbang.org/serv/v1/article"); 70 | xhr.setRequestHeader("content-type", "application/json"); 71 | xhr.send(data); 72 | } 73 | 74 | function start() { 75 | // fire the requests one by one, 3 seconds apart, to stay under the rate limit 76 | for (var i = 0; i < ids2.length; i++) { 77 | (function(i) { 78 | setTimeout(function() { fetch(ids2[i]) }, i * 3000) 79 | })(i) 80 | } 81 | } 82 | init() 83 | --------------------------------------------------------------------------------
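The README mentions that Geek Time's API now rate-limits and that the Python downloader below would need a proxy pool or some pacing before it works again. A hedged sketch of one way to do that with plain requests -- the helper name and the proxy address are placeholders, not part of the original script:

```python
# Sketch: pace the requests made by 03_geeksplider.py and optionally route
# them through a proxy. The proxy URL below is a placeholder, not a real one.
import time
import requests

def post_paced(url, body, headers, delay=3, proxies=None):
    time.sleep(delay)  # crude throttle between consecutive article requests
    return requests.post(url, data=body, headers=headers, proxies=proxies)

# Possible usage inside Tools.get_article_detail, replacing requests.post(...):
# r = post_paced(url, json.dumps(payload), headers,
#                proxies={'https': 'http://127.0.0.1:8888'})  # placeholder proxy
```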
/03_geeksplider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2019-01-16 15:43:36 4 | # @Author : maixiaojie (tracywyj@gmail.com) 5 | # @Link : https://yk.mcust.cn 6 | # @Version : $Id$ 7 | import requests 8 | import json 9 | import codecs 10 | 11 | # **************************************************** 12 | # * Python environment: 2.7 works, version 3 not tested yet 13 | # * Dependencies: requests `pip install requests` 14 | # * A json file is produced at the end 15 | # **************************************************** 16 | # 17 | # On the Geek Time site, log in -- F12 -- Network -- open any request under this domain -- Headers -- Request Headers -- Cookie 18 | # and copy all of the cookies 19 | # https://time.geekbang.org/ 20 | cookies = "replace this with your cookies" 21 | # the column id; change it to the id of the column you want to fetch 22 | # e.g. for https://time.geekbang.org/column/154 the column id is 154 23 | id = '48' 24 | 25 | class Tools(object): 26 | def __init__(self, cookies, columnid): 27 | super(Tools, self).__init__() 28 | self.cookies = cookies 29 | self.columnid = columnid 30 | self.headers = { 31 | 'Cookie': self.cookies, 32 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36 DID:3441301122:DID', 33 | 'Referer': 'https://time.geekbang.org', 34 | 'Origin': 'https://time.geekbang.org', 35 | 'Host': 'time.geekbang.org', 36 | 'Content-Type': 'application/json' 37 | } 38 | def getall(self): 39 | headers = self.headers 40 | url = 'https://time.geekbang.org/serv/v1/my/products/all' 41 | r = response = requests.post(url, headers=headers) 42 | # print r.text 43 | # get the list of articles in the column 44 | def get_article_list(self): 45 | headers = self.headers 46 | url = 'https://time.geekbang.org/serv/v1/column/articles' 47 | payload = {'cid': self.columnid, 'order': 'earliest', 'prev': 0, 'sample': 'true', 'size': 200} 48 | r = response = requests.post(url, data=json.dumps(payload), headers=headers) 49 | res = r.json() 50 | code = res.get('code') 51 | if code >= 0: 52 | lists = res.get('data').get('list') 53 | ids_list = [] 54 | for item in lists: 55 | ids_list.append(item.get('id')) 56 | return ids_list 57 | else: 58 | return [] 59 | # get the data of a single article 60 | def get_article_detail(self, article_id): 61 | headers = self.headers 62 | url = 'https://time.geekbang.org/serv/v1/article' 63 | payload = {'id': article_id, 'include_neighbors': 'false'} 64 | r = response = requests.post(url, data=json.dumps(payload), headers=headers) 65 | res = r.json() 66 | code = res.get('code') 67 | if code >= 0: 68 | # print 'article ' + str(article_id) + ' fetched successfully' 69 | lists = res.get('data') 70 | article_dict = {} 71 | article_dict['id'] = res.get('data').get('id') 72 | article_dict['pid'] = res.get('data').get('cid') 73 | article_dict['article_title'] = res.get('data').get('article_title') 74 | article_dict['article_cover'] = res.get('data').get('article_cover') 75 | article_dict['audio_download_url'] = res.get('data').get('audio_download_url') 76 | article_dict['audio_url'] = res.get('data').get('audio_url') 77 | article_dict['audio_size'] = res.get('data').get('audio_size') 78 | article_dict['audio_time'] = res.get('data').get('audio_time') 79 | article_dict['mdhtml'] = res.get('data').get('article_content') 80 | article_dict['ctime'] = res.get('data').get('article_ctime') 81 | with codecs.open('articles'+str(self.columnid)+".json", "a+", "utf-8") as fp: 82 | fp.write(json.dumps(article_dict,indent=4, ensure_ascii=False)+',\r\n') 83 | else: 84 | pass 85 | # print 'fetch failed' 86 | def main(self): 87 | ids = self.get_article_list() 88 | if len(ids) > 0: 89 | for id in ids: 90 | self.get_article_detail(id) 91 | 92 | 93 | tool = Tools(cookies, id) 94 | tool.main() --------------------------------------------------------------------------------
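The spider below writes into two MySQL tables, wb and wb_splider, whose definitions are not part of the repo. A hedged sketch of minimal schemas, with the column names taken from the INSERT statements in the script and the types guessed:

```python
# Sketch: create tables matching the INSERTs in 05_weibosplider.py.
# Column types and lengths are guesses; adjust them to your data.
import MySQLdb

DDL_WB = """
CREATE TABLE IF NOT EXISTS wb (
  `id` VARCHAR(32) PRIMARY KEY,
  `uid` VARCHAR(32),
  `wb_id` VARCHAR(32),
  `text` TEXT,
  `source` VARCHAR(255),
  `reposts_count` INT,
  `comments_count` INT,
  `attitudes_count` INT,
  `is_retweeted` TINYINT,
  `created_at` VARCHAR(64),
  `origin_text` TEXT
) DEFAULT CHARSET=utf8mb4
"""

DDL_WB_SPLIDER = """
CREATE TABLE IF NOT EXISTS wb_splider (
  `uid` VARCHAR(32),
  `splider_status` VARCHAR(8),
  `ctime` DATETIME
) DEFAULT CHARSET=utf8mb4
"""

db = MySQLdb.connect('ip:port', 'user', 'password', 'database', charset='utf8')
cursor = db.cursor()
cursor.execute(DDL_WB)
cursor.execute(DDL_WB_SPLIDER)
db.commit()
db.close()
```

The connection placeholders mirror the ones used elsewhere in this repo; fill in your own host, user, password and database.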
/05_weibosplider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2018-11-01 09:43:36 4 | # @Author : maixiaojie (tracywyj@gmail.com) 5 | # @Link : https://maixiaojie.github.io 6 | # @Version : $Id$ 7 | 8 | import os 9 | import requests 10 | import json 11 | import re 12 | import math 13 | import MySQLdb 14 | import time 15 | import datetime 16 | import codecs 17 | import logging 18 | 19 | mylogger = logging.getLogger('mylogger') 20 | mylogger.setLevel(logging.DEBUG) 21 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(filename)s - %(levelname)s - %(message)s') 22 | fh = logging.FileHandler('logout.log') 23 | fh.setLevel(logging.DEBUG) 24 | fh.setFormatter(formatter) 25 | 26 | mylogger.addHandler(fh) 27 | 28 | dbWB = MySQLdb.connect('ip:port', 'user', 'password', 'database', charset='utf8') 29 | cursorWB = dbWB.cursor() 30 | cursorWB.execute('SET NAMES utf8mb4') 31 | class Tool: 32 | removeImg = re.compile('<img.*?>')  # strip image tags 33 | removeAddr = re.compile('<a.*?>|</a>')  # strip anchor tags 34 | replaceLine = re.compile('<tr>|<div>|</div>|<p>

') 35 | removeTag = re.compile('<.*?>') 36 | url_regex = re.compile('http://.*?/\w{7}') # http://t.cn/RdhOUUu 37 | s_regex = re.compile('u\w{3,4}') # u200b/u3000 38 | face_regex = re.compile('\[\w+\]') # 表情:[开心] 39 | #self是实例方法 cls是类方法 40 | @classmethod 41 | def replace(cls,x): 42 | x=re.sub(cls.removeImg,'',x) 43 | x=re.sub(cls.removeAddr,'',x) 44 | x=re.sub(cls.replaceLine,'',x) 45 | x=re.sub(cls.removeTag,'',x) 46 | x=re.sub(cls.url_regex,'',x) 47 | # x=re.sub(cls.s_regex,'',x) 48 | return x.strip() #去掉多余的空格 49 | # return x 50 | 51 | class Weibo(object): 52 | def get_total(self,id,page): 53 | url = 'https://m.weibo.cn/api/container/getIndex?uid={}&type=uid&value={}&containerid=107603{}&page={}'.format(id,id,id,page) 54 | response = requests.get(url) 55 | print url 56 | ob_json = json.loads(response.text) 57 | # print ob_json 58 | status = ob_json.get('ok') 59 | # print status 60 | mylogger.info(status) 61 | if status == 1: 62 | totalData = ob_json.get('data').get('cardlistInfo').get('total') 63 | total_page = int(math.ceil(totalData / 10)) + 1 64 | mylogger.info('一共' + str(total_page) + '页微博...') 65 | mylogger.info('一共' + str(totalData) + '条微博...') 66 | sql = 'insert into wb_splider(`uid`, `splider_status`, `ctime`) values(%s, %s, %s)' 67 | dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 68 | cursorWB.execute(sql, [id, '112', dt]) 69 | dbWB.commit() 70 | # print '一共' + str(total_page) + '页微博...' 71 | # print '一共' + str(totalData) + '条微博...' 72 | return total_page 73 | else: 74 | mylogger.info(id + '的数据为空') 75 | # print '数据为空' 76 | # sql = 'insert into wb_splider(`uid`, `splider_status`) values(%s, %s)' 77 | # cursorWB.execute(sql, [id, '3']) 78 | return 0 79 | 80 | def get_weibo(self,id,page): 81 | url = 'https://m.weibo.cn/api/container/getIndex?uid={}&type=uid&value={}&containerid=107603{}&page={}'.format(id,id,id,page) 82 | response = requests.get(url) 83 | print url 84 | print '正在爬取...' + str(response.url) 85 | ob_json = json.loads(response.text) 86 | list_cards = ob_json.get('data').get('cards') 87 | mylogger.info('第' + str(page) + '页数据获取成功...') 88 | # print '第' + str(page) + '页数据获取成功...' 
89 | return list_cards 90 | 91 | def write2file(self, data, filename): 92 | with codecs.open(filename, 'a+', 'utf-8') as f: 93 | f.write(data + '\r\n') 94 | 95 | def handle_cardlist(self, list_cards, uid): 96 | if list_cards != None: 97 | for card in list_cards: 98 | if card.get('card_type') == 9: 99 | wb_id = card.get('mblog').get('id') 100 | id = card.get('mblog').get('bid') 101 | source = card.get('mblog').get('source') 102 | reposts_count = card.get('mblog').get('reposts_count') 103 | comments_count = card.get('mblog').get('comments_count') 104 | attitudes_count = card.get('mblog').get('attitudes_count') 105 | text=card.get('mblog').get('text') 106 | pre_text = Tool.replace(text) 107 | retweeted_status = card.get('mblog').get('retweeted_status') 108 | created_at = card.get('mblog').get('created_at') 109 | # print pre_text 110 | if retweeted_status: 111 | is_retweeted = 1 112 | else: 113 | is_retweeted = 0 114 | sql = "insert into wb(`id`, `uid`, `wb_id`, `text`, `source`,`reposts_count`,`comments_count`,`attitudes_count`,`is_retweeted`,`created_at`, `origin_text`) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" 115 | try: 116 | self.write2file(pre_text, './userdata/'+uid+'.txt') 117 | row_count = cursorWB.execute(sql, [id, uid, wb_id, pre_text, source, reposts_count, comments_count, attitudes_count, is_retweeted, created_at, text]) 118 | except BaseException as t: 119 | print t 120 | print wb_id 121 | mylogger.error(t) 122 | finally: 123 | 124 | pass 125 | 126 | def main(self,uid): 127 | total_sum = self.get_total(uid, 1) 128 | if total_sum > 10: 129 | total_sum = 10 130 | for i in range(total_sum): 131 | list_cards = self.get_weibo(uid,i+1) 132 | self.handle_cardlist(list_cards, uid) 133 | 134 | sql = 'insert into wb_splider(`uid`, `splider_status`, `ctime`) values(%s, %s, %s)' 135 | dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 136 | if total_sum == 0: 137 | cursorWB.execute(sql, [uid, '3', dt]) 138 | mylogger.info(uid + ' total_sum = 0') 139 | else: 140 | cursorWB.execute(sql, [uid, '2', dt]) 141 | mylogger.info(uid + ' total_sum != 0') 142 | dbWB.commit() 143 | # cursorWB.close() 144 | # dbWB.close() 145 | 146 | if __name__ == '__main__': 147 | weibo=Weibo() 148 | # 6072592521 2401890571 1992968110 5195667944 149 | weibo.main('5195667944') 150 | 151 | # uid = sys.argv[1] 152 | # if uid != '': 153 | # weibo.main(uid) --------------------------------------------------------------------------------