├── 02_pdfmerger.py ├── LICENSE ├── .gitignore ├── tomysql.js ├── README.md ├── 04_juejinxiaoce.py ├── 01_downloader.py ├── geek.js ├── 03_geeksplider.py └── 05_weibosplider.py /02_pdfmerger.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 麦晓杰 lavna 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /tomysql.js: -------------------------------------------------------------------------------- 1 | const path = require('path') 2 | const fs = require('fs') 3 | const mysql = require('mysql') 4 | 5 | // database connection config 6 | const dbconfig = { 7 | host: 'your mysql host', 8 | port: 'your mysql port', 9 | user: 'your mysql username', 10 | password: 'your mysql password', 11 | database: 'your mysql database' 12 | } 13 | const pool = mysql.createPool(dbconfig); 14 | 15 | // read the json file 16 | const file = path.join(__dirname, './articles154.json') 17 | // name of the target table 18 | const tableName = 'article' 19 | fs.readFile(file, 'utf-8', function(err, data) { 20 | const json = JSON.parse(data) 21 | pool.getConnection(function(err, connection) { 22 | // sql -- adapt the logic below to your own needs 23 | var sql = `INSERT INTO article (article_title, audio_time, audio_size, pid, audio_url, audio_download_url, mdhtml, ctime, id, article_cover) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`; 24 | for (let i = 0; i < json.length; i++) { 25 | console.log(json[i].article_title) 26 | var values = [ 27 | json[i].article_title, 28 | json[i].audio_time, 29 | json[i].audio_size, 30 | json[i].pid, 31 | json[i].audio_url, 32 | json[i].audio_download_url, 33 | json[i].article_content, 34 | json[i].ctime, 35 | json[i].id, 36 | json[i].article_cover 37 | ] 38 | connection.query(sql, values, function(error, results, fields) { 39 | // When done with the connection, release it. 40 | // Handle error after the release. 41 | if (error) throw error; 42 | // Don't use the connection here, it has been returned to the pool. 43 | }); 44 | } 45 | pool.end(); 46 | }) 47 | }) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # json2mysql 5 | 6 | This update adds a script that imports the data of a json file into MySQL. It is written in Node.js. 7 | 8 | The corresponding file is tomysql.js -- feel free to dig into it if you are interested. 9 | 10 | 11 | --- 12 | 13 | # pythonCollection (a collection of Python scripts) 14 | 15 | **This project has only been tested under Python 2.7; other versions are not maintained -- adapt as needed** 16 | 17 | - File downloader 18 | - PDF merger 19 | - Geek Time column downloader 20 | - Juejin booklet downloader 21 | - Sina Weibo spider 22 | 23 | ## File downloader (downloader.py) 24 | 25 | >Reads file URLs from a database or from a file and downloads them locally 26 | 27 | Dependencies: 28 | 29 | - requests 30 | - progressbar 31 | - MySQLdb (not needed if you don't read the URLs from a database) 32 | 33 | 34 | ## PDF merger (pdfmerger.py) 35 | 36 | >Merges PDF files 37 | 38 | The code is still being cleaned up.. (a rough sketch of the idea is included below)
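Until 02_pdfmerger.py is filled in, here is a minimal sketch of the intended behaviour. It assumes the PyPDF2 package (`pip install PyPDF2`), which is not currently listed as a dependency of this repo, and the paths are examples only:

```python
# -*- coding: utf-8 -*-
# Minimal PDF merge sketch (assumes PyPDF2; paths below are examples only).
import glob
from PyPDF2 import PdfFileMerger

def merge_pdfs(src_dir, out_path):
    merger = PdfFileMerger()
    for pdf in sorted(glob.glob(src_dir + '/*.pdf')):
        print('appending ' + pdf)
        merger.append(pdf)      # add each source document in name order
    with open(out_path, 'wb') as fp:
        merger.write(fp)        # write the combined document
    merger.close()

if __name__ == '__main__':
    merge_pdfs('./pdfs', './merged.pdf')
```

Sorting the glob keeps the page order deterministic; adjust it if you need a custom order.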
39 | 40 | ## Geek Time column downloader (geeksplider.py) 41 | 42 | >Downloads a Geek Time column you have purchased as a json file 43 | 44 | **Update 2019-5-30** 45 | 46 | The Geek Time site recently added rate limiting and some timestamp checks on its requests. 47 | 48 | The corresponding Python file has not been updated for this; with a proxy pool and some pacing it should work again (a small sketch sits just before 03_geeksplider.py below). 49 | 50 | This time a new trick is used instead: run a JS script in the browser console, send the requests from there, and save the result as a json file. 51 | 52 | The corresponding JS file is geek.js -- take a look if you are interested, I find it quite a fun approach. 53 | 54 | --end 55 | --- 56 | 57 | Dependencies: 58 | 59 | - requests 60 | 61 | The output format is: 62 | ``` 63 | { 64 | "article_title": "", 65 | "audio_time": "", 66 | "ctime": 1521993600, 67 | "audio_size": 4375851, 68 | "pid": 76, 69 | "audio_url": "", 70 | "mdhtml": "", 71 | "audio_download_url": "", 72 | "id": 4969, 73 | "article_cover": "" 74 | } 75 | ``` 76 | 77 | Feel free to adapt the code to your own needs. 78 | 79 | ## Juejin booklet downloader (juejinxiaoce.py) 80 | 81 | >Downloads a Juejin booklet you have purchased as json 82 | 83 | Dependencies: 84 | 85 | - requests 86 | 87 | The output format is: 88 | 89 | ``` 90 | { 91 | "article_title": "", 92 | "pid": "", 93 | "mdhtml": "", 94 | "mdtext": "", 95 | "id": "", 96 | "createdAt": "" 97 | } 98 | ``` 99 | 100 | ## Sina Weibo spider (weibosplider.py) 101 | 102 | >Crawls the Sina Weibo posts of a given user 103 | 104 | Dependencies: 105 | 106 | - requests 107 | - MySQLdb (not needed if you don't save to a database) 108 | - logging 109 | 110 | This script stores the data directly in a database (a sketch of the table layout sits just before 05_weibosplider.py below); you can change part of the logic to store it somewhere else. 111 | 112 | **Use flexibly -- the code is for reference only** 113 | 114 | # issue 115 | 116 | If you run into problems, open an issue on this project: [issues](https://github.com/maixiaojie/pythonCollection/issues) 117 | 118 | 119 | # PR 120 | 121 | If you also have a handy little tool to share, please open a new pull request 122 | 123 | 124 | -------------------------------------------------------------------------------- /04_juejinxiaoce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2019-01-16 15:43:36 4 | # @Author : maixiaojie (tracywyj@gmail.com) 5 | # @Link : https://yk.mcust.cn 6 | # @Version : $Id$ 7 | import requests 8 | import json 9 | import codecs 10 | 11 | cookies = "" 12 | bookid = "" 13 | token = "" 14 | uid = "" 15 | client_id = "" 16 | 17 | class Tools(object): 18 | def __init__(self, cookies, bookid, token, uid, client_id): 19 | super(Tools, self).__init__() 20 | self.cookies = cookies 21 | self.token = token 22 | self.bookid = bookid 23 | self.uid = uid 24 | self.client_id = client_id 25 | self.headers = { 26 | 'Cookie': self.cookies, 27 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36 DID:3441301122:DID', 28 | 'Referer': 'https://juejin.im', 29 | 'Origin': 'https://juejin.im', 30 | 'Host': 'xiaoce-cache-api-ms.juejin.im', 31 | 'Content-Type': 'application/json' 32 | } 33 | def getall(self): 34 | headers = self.headers 35 | url = "https://xiaoce-cache-api-ms.juejin.im/v1/get" 36 | payload = {'uid': '', 'client_id': '1548407371349', 'token': self.token, 'src': 'web', 'id': self.bookid} 37 | r = requests.get(url, params=payload, headers=headers) 38 | res = r.json() 39 | code = res.get('m') 40 | if(code == 'ok'): 41 | section = res.get('d').get('section') 42 | return section 43 | else: 44 | return [] 45 | 46 | # fetch the data of a single section 47 | def get_article_detail(self, sectionid): 48 | headers = self.headers 49 | url = 'https://xiaoce-cache-api-ms.juejin.im/v1/getSection' 50 | payload = {'uid': self.uid, 'client_id': self.client_id, 'token': self.token, 'src': 'web', 'sectionId': sectionid} 51 | r = response = requests.get(url, params=payload, headers=headers) 52 | res = r.json() 53 | print res 54 | print r.url 55 | code = res.get('m') 56 | if(code == 'ok'): 57 | data = res.get('d') 58 | section_dict = {} 59 | section_dict['id'] = data.get('id') 60 | section_dict['pid'] = data.get('metaId') 61 | 
section_dict['article_title'] = data.get('title') 62 | section_dict['mdhtml'] = data.get('html') 63 | section_dict['mdtext'] = data.get('content') 64 | section_dict['createdAt'] = data.get('createdAt') 65 | with codecs.open('book-'+str(self.bookid)+".json", "a+", "utf-8") as fp: 66 | fp.write(json.dumps(section_dict,indent=4, ensure_ascii=False)+',\r\n') 67 | 68 | def main(self): 69 | ids = self.getall() 70 | if len(ids) > 0: 71 | for id in ids: 72 | self.get_article_detail(id) 73 | 74 | tool = Tools(cookies, bookid, token, uid, client_id) 75 | tool.main() 76 | # tool.get_article_detail('5bdd0d83f265da615f76ba57') 77 | 78 | -------------------------------------------------------------------------------- /01_downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2019-02-19 15:39:03 (Lantern Festival) 4 | # @Author : maixiaojie (tracywyj@gmail.com) 5 | # @Link : https://maixiaojie.github.io 6 | # @Version : 1.0.0 7 | # 8 | ############ File downloader, python2.7 ######################## 9 | # 10 | # This script has only been tested under python2.7 (used to download 8000+ videos); adapt it yourself for other versions 11 | # Other Python versions are not maintained 12 | # 13 | # Requires the requests, progressbar and MySQLdb libraries 14 | # 15 | # `pip install requests` 16 | # `pip install progressbar` 17 | # MySQLdb -- not needed if you don't read the URLs from a database 18 | 19 | import requests 20 | import urllib 21 | import os 22 | import time 23 | import progressbar 24 | import MySQLdb 25 | 26 | # derive the file name from the url; adjust the expression below to your own needs 27 | def getName(path): 28 | return path.split('/')[-1] 29 | 30 | # download one file 31 | def download(url, dirpath): 32 | # set whatever request headers your files need; adjust as needed 33 | headers = { 34 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36", 35 | "Referer": "http://www.maiziedu.com/" 36 | } 37 | filename = getName(url) 38 | path = dirpath + '/' + filename 39 | r = requests.get(url, headers=headers, stream=True) 40 | chunk_size = 1024 41 | content_size = int(r.headers['content-length']) 42 | if r.status_code == 200: 43 | print '[File Total Size]: %0.2f Mb' % (content_size / 1024.0 / 1024) 44 | print '[File Name]: ' + filename 45 | with open(path, "wb") as file: 46 | widgets = ['Progress: ', progressbar.Percentage(), ' ',progressbar.Bar(marker='#', left='[', right=']'),' ', progressbar.ETA(), ' ', progressbar.FileTransferSpeed()] 47 | pbar = progressbar.ProgressBar(widgets=widgets, maxval=content_size).start() 48 | for data in r.iter_content(chunk_size=chunk_size): 49 | if data: 50 | file.write(data) 51 | file.flush() 52 | pbar.update(pbar.currval + len(data)) 53 | pbar.finish() 54 | 55 | def main(): 56 | # all downloaded files go into this folder; any valid name will do 57 | # the folder is created first if it does not exist yet 58 | path = "./video" 59 | isExists = os.path.exists(path) 60 | if not isExists: 61 | os.makedirs(path) 62 | 63 | # here the file urls are read from a database 64 | # you can also read them from a plain text file instead (see the sketch after this script) 65 | db = MySQLdb.connect('ip:port', 'user', 'password', 'database', charset='utf8') 66 | cursor = db.cursor() 67 | cursor.execute("SELECT video_url from lesson_detail limit 100 offset 0 ") 68 | rs = cursor.fetchall() 69 | cursor.close() 70 | db.close() 71 | 72 | # iterate over the urls; skip any file that already exists locally 73 | for i, url in enumerate(rs): 74 | fileurl = ''.join(url) 75 | filename = getName(fileurl) 76 | filepath = path+'/'+filename 77 | isFileExists = os.path.exists(filepath) 78 | if isFileExists: 79 | print '[msg]: ' + filename + ' has been downloaded...' 80 | else: 81 | print 'downloading [' + str(i) + '] ' + fileurl 82 | download(fileurl, path) 83 | 84 | 85 | main() --------------------------------------------------------------------------------
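The script above pulls its URLs from MySQL; as the comment in main() notes, a plain text file works just as well. A small hedged sketch, assuming a file called urls.txt with one URL per line (the file name is only an example):

```python
# Sketch: feed download() from a text file instead of MySQL.
# Assumes ./urls.txt with one URL per line (the name is arbitrary).
def read_urls(listfile):
    urls = []
    with open(listfile) as fp:
        for line in fp:
            line = line.strip()
            if line and not line.startswith('#'):  # skip blank lines and comments
                urls.append(line)
    return urls

# Inside main(), the MySQLdb block could then be replaced with:
# rs = [(u,) for u in read_urls('./urls.txt')]
# which keeps the rest of the loop (''.join(url)) unchanged.
```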
/geek.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Runs in the browser console 3 | * @type {Array} 4 | */ 5 | 6 | // the article ids to fetch; the Geek Time site rate-limits, so the ids are split into two batches. You could also use proxies etc. 7 | var ids = [77345, 77749, 77804, 78158, 78168, 78884, 79319, 79539, 80011, 80021, 80042, 80240, 80260, 80311, 81730, 82397, 82711, 82764, 83302, 83719, 83860, 84365, 84633, 85031, 85341, 85745, 86117]; 8 | var ids2 = [86400, 86823, 87179, 87234, 87808, 88275, 88538, 88827, 89151, 89491, 89832, 90148, 90485, 90998, 91325, 91644, 92227, 92663, 93110, 93216, 93289, 93777, 94156, 94644, 94979, 95469, 95833, 96269, 96809, 97144]; 9 | // the data collected at the end 10 | var rs = [] 11 | /** 12 | * init: inject FileSaver.js into the document and kick off the requests 13 | * @return {[type]} [description] 14 | */ 15 | function init() { 16 | var src = 'https://cdn.bootcss.com/FileSaver.js/2014-11-29/FileSaver.js'; 17 | var script = document.createElement('script'); 18 | script.src = src; 19 | var heads = document.getElementsByTagName("head"); 20 | if (heads.length) 21 | heads[0].appendChild(script); 22 | else 23 | document.documentElement.appendChild(script); 24 | script.onload = function() { 25 | console.log('script loaded') 26 | start() 27 | } 28 | } 29 | // save the collected data as a json file 30 | function downloadJson(data) { 31 | var blob = new Blob([JSON.stringify(data)], { type: "" }); 32 | saveAs(blob, "data.json"); 33 | } 34 | 35 | // fetch one article -- just an ajax request 36 | function fetch(id) { 37 | var data = JSON.stringify({ 38 | "include_neighbors": "false", 39 | "id": id 40 | }); 41 | var xhr = new XMLHttpRequest(); 42 | xhr.withCredentials = true; 43 | xhr.addEventListener("readystatechange", function() { 44 | if (this.readyState === 4) { 45 | var res = JSON.parse(this.responseText); 46 | if (res.code == 0) { 47 | var data = res.data; 48 | var item = { 49 | id: data.id, 50 | pid: data.cid, 51 | article_content: data.article_content, 52 | article_cover: data.article_cover, 53 | article_ctime: data.article_ctime, 54 | article_title: data.article_title, 55 | audio_download_url: data.audio_download_url, 56 | audio_size: data.audio_size, 57 | audio_time: data.audio_time, 58 | audio_url: data.audio_url 59 | } 60 | rs.push(item); 61 | // if this is the last id, download the file 62 | if (id == ids2[ids2.length - 1]) { 63 | downloadJson(rs) 64 | } 65 | } 66 | } 67 | }); 68 | 69 | xhr.open("POST", "https://time.geekbang.org/serv/v1/article"); 70 | xhr.setRequestHeader("content-type", "application/json"); 71 | xhr.send(data); 72 | } 73 | 74 | function start() { 75 | // fire the requests one by one, 3 seconds apart, to stay under the rate limit 76 | for (var i = 0; i < ids2.length; i++) { 77 | (function(i) { 78 | setTimeout(function() { fetch(ids2[i]) }, i * 3000) 79 | })(i) 80 | } 81 | } 82 | init() 83 | --------------------------------------------------------------------------------
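The README mentions that Geek Time's API now rate-limits and that the Python downloader below would need a proxy pool or some pacing before it works again. A hedged sketch of one way to do that with plain requests -- the helper name and the proxy address are placeholders, not part of the original script:

```python
# Sketch: pace the requests made by 03_geeksplider.py and optionally route
# them through a proxy. The proxy URL below is a placeholder, not a real one.
import time
import requests

def post_paced(url, body, headers, delay=3, proxies=None):
    time.sleep(delay)  # crude throttle between consecutive article requests
    return requests.post(url, data=body, headers=headers, proxies=proxies)

# Possible usage inside Tools.get_article_detail, replacing requests.post(...):
# r = post_paced(url, json.dumps(payload), headers,
#                proxies={'https': 'http://127.0.0.1:8888'})  # placeholder proxy
```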
/03_geeksplider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2019-01-16 15:43:36 4 | # @Author : maixiaojie (tracywyj@gmail.com) 5 | # @Link : https://yk.mcust.cn 6 | # @Version : $Id$ 7 | import requests 8 | import json 9 | import codecs 10 | 11 | # **************************************************** 12 | # * Python environment: 2.7 works, version 3 not tested yet 13 | # * Dependencies: requests `pip install requests` 14 | # * A json file is produced at the end 15 | # **************************************************** 16 | # 17 | # On the Geek Time site, log in -- F12 -- Network -- open any request under this domain -- Headers -- Request Headers -- Cookie 18 | # and copy all of the cookies 19 | # https://time.geekbang.org/ 20 | cookies = "replace this with your cookies" 21 | # the column id; change it to the id of the column you want to fetch 22 | # e.g. for https://time.geekbang.org/column/154 the column id is 154 23 | id = '48' 24 | 25 | class Tools(object): 26 | def __init__(self, cookies, columnid): 27 | super(Tools, self).__init__() 28 | self.cookies = cookies 29 | self.columnid = columnid 30 | self.headers = { 31 | 'Cookie': self.cookies, 32 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36 DID:3441301122:DID', 33 | 'Referer': 'https://time.geekbang.org', 34 | 'Origin': 'https://time.geekbang.org', 35 | 'Host': 'time.geekbang.org', 36 | 'Content-Type': 'application/json' 37 | } 38 | def getall(self): 39 | headers = self.headers 40 | url = 'https://time.geekbang.org/serv/v1/my/products/all' 41 | r = response = requests.post(url, headers=headers) 42 | # print r.text 43 | # get the list of articles in the column 44 | def get_article_list(self): 45 | headers = self.headers 46 | url = 'https://time.geekbang.org/serv/v1/column/articles' 47 | payload = {'cid': self.columnid, 'order': 'earliest', 'prev': 0, 'sample': 'true', 'size': 200} 48 | r = response = requests.post(url, data=json.dumps(payload), headers=headers) 49 | res = r.json() 50 | code = res.get('code') 51 | if code >= 0: 52 | lists = res.get('data').get('list') 53 | ids_list = [] 54 | for item in lists: 55 | ids_list.append(item.get('id')) 56 | return ids_list 57 | else: 58 | return [] 59 | # get the data of a single article 60 | def get_article_detail(self, article_id): 61 | headers = self.headers 62 | url = 'https://time.geekbang.org/serv/v1/article' 63 | payload = {'id': article_id, 'include_neighbors': 'false'} 64 | r = response = requests.post(url, data=json.dumps(payload), headers=headers) 65 | res = r.json() 66 | code = res.get('code') 67 | if code >= 0: 68 | # print 'article ' + str(article_id) + ' fetched successfully' 69 | lists = res.get('data') 70 | article_dict = {} 71 | article_dict['id'] = res.get('data').get('id') 72 | article_dict['pid'] = res.get('data').get('cid') 73 | article_dict['article_title'] = res.get('data').get('article_title') 74 | article_dict['article_cover'] = res.get('data').get('article_cover') 75 | article_dict['audio_download_url'] = res.get('data').get('audio_download_url') 76 | article_dict['audio_url'] = res.get('data').get('audio_url') 77 | article_dict['audio_size'] = res.get('data').get('audio_size') 78 | article_dict['audio_time'] = res.get('data').get('audio_time') 79 | article_dict['mdhtml'] = res.get('data').get('article_content') 80 | article_dict['ctime'] = res.get('data').get('article_ctime') 81 | with codecs.open('articles'+str(self.columnid)+".json", "a+", "utf-8") as fp: 82 | fp.write(json.dumps(article_dict,indent=4, ensure_ascii=False)+',\r\n') 83 | else: 84 | pass 85 | # print 'fetch failed' 86 | def main(self): 87 | ids = self.get_article_list() 88 | if len(ids) > 0: 89 | for id in ids: 90 | self.get_article_detail(id) 91 | 92 | 93 | tool = Tools(cookies, id) 94 | tool.main() --------------------------------------------------------------------------------
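The spider below writes into two MySQL tables, wb and wb_splider, whose definitions are not part of the repo. A hedged sketch of minimal schemas, with the column names taken from the INSERT statements in the script and the types guessed:

```python
# Sketch: create tables matching the INSERTs in 05_weibosplider.py.
# Column types and lengths are guesses; adjust them to your data.
import MySQLdb

DDL_WB = """
CREATE TABLE IF NOT EXISTS wb (
  `id` VARCHAR(32) PRIMARY KEY,
  `uid` VARCHAR(32),
  `wb_id` VARCHAR(32),
  `text` TEXT,
  `source` VARCHAR(255),
  `reposts_count` INT,
  `comments_count` INT,
  `attitudes_count` INT,
  `is_retweeted` TINYINT,
  `created_at` VARCHAR(64),
  `origin_text` TEXT
) DEFAULT CHARSET=utf8mb4
"""

DDL_WB_SPLIDER = """
CREATE TABLE IF NOT EXISTS wb_splider (
  `uid` VARCHAR(32),
  `splider_status` VARCHAR(8),
  `ctime` DATETIME
) DEFAULT CHARSET=utf8mb4
"""

db = MySQLdb.connect('ip:port', 'user', 'password', 'database', charset='utf8')
cursor = db.cursor()
cursor.execute(DDL_WB)
cursor.execute(DDL_WB_SPLIDER)
db.commit()
db.close()
```

The connection placeholders mirror the ones used elsewhere in this repo; fill in your own host, user, password and database.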
/05_weibosplider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2018-11-01 09:43:36 4 | # @Author : maixiaojie (tracywyj@gmail.com) 5 | # @Link : https://maixiaojie.github.io 6 | # @Version : $Id$ 7 | 8 | import os 9 | import requests 10 | import json 11 | import re 12 | import math 13 | import MySQLdb 14 | import time 15 | import datetime 16 | import codecs 17 | import logging 18 | 19 | mylogger = logging.getLogger('mylogger') 20 | mylogger.setLevel(logging.DEBUG) 21 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(filename)s - %(levelname)s - %(message)s') 22 | fh = logging.FileHandler('logout.log') 23 | fh.setLevel(logging.DEBUG) 24 | fh.setFormatter(formatter) 25 | 26 | mylogger.addHandler(fh) 27 | 28 | dbWB = MySQLdb.connect('ip:port', 'user', 'password', 'database', charset='utf8') 29 | cursorWB = dbWB.cursor() 30 | cursorWB.execute('SET NAMES utf8mb4') 31 | class Tool: 32 | removeImg = re.compile('<img.*?>')  # strip image tags 33 | removeAddr = re.compile('<a.*?>|</a>')  # strip anchor tags 34 | replaceLine = re.compile('<tr>|<div>|</div>|<p>

') 35 | removeTag = re.compile('<.*?>') 36 | url_regex = re.compile('http://.*?/\w{7}') # http://t.cn/RdhOUUu 37 | s_regex = re.compile('u\w{3,4}') # u200b/u3000 38 | face_regex = re.compile('\[\w+\]') # 表情:[开心] 39 | #self是实例方法 cls是类方法 40 | @classmethod 41 | def replace(cls,x): 42 | x=re.sub(cls.removeImg,'',x) 43 | x=re.sub(cls.removeAddr,'',x) 44 | x=re.sub(cls.replaceLine,'',x) 45 | x=re.sub(cls.removeTag,'',x) 46 | x=re.sub(cls.url_regex,'',x) 47 | # x=re.sub(cls.s_regex,'',x) 48 | return x.strip() #去掉多余的空格 49 | # return x 50 | 51 | class Weibo(object): 52 | def get_total(self,id,page): 53 | url = 'https://m.weibo.cn/api/container/getIndex?uid={}&type=uid&value={}&containerid=107603{}&page={}'.format(id,id,id,page) 54 | response = requests.get(url) 55 | print url 56 | ob_json = json.loads(response.text) 57 | # print ob_json 58 | status = ob_json.get('ok') 59 | # print status 60 | mylogger.info(status) 61 | if status == 1: 62 | totalData = ob_json.get('data').get('cardlistInfo').get('total') 63 | total_page = int(math.ceil(totalData / 10)) + 1 64 | mylogger.info('一共' + str(total_page) + '页微博...') 65 | mylogger.info('一共' + str(totalData) + '条微博...') 66 | sql = 'insert into wb_splider(`uid`, `splider_status`, `ctime`) values(%s, %s, %s)' 67 | dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 68 | cursorWB.execute(sql, [id, '112', dt]) 69 | dbWB.commit() 70 | # print '一共' + str(total_page) + '页微博...' 71 | # print '一共' + str(totalData) + '条微博...' 72 | return total_page 73 | else: 74 | mylogger.info(id + '的数据为空') 75 | # print '数据为空' 76 | # sql = 'insert into wb_splider(`uid`, `splider_status`) values(%s, %s)' 77 | # cursorWB.execute(sql, [id, '3']) 78 | return 0 79 | 80 | def get_weibo(self,id,page): 81 | url = 'https://m.weibo.cn/api/container/getIndex?uid={}&type=uid&value={}&containerid=107603{}&page={}'.format(id,id,id,page) 82 | response = requests.get(url) 83 | print url 84 | print '正在爬取...' + str(response.url) 85 | ob_json = json.loads(response.text) 86 | list_cards = ob_json.get('data').get('cards') 87 | mylogger.info('第' + str(page) + '页数据获取成功...') 88 | # print '第' + str(page) + '页数据获取成功...' 
89 | return list_cards 90 | 91 | def write2file(self, data, filename): 92 | with codecs.open(filename, 'a+', 'utf-8') as f: 93 | f.write(data + '\r\n') 94 | 95 | def handle_cardlist(self, list_cards, uid): 96 | if list_cards != None: 97 | for card in list_cards: 98 | if card.get('card_type') == 9: 99 | wb_id = card.get('mblog').get('id') 100 | id = card.get('mblog').get('bid') 101 | source = card.get('mblog').get('source') 102 | reposts_count = card.get('mblog').get('reposts_count') 103 | comments_count = card.get('mblog').get('comments_count') 104 | attitudes_count = card.get('mblog').get('attitudes_count') 105 | text=card.get('mblog').get('text') 106 | pre_text = Tool.replace(text) 107 | retweeted_status = card.get('mblog').get('retweeted_status') 108 | created_at = card.get('mblog').get('created_at') 109 | # print pre_text 110 | if retweeted_status: 111 | is_retweeted = 1 112 | else: 113 | is_retweeted = 0 114 | sql = "insert into wb(`id`, `uid`, `wb_id`, `text`, `source`,`reposts_count`,`comments_count`,`attitudes_count`,`is_retweeted`,`created_at`, `origin_text`) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" 115 | try: 116 | self.write2file(pre_text, './userdata/'+uid+'.txt') 117 | row_count = cursorWB.execute(sql, [id, uid, wb_id, pre_text, source, reposts_count, comments_count, attitudes_count, is_retweeted, created_at, text]) 118 | except BaseException as t: 119 | print t 120 | print wb_id 121 | mylogger.error(t) 122 | finally: 123 | 124 | pass 125 | 126 | def main(self,uid): 127 | total_sum = self.get_total(uid, 1) 128 | if total_sum > 10: 129 | total_sum = 10 130 | for i in range(total_sum): 131 | list_cards = self.get_weibo(uid,i+1) 132 | self.handle_cardlist(list_cards, uid) 133 | 134 | sql = 'insert into wb_splider(`uid`, `splider_status`, `ctime`) values(%s, %s, %s)' 135 | dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 136 | if total_sum == 0: 137 | cursorWB.execute(sql, [uid, '3', dt]) 138 | mylogger.info(uid + ' total_sum = 0') 139 | else: 140 | cursorWB.execute(sql, [uid, '2', dt]) 141 | mylogger.info(uid + ' total_sum != 0') 142 | dbWB.commit() 143 | # cursorWB.close() 144 | # dbWB.close() 145 | 146 | if __name__ == '__main__': 147 | weibo=Weibo() 148 | # 6072592521 2401890571 1992968110 5195667944 149 | weibo.main('5195667944') 150 | 151 | # uid = sys.argv[1] 152 | # if uid != '': 153 | # weibo.main(uid) --------------------------------------------------------------------------------