# -*- coding: utf-8 -*-
# Multi-file listing of the "taobaosale" project. Each section below is one
# file of the repository; sections are delimited by the dashed banners.
#
# ├── __init__.py
# ├── README.md
# ├── insertdate.py
# ├── sql.py
# └── dataSource.py

# --------------------------------------------------------------------------------
# /__init__.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 7/10 0010 18:09
# @Author  : liya
# @Site    :
# @File    : __init__.py.py


# from dataSource import getData

# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
# # taobaosale
# Taobao coupon crawler
#
# Crawler + MySQL
#
# ## Contents of the Config file
#
# DBHOST = "ip"
#
# DBPORT = 3306
#
# DBUSER = "user"
#
# DBPWD = "pwd"
#
# DBNAME = "dbname"
#
# DBCHAR = "utf8"

# --------------------------------------------------------------------------------
# /insertdate.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 7/5 0005 17:13
# @Author  : liya
# @Site    :
# @File    : insertdate.py
import threading

from _mysql_exceptions import IntegrityError

import dataSource
from sql import Mysql
import time

# Category ids sent to the data source as the `cate` request parameter.
CATEGORY_IDS = ['7', '2', '1', '9', '8', '5', '10', '4', '6', '3', '99']

# Delay between two complete crawls, in seconds (12 hours).
RERUN_INTERVAL_SECONDS = 43200

# The INSERT statement is derived from dataSource.FIELDS so that the column
# order can never drift from the order of the values in each row.
_COLUMN_SQL = ",".join("`%s`" % name for name in dataSource.FIELDS)
_PLACEHOLDER_SQL = ",".join(["%s"] * len(dataSource.FIELDS))
INSERT_SQL = ("insert into taobaoSale(" + _COLUMN_SQL + ") values("
              + _PLACEHOLDER_SQL + ")")


def insert():
    """Crawl every category page by page and store new rows in taobaoSale.

    For each id in CATEGORY_IDS, pages are fetched with dataSource.getData
    starting at page 1 until it returns the ['none'] end marker. Each page
    of rows is written with one executemany call and committed immediately.
    A page that raises IntegrityError is reported and skipped.

    When the run ends -- normally or through an exception -- the connection
    is disposed and the next run is scheduled RERUN_INTERVAL_SECONDS later
    through the module-level `timer`.

    NOTE(review): getData returns an empty list both when a page fails to
    parse and when every item on it is already stored, so a source that
    keeps answering with unparsable pages is never recognised as finished.
    """
    global timer

    mysql = Mysql()
    try:
        for cateid in CATEGORY_IDS:
            print(cateid)

            page = 1
            while True:
                print('------当前进度--------:第%s页' % page)
                param = dataSource.getData(page, cateid, mysql)
                if param == ['none']:
                    print('--------当前%s页为终结页' % page)
                    break
                try:
                    # Nothing to write when the page produced no new rows.
                    if param:
                        mysql.insertMany(INSERT_SQL, param)
                        mysql.end()
                except IntegrityError:
                    # Report the page that actually failed.
                    print('--------warn当前%s页出现异常' % page)
                finally:
                    # Pause one second between pages.
                    time.sleep(1)
                page += 1
    finally:
        try:
            mysql.dispose()
        finally:
            # Keep the periodic schedule alive even if this run failed.
            timer = threading.Timer(RERUN_INTERVAL_SECONDS, insert)
            timer.start()


if __name__ == '__main__':
    # insert() schedules its own next run, so it must be started exactly
    # once; starting it from a Timer as well would create two concurrent
    # crawl chains.
    insert()

# --------------------------------------------------------------------------------
# /sql.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 7/7 0007 10:45
# @Author  : liya
# @Site    :
# @File    : sql.py

import MySQLdb
from MySQLdb.cursors import DictCursor
from DBUtils.PooledDB import PooledDB
# from PooledDB import PooledDB
import Config

# Config is the module holding the database connection settings.


class Mysql(object):
    """One pooled MySQL connection together with a cursor on it.

    Creating an instance takes a connection from the shared PooledDB pool
    and opens a cursor; call dispose() to end the transaction and close
    both.
    """

    # Connection pool shared by all instances; built on first use.
    __pool = None

    def __init__(self):
        # Take a connection from the pool and create the working cursor.
        self._conn = Mysql.__getConn()
        self._cursor = self._conn.cursor()

    @staticmethod
    def __getConn():
        """Return a connection from the shared pool, creating the pool once.

        The pool is stored on the class; assigning it to a local name
        would leave the class attribute None and build a fresh pool for
        every instance.
        """
        if Mysql.__pool is None:
            Mysql.__pool = PooledDB(
                creator=MySQLdb, mincached=1, maxcached=20,
                host=Config.DBHOST, port=Config.DBPORT,
                user=Config.DBUSER, passwd=Config.DBPWD,
                db=Config.DBNAME, use_unicode=False,
                charset=Config.DBCHAR, cursorclass=DictCursor)
        return Mysql.__pool.connection()

    def getOne(self, sql, param=None):
        """Run a query and fetch its first row.

        @param sql: query text; put conditions in as placeholders and pass
                    their values through `param`
        @param param: optional tuple/list of placeholder values
        @return: the first row, or False when the query matched nothing
        """
        if param is None:
            count = self._cursor.execute(sql)
        else:
            count = self._cursor.execute(sql, param)
        if count > 0:
            return self._cursor.fetchone()
        return False

    def insertOne(self, sql, value):
        """Insert a single record.

        @param sql: INSERT statement with placeholders
        @param value: tuple/list holding the record's values
        @return: the value returned by cursor.execute (affected row count)
        """
        return self._cursor.execute(sql, value)

    def insertMany(self, sql, values):
        """Insert several records with one executemany call.

        @param sql: INSERT statement with placeholders
        @param values: sequence of tuples/lists, one per record
        @return: the value returned by cursor.executemany
        """
        return self._cursor.executemany(sql, values)

    def begin(self):
        """Start a transaction by switching autocommit off."""
        self._conn.autocommit(0)

    def end(self, option='commit'):
        """Finish the current transaction.

        @param option: 'commit' commits; any other value rolls back
        """
        if option == 'commit':
            self._conn.commit()
        else:
            self._conn.rollback()

    def dispose(self, isEnd=1):
        """Finish the transaction, then close the cursor and the connection.

        @param isEnd: 1 commits before closing; any other value rolls back
        """
        if isEnd == 1:
            self.end('commit')
        else:
            self.end('rollback')
        self._cursor.close()
        self._conn.close()

# --------------------------------------------------------------------------------
# /dataSource.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 7/10 0010 17:55
# @Author  : liya
# @Site    :
# @File    : dataSource.py
import re
import sys
import json
import requests

# Item keys copied into each row, in the column order of the taobaoSale
# insert statement.
FIELDS = (
    'itemId', 'link', 'title', 'subtitle', 'intro', 'imagePath',
    'staticImgPath', 'imagePaths', 'sellPrice', 'price', 'itemUrl',
    'descUrl', 'planUrl', 'ulandUrl', 'historySales', 'viewCount',
    'sellerId', 'sellerType', 'sellerName', 'flagShip', 'certIcon',
    'cpId', 'cpPrice', 'cpSpare', 'cpCount', 'cpTotal', 'cpCondition',
    'cpLimit', 'cpStarts', 'cpExpired', 'cpLevel', 'cpUrl', 'gold', 'ju',
    'qiang', 'freeExpress', 'freeExpressBack', 'commission', 'cgold',
    'goodRatePercentage', 'dx', 'is_brand',
)

# Seconds to wait for the HTTP response before giving up.
REQUEST_TIMEOUT = 30

# Removes a comma that directly precedes a closing bracket so that the
# response can be parsed by the strict json module.
_TRAILING_COMMA = re.compile(r",\s*?]")

_EXISTS_SQL = 'SELECT id FROM taobaoSale WHERE itemId = %s'


def _encodeField(value):
    """Return `value` converted with str() and encoded as UTF-8."""
    return str(value).encode("utf-8")


def getData(index, cate, mysql):
    """Fetch one result page and turn its new items into insertable rows.

    @param index: page number to request
    @param cate: category id to request
    @param mysql: object whose getOne(sql, param) is used to check whether
                  an item is already stored
    @return: ['none'] when the page's 'list' value is empty (end of data);
             an empty list when the response cannot be parsed; otherwise a
             list of rows, each a list of the FIELDS values as UTF-8
             strings, for the items that are not stored yet
    @raise: exceptions from requests (including a timeout) and KeyError
            for an item lacking one of FIELDS are not caught here
    """
    # Python 2 only: make UTF-8 the default codec so that str() of a
    # non-ASCII unicode value does not raise.
    reload(sys)
    sys.setdefaultencoding('utf8')

    payload = {'page': str(index), 'submit': '1', 'nav': 'tm',
               'cate': str(cate), 'sort': 'couponValue', 'starttime': '0',
               'inajax': '1'}
    url = 'http://fmgrgg.agent.yqjuejin.com'
    # NOTE(review): the session id is hard-coded here and in the Cookie
    # header below.
    cookies = dict(sid='3iu0c8gq0lk52dph0ne6rfu2h2')
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'sid=3iu0c8gq0lk52dph0ne6rfu2h2',
        'Host': 'fmgrgg.agent.yqjuejin.com',
        'Pragma': 'no-cache',
        'Referer': 'http://fmgrgg.agent.yqjuejin.com/?page=1&submit=1&nav=tm&cate=0&sort=zh&starttime=0&end=pc',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    # A timeout keeps a stalled server from blocking the crawl forever.
    html = requests.get(url, params=payload, cookies=cookies,
                        headers=headers, timeout=REQUEST_TIMEOUT)

    # Parse the response text.
    jsonh = html.text

    param = []
    try:
        jsons = json.loads(_TRAILING_COMMA.sub("]", jsonh))
        jsonss = jsons['list']
    except (ValueError, KeyError, TypeError):
        # Not JSON, or JSON without a usable 'list' entry.
        print('--------warn当前%s页出现异常' % index)
        return param

    # Any empty 'list' value ('' as well as [] or null) marks the end.
    if not jsonss:
        return ['none']

    for item in jsonss:
        itemId = _encodeField(item['itemId'])

        # Skip items that are already stored. The value is passed as a
        # one-element tuple, the form DB-API drivers expect.
        result = mysql.getOne(_EXISTS_SQL, (itemId,))
        if result is not False:
            print("%s,该号商品已存在" % itemId)
            continue

        param.append([_encodeField(item[name]) for name in FIELDS])
    return param

# --------------------------------------------------------------------------------