├── .gitignore ├── LICENSE ├── config.py ├── jd ├── __init__.py ├── items.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ └── jd_comment.py ├── main.py ├── runspider.py ├── scrapy.cfg ├── sqlhelper.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | test 91 | *test.py 92 | migrations 93 | *.png 94 | #*.iml 95 | #*.xml 96 | .idea 97 | headers.py 98 | *.html -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 guangquan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
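The tree above is a standard Scrapy project (the jd package) plus standalone entry points: main.py registers with a control server at config.domain, pops product ids from Redis and spawns runspider.py, which runs the jd_comment spider; results land in MySQL via sqlhelper.py. As a rough orientation sketch only — the product id, comment_version, sort_type and page values below are made-up placeholders, and pushing the job by hand bypasses the control server — a single crawl could be seeded like this:

# seed_example.py -- hypothetical helper, not part of the repository
import json
import redis
import config

red = redis.StrictRedis(host = config.redis_host, port = config.redis_part,
                        db = config.redis_db, password = config.redis_pass)

product_id = '1217500'  # placeholder JD product id
# JDSpider.start_requests() pops JSON jobs like this from the Redis list named after the product id
red.rpush(product_id, json.dumps({'comment_version': '98', 'sort_type': 5, 'page': 0}))
red.set('%s_page' % product_id, 1)  # remaining-page counter that parse_comment() decrements

# then, from the project root:
#     python runspider.py jd_comment 1217500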
22 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # local 4 | database_config = { 5 | 'host': 'localhost', 6 | 'port': 3306, 7 | 'user': 'root', 8 | 'password': '123456', 9 | 'charset': 'utf8', 10 | } 11 | 12 | database = 'jd' 13 | jd_item_table = 'item' 14 | 15 | redis_pass = '' 16 | redis_host = 'localhost' 17 | redis_part = '6379' 18 | redis_db = 10 19 | 20 | domain = 'http://127.0.0.1:8000/' 21 | 22 | process_count = 3 23 | -------------------------------------------------------------------------------- /jd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awolfly9/jd_comment/3fee1067b191b61be57e423c16550db3256a494f/jd/__init__.py -------------------------------------------------------------------------------- /jd/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class JdItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /jd/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class JdPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /jd/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for jd project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'jd' 13 | 14 | SPIDER_MODULES = ['jd.spiders'] 15 | NEWSPIDER_MODULE = 'jd.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | #USER_AGENT = 'jd (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | # DOWNLOAD_DELAY = 3 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | # CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'jd.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'jd.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'jd.pipelines.SomePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | 92 | # CONCURRENT_REQUESTS = 1 93 | LOG_ENABLED = True 94 | DOWNLOAD_TIMEOUT = 20 -------------------------------------------------------------------------------- /jd/spiders/__init__.py: 
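The settings.py above ships with ITEM_PIPELINES commented out: the spider writes to MySQL itself through SqlHelper rather than through jd/pipelines.py, whose JdPipeline is a pass-through. If that pipeline were ever to be used, the conventional Scrapy wiring would look roughly like this (a sketch only, not the project's shipped configuration):

# in jd/settings.py -- sketch; the shipped file keeps ITEM_PIPELINES commented out
ITEM_PIPELINES = {
    'jd.pipelines.JdPipeline': 300,
}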
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /jd/spiders/jd_comment.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import sys 4 | import chardet 5 | import re 6 | import json 7 | import datetime 8 | import config 9 | import utils 10 | import redis 11 | 12 | from scrapy.http.cookies import CookieJar 13 | from scrapy.utils.project import get_project_settings 14 | from scrapy import Spider 15 | from scrapy import Request 16 | from sqlhelper import SqlHelper 17 | 18 | reload(sys) 19 | sys.setdefaultencoding('utf-8') 20 | 21 | 22 | class JDSpider(Spider): 23 | name = 'jd_comment' 24 | 25 | def __init__(self, name = None, **kwargs): 26 | super(JDSpider, self).__init__(name, **kwargs) 27 | self.product_id = kwargs.get('product_id', -1) 28 | self.log('product_id:%s' % self.product_id) 29 | self.item_table = 'item_%s' % self.product_id 30 | self.product_page = '%s_page' % self.product_id 31 | 32 | self.log_dir = 'log/%s' % self.product_id 33 | self.is_record_page = False 34 | if self.is_record_page: 35 | utils.make_dir(self.log_dir) 36 | 37 | self.sql = SqlHelper() 38 | self.red = redis.StrictRedis(host = config.redis_host, port = config.redis_part, db = config.redis_db, 39 | password = config.redis_pass) 40 | 41 | def start_requests(self): 42 | while self.red.llen(self.product_id) > 0: 43 | data = self.red.lpop(self.product_id) 44 | if data == None: 45 | continue 46 | 47 | info = json.loads(data) 48 | url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv' \ 49 | '{comment_version}&productId={product_id}&score=0&sortType={sort_type}&page={page}&pageSize=10' \ 50 | '&isShadowSku=0'. 
\ 51 | format(product_id = self.product_id, comment_version = info.get('comment_version'), 52 | sort_type = info.get('sort_type'), page = info.get('page')) 53 | 54 | yield Request( 55 | url = url, 56 | headers = { 57 | 'Accept': '*/*', 58 | 'Accept-Encoding': 'gzip, deflate, br', 59 | 'Accept-Language': 'en-US,en;q=0.5', 60 | 'Connection': 'keep-alive', 61 | 'Host': 'club.jd.com', 62 | 'Referer': 'https://item.jd.com/%s.html' % self.product_id, 63 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 ' 64 | 'Firefox/52.0', 65 | }, 66 | method = 'GET', 67 | meta = { 68 | 'page': info.get('page') 69 | }, 70 | dont_filter = True, 71 | callback = self.parse_comment 72 | ) 73 | 74 | def parse_comment(self, response): 75 | self.save_page('%s_%s.html' % (self.product_id, response.meta.get('page')), response.body) 76 | 77 | detect = chardet.detect(response.body) 78 | encoding = detect.get('encoding', '') 79 | body = response.body.decode(encoding, 'ignore') 80 | 81 | pattern = re.compile('\((.*?)\);', re.S) 82 | item = re.search(pattern, body) 83 | if item is not None and item.group(1) is not None: 84 | data = json.loads(item.group(1)) 85 | comments = data.get('comments', []) 86 | for comment in comments: 87 | id = comment.get('id') # comment id 88 | content = comment.get('content') # comment content 89 | creation_time = comment.get('creationTime', '') # comment creation time 90 | reply_count = comment.get('replyCount', '') # number of replies 91 | score = comment.get('score', '') # star rating 92 | useful_vote_count = comment.get('usefulVoteCount', '') # number of users who found it useful 93 | useless_vote_count = comment.get('uselessVoteCount', '') # number of users who found it useless 94 | user_level_id = comment.get('userLevelId', '') # id of the commenter's user level 95 | user_province = comment.get('userProvince', '') # commenter's province 96 | nickname = comment.get('nickname', '') # commenter's nickname 97 | product_color = comment.get('productColor', '') # product color 98 | product_size = comment.get('productSize', '') # product size 99 | user_level_name = comment.get('userLevelName', '') # commenter's user level name 100 | user_client = comment.get('userClient', '') # review platform 101 | user_client_show = comment.get('userClientShow', '') # review platform (displayed) 102 | is_mobile = comment.get('isMobile', '') # whether the review was posted from mobile 103 | days = comment.get('days', '') # days after purchase that the review was posted 104 | reference_time = comment.get('referenceTime', '') # purchase time 105 | after_days = comment.get('afterDays', '') # days after purchase that the follow-up review was posted 106 | images_count = len(comment.get('images', [])) # number of images attached to the comment 107 | after_user_comment = comment.get('afterUserComment', '') 108 | if after_user_comment != '' and after_user_comment is not None: 109 | ip = after_user_comment.get('ip', '') # ip address of the follow-up review 110 | 111 | h_after_user_comment = after_user_comment.get('hAfterUserComment', {}) # default to a dict so the .get() below cannot fail 112 | after_content = h_after_user_comment.get('content', '') # content of the follow-up review 113 | else: 114 | ip = '' 115 | after_content = '' 116 | 117 | content = content.replace('\'', '') 118 | after_content = after_content.replace('\'', '') 119 | 120 | msg = { 121 | 'id': id, 122 | 'content': content, 123 | 'creation_time': creation_time, 124 | 'reply_count': reply_count, 125 | 'score': score, 126 | 'useful_vote_count': useful_vote_count, 127 | 'useless_vote_count': useless_vote_count, 128 | 'user_level_id': user_level_id, 129 | 'user_province': user_province, 130 | 'nickname': nickname, 131 | 'product_color': product_color, 132 | 'product_size': product_size, 133 | 'user_level_name': user_level_name, 134 | 'user_client': user_client, 135 | 'user_client_show': user_client_show, 136 | 'is_mobile': is_mobile, 137 | 'days': days, 138 | 'reference_time':
reference_time, 139 | 'after_days': after_days, 140 | 'images_count': images_count, 141 | 'ip': ip, 142 | 'after_content': after_content, 143 | 'save_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 144 | } 145 | 146 | self.sql.insert_json(msg, self.item_table) 147 | 148 | self.sql.commit() 149 | 150 | # decrement the remaining-page counter 151 | page = self.red.get(self.product_page) 152 | if page is not None: 153 | self.red.set(self.product_page, int(page) - 1) 154 | 155 | def save_page(self, filename, data): 156 | if self.is_record_page: 157 | with open('%s/%s' % (self.log_dir, filename), 'w') as f: 158 | f.write(data) 159 | f.close() 160 | 161 | def close(spider, reason): 162 | # commit any rows still pending in the transaction 163 | spider.sql.commit() 164 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import json 4 | import os 5 | import logging 6 | import requests 7 | import redis 8 | import config 9 | import time 10 | import subprocess 11 | import utils 12 | 13 | if __name__ == '__main__': 14 | if not os.path.exists('log'): 15 | os.makedirs('log') 16 | 17 | logging.basicConfig( 18 | filename = 'log/%s.log' % 'main', 19 | format = '%(levelname)s %(asctime)s: %(message)s', 20 | level = logging.DEBUG 21 | ) 22 | 23 | url = '%sjd/register_spider' % config.domain 24 | r = requests.get(url = url) 25 | data = json.loads(r.text) 26 | utils.log('register_spider data:%s' % data) 27 | guid = data.get('guid', -1) 28 | if guid == -1: 29 | utils.log('register_spider ERROR not get guid') 30 | else: 31 | red = redis.StrictRedis(host = config.redis_host, port = config.redis_part, db = config.redis_db, 32 | password = config.redis_pass) 33 | process_list = [] 34 | product_ids = [] 35 | while True: 36 | product_id = red.lpop(guid) 37 | if product_id is None: 38 | time.sleep(0.5) 39 | continue 40 | 41 | product_ids.append(product_id) 42 | utils.log('start crawl spider product_id:%s' % product_id) 43 | for i in range(config.process_count): 44 | popen = subprocess.Popen('cd {dir};python runspider.py {param}'.format( 45 | dir = os.getcwd(), 46 | param = 'jd_comment %s' % product_id), 47 | shell = True) 48 | data = { 49 | 'product_id': product_id, 50 | 'popen': popen, 51 | } 52 | 53 | process_list.append(data) 54 | 55 | # unregister this spider's guid 56 | url = '%sjd/delete_spider?guid=%s' % (config.domain, guid) 57 | r = requests.get(url = url) 58 | utils.log(r.text) 59 | -------------------------------------------------------------------------------- /runspider.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import os 4 | import logging 5 | import sys 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from scrapy.utils.log import configure_logging 9 | from scrapy.utils.project import get_project_settings 10 | 11 | 12 | def runspider(name, product_id): 13 | configure_logging(install_root_handler = False) 14 | logging.basicConfig( 15 | filename = 'log/%s.log' % product_id, 16 | format = '%(levelname)s %(asctime)s: %(message)s', 17 | level = logging.DEBUG 18 | ) 19 | process = CrawlerProcess(get_project_settings()) 20 | try: 21 | logging.info('runscrapy start spider:%s' % name) 22 | data = { 23 | 'product_id': product_id 24 | } 25 | process.crawl(name, **data) 26 | process.start() 27 | except Exception, e: 28 | logging.error('runscrapy spider:%s exception:%s' % (name, e)) 29 | pass 30 | 31 | logging.info('finish this spider:%s\n\n' % name) 32 | 33 | 34
| if __name__ == '__main__': 35 | print(sys.argv) 36 | name = sys.argv[1] or 'jd_comment' 37 | product_id = sys.argv[2] or '-1' 38 | print('name:%s' % name) 39 | print ('project dir:%s' % os.getcwd()) 40 | if product_id == '-1': 41 | print('ERROR not get product_id') 42 | else: 43 | runspider(name, product_id) 44 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = jd.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = jd 12 | -------------------------------------------------------------------------------- /sqlhelper.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import logging 4 | import pymysql 5 | import utils 6 | import config 7 | 8 | 9 | class SqlHelper(object): 10 | def __init__(self): 11 | self.conn = pymysql.connect(**config.database_config) 12 | self.cursor = self.conn.cursor() 13 | 14 | try: 15 | self.conn.select_db(config.database) 16 | except: 17 | # self.conn.select_db(config.database) 18 | 19 | utils.log('ERROR select db error') 20 | 21 | def init(self): 22 | # create the product crawl-record table 23 | command = ( 24 | "CREATE TABLE IF NOT EXISTS {} (" 25 | "`id` BIGINT (15) NOT NULL AUTO_INCREMENT," # auto-increment row id 26 | "`name` CHAR(200) NOT NULL," # product name 27 | "`average_score` INT(2) DEFAULT NULL," # overall star rating 28 | "`good_count` INT(7) DEFAULT NULL ," # number of positive reviews 29 | "`good_rate` FLOAT DEFAULT NULL," # positive-review ratio 30 | "`general_count` INT(4) DEFAULT NULL," # number of neutral reviews 31 | "`general_rate` FLOAT DEFAULT NULL," # neutral-review ratio 32 | "`poor_count` INT(4) DEFAULT NULL," # number of negative reviews 33 | '`poor_rate` FLOAT DEFAULT NULL,' # negative-review ratio 34 | '`after_count` INT(5) DEFAULT NULL,' # number of follow-up reviews 35 | '`good_rate_style` INT(7) DEFAULT NULL,' # 36 | "`poor_rate_style` INT(5) DEFAULT NULL," # 37 | "`general_rate_style` INT(5) DEFAULT NULL," # 38 | "`comment_count` INT(7) DEFAULT NULL," # total number of comments 39 | "`product_id` BIGINT(15) DEFAULT NULL," # product id 40 | "`good_rate_show` INT(3) DEFAULT NULL," # displayed positive-review percentage 41 | "`poor_rate_show` INT(3) DEFAULT NULL," # displayed negative-review percentage 42 | "`general_rate_show` INT(7) DEFAULT NULL," # displayed neutral-review percentage 43 | "`url` TEXT NOT NULL," # product page url 44 | "`save_time` TIMESTAMP NOT NULL," # time the data was crawled 45 | "PRIMARY KEY(id)" 46 | ") ENGINE=InnoDB".format(config.jd_item_table)) 47 | self.create_table(command) 48 | 49 | # create the comment-analysis results table 50 | command = ( 51 | "CREATE TABLE IF NOT EXISTS {} (" 52 | "`id` INT(5) NOT NULL AUTO_INCREMENT," # auto-increment id 53 | "`product_id` BIGINT(15) DEFAULT NULL ," # product id 54 | "`info` CHAR(255) DEFAULT NULL," # analysis result details 55 | "`type` CHAR(10) DEFAULT NULL," # analysis result type 56 | "`guid` CHAR(40) NOT NULL," # guid 57 | "`save_time` TIMESTAMP NOT NULL," # time of the analysis 58 | "PRIMARY KEY(id)" 59 | ") ENGINE=InnoDB".format(config.analysis_item_table)) 60 | self.create_table(command) 61 | 62 | def create_database(self): 63 | try: 64 | command = 'CREATE DATABASE IF NOT EXISTS %s DEFAULT CHARACTER SET \'utf8\' ' % config.database 65 | # utils.log('sql helper create_database command:%s' % command) 66 | self.cursor.execute(command) 67 | except Exception, e: 68 | utils.log('sql helper create_database exception:%s' % str(e), logging.WARNING) 69 | 70 | def create_table(self, command): 71 | try: 72 | # utils.log('sql helper create_table command:%s' % command) 73 |
self.cursor.execute(command) 74 | self.conn.commit() 75 | except Exception, e: 76 | utils.log('sql helper create_table exception:%s' % str(e), logging.WARNING) 77 | 78 | def insert_data(self, command, data, commit = False): 79 | try: 80 | # utils.log('insert_data command:%s, data:%s' % (command, data)) 81 | 82 | self.cursor.execute(command, data) 83 | if commit: 84 | self.conn.commit() 85 | except Exception, e: 86 | utils.log('sql helper insert_data exception msg:%s' % e, logging.WARNING) 87 | 88 | def insert_json(self, data = {}, table_name = None, commit = False): 89 | try: 90 | keys = [] 91 | vals = [] 92 | for k, v in data.items(): 93 | keys.append(k) 94 | vals.append(v) 95 | val_str = ','.join(['%s'] * len(vals)) 96 | key_str = ','.join(keys) 97 | 98 | command = "INSERT IGNORE INTO {table} ({keys}) VALUES({values})". \ 99 | format(keys = key_str, values = val_str, table = table_name) 100 | # utils.log('insert_json data:%s' % data) 101 | self.cursor.execute(command, tuple(vals)) 102 | 103 | if commit: 104 | self.conn.commit() 105 | except Exception, e: 106 | utils.log('sql helper insert_json exception msg:%s' % e, logging.WARNING) 107 | 108 | def commit(self): 109 | self.conn.commit() 110 | 111 | def execute(self, command, commit = True): 112 | try: 113 | # utils.log('sql helper execute command:%s' % command) 114 | data = self.cursor.execute(command) 115 | self.conn.commit() 116 | return data 117 | except Exception, e: 118 | utils.log('sql helper execute exception msg:%s' % str(e)) 119 | return None 120 | 121 | def query(self, command, commit = False): 122 | try: 123 | utils.log('sql helper execute command:%s' % command) 124 | 125 | self.cursor.execute(command) 126 | data = self.cursor.fetchall() 127 | if commit: 128 | self.conn.commit() 129 | return data 130 | except Exception, e: 131 | utils.log('sql helper execute exception msg:%s' % str(e)) 132 | return None 133 | 134 | def query_one(self, command, commit = False): 135 | try: 136 | utils.log('sql helper execute command:%s' % command) 137 | 138 | self.cursor.execute(command) 139 | data = self.cursor.fetchone() 140 | if commit: 141 | self.conn.commit() 142 | return data 143 | except Exception, e: 144 | utils.log('sql helper execute exception msg:%s' % str(e)) 145 | return None 146 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import logging 4 | import os 5 | import re 6 | import subprocess 7 | import traceback 8 | import time 9 | import datetime 10 | 11 | 12 | # 自定义的日志输出 13 | def log(msg, level = logging.DEBUG): 14 | logging.log(level, msg) 15 | print('%s [%s], msg:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), level, msg)) 16 | 17 | if level == logging.WARNING or level == logging.ERROR: 18 | for line in traceback.format_stack(): 19 | print(line.strip()) 20 | 21 | for line in traceback.format_stack(): 22 | logging.log(level, line.strip()) 23 | 24 | 25 | # 服务器使用,清理端口占用 26 | def kill_ports(ports): 27 | for port in ports: 28 | log('kill %s start' % port) 29 | popen = subprocess.Popen('lsof -i:%s' % port, shell = True, stdout = subprocess.PIPE) 30 | (data, err) = popen.communicate() 31 | log('data:\n%s \nerr:\n%s' % (data, err)) 32 | 33 | pattern = re.compile(r'\b\d+\b', re.S) 34 | pids = re.findall(pattern, data) 35 | 36 | log('pids:%s' % str(pids)) 37 | 38 | for pid in pids: 39 | if pid != '' and pid != None: 40 | try: 41 | log('pid:%s' % pid) 42 | popen = 
subprocess.Popen('kill -9 %s' % pid, shell = True, stdout = subprocess.PIPE) 43 | (data, err) = popen.communicate() 44 | log('data:\n%s \nerr:\n%s' % (data, err)) 45 | except Exception, e: 46 | log('kill_ports exception:%s' % e) 47 | 48 | log('kill %s finish' % port) 49 | 50 | time.sleep(1) 51 | 52 | 53 | def make_dir(dir): 54 | log('make dir:%s' % dir) 55 | if not os.path.exists(dir): 56 | os.makedirs(dir) 57 | --------------------------------------------------------------------------------
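A closing note on the storage layer: SqlHelper.insert_json() builds a parameterised INSERT IGNORE statement from a flat dict, which is how parse_comment() persists each comment into the per-product item_<product_id> table. The sketch below shows standalone usage against the analysis table created by SqlHelper.init(); it is illustrative only — the row values are placeholders, the jd database is assumed to exist already, and init() reads config.analysis_item_table, which the config.py above does not define, so an assumed value is patched in here:

# storage_example.py -- hypothetical standalone use of sqlhelper.SqlHelper
import datetime

import config
from sqlhelper import SqlHelper

# SqlHelper.init() expects this attribute; it is absent from the shipped config.py,
# so an assumed table name is supplied for the sketch.
config.analysis_item_table = 'analysis'

sql = SqlHelper()   # connects with config.database_config and selects config.database ('jd')
sql.init()          # creates the crawl-record table and the analysis table if they do not exist

sql.insert_json(
    data = {
        'product_id': 1217500,  # placeholder product id
        'info': 'demo row',
        'type': 'test',
        'guid': '0' * 40,
        'save_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    },
    table_name = config.analysis_item_table,
    commit = True,
)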