├── .gitignore ├── LICENSE ├── config.py ├── jd ├── __init__.py ├── items.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ └── jd_comment.py ├── main.py ├── runspider.py ├── scrapy.cfg ├── sqlhelper.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | test 91 | *test.py 92 | migrations 93 | *.png 94 | #*.iml 95 | #*.xml 96 | .idea 97 | headers.py 98 | *.html -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 guangquan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
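The tree above is a standard Scrapy project (the jd package) plus standalone entry points: main.py registers with a control server at config.domain, pops product ids from Redis and spawns runspider.py, which runs the jd_comment spider; results land in MySQL via sqlhelper.py. As a rough orientation sketch only — the product id, comment_version, sort_type and page values below are made-up placeholders, and pushing the job by hand bypasses the control server — a single crawl could be seeded like this:

# seed_example.py -- hypothetical helper, not part of the repository
import json
import redis
import config

red = redis.StrictRedis(host = config.redis_host, port = config.redis_part,
                        db = config.redis_db, password = config.redis_pass)

product_id = '1217500'  # placeholder JD product id
# JDSpider.start_requests() pops JSON jobs like this from the Redis list named after the product id
red.rpush(product_id, json.dumps({'comment_version': '98', 'sort_type': 5, 'page': 0}))
red.set('%s_page' % product_id, 1)  # remaining-page counter that parse_comment() decrements

# then, from the project root:
#     python runspider.py jd_comment 1217500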
22 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # local 4 | database_config = { 5 | 'host': 'localhost', 6 | 'port': 3306, 7 | 'user': 'root', 8 | 'password': '123456', 9 | 'charset': 'utf8', 10 | } 11 | 12 | database = 'jd' 13 | jd_item_table = 'item' 14 | 15 | redis_pass = '' 16 | redis_host = 'localhost' 17 | redis_part = '6379' 18 | redis_db = 10 19 | 20 | domain = 'http://127.0.0.1:8000/' 21 | 22 | process_count = 3 23 | -------------------------------------------------------------------------------- /jd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awolfly9/jd_comment/3fee1067b191b61be57e423c16550db3256a494f/jd/__init__.py -------------------------------------------------------------------------------- /jd/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class JdItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /jd/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class JdPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /jd/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for jd project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'jd' 13 | 14 | SPIDER_MODULES = ['jd.spiders'] 15 | NEWSPIDER_MODULE = 'jd.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | #USER_AGENT = 'jd (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | # DOWNLOAD_DELAY = 3 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | # CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'jd.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'jd.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'jd.pipelines.SomePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | 92 | # CONCURRENT_REQUESTS = 1 93 | LOG_ENABLED = True 94 | DOWNLOAD_TIMEOUT = 20 -------------------------------------------------------------------------------- /jd/spiders/__init__.py: 
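The settings.py above ships with ITEM_PIPELINES commented out: the spider writes to MySQL itself through SqlHelper rather than through jd/pipelines.py, whose JdPipeline is a pass-through. If that pipeline were ever to be used, the conventional Scrapy wiring would look roughly like this (a sketch only, not the project's shipped configuration):

# in jd/settings.py -- sketch; the shipped file keeps ITEM_PIPELINES commented out
ITEM_PIPELINES = {
    'jd.pipelines.JdPipeline': 300,
}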
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /jd/spiders/jd_comment.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import sys 4 | import chardet 5 | import re 6 | import json 7 | import datetime 8 | import config 9 | import utils 10 | import redis 11 | 12 | from scrapy.http.cookies import CookieJar 13 | from scrapy.utils.project import get_project_settings 14 | from scrapy import Spider 15 | from scrapy import Request 16 | from sqlhelper import SqlHelper 17 | 18 | reload(sys) 19 | sys.setdefaultencoding('utf-8') 20 | 21 | 22 | class JDSpider(Spider): 23 | name = 'jd_comment' 24 | 25 | def __init__(self, name = None, **kwargs): 26 | super(JDSpider, self).__init__(name, **kwargs) 27 | self.product_id = kwargs.get('product_id', -1) 28 | self.log('product_id:%s' % self.product_id) 29 | self.item_table = 'item_%s' % self.product_id 30 | self.product_page = '%s_page' % self.product_id 31 | 32 | self.log_dir = 'log/%s' % self.product_id 33 | self.is_record_page = False 34 | if self.is_record_page: 35 | utils.make_dir(self.log_dir) 36 | 37 | self.sql = SqlHelper() 38 | self.red = redis.StrictRedis(host = config.redis_host, port = config.redis_part, db = config.redis_db, 39 | password = config.redis_pass) 40 | 41 | def start_requests(self): 42 | while self.red.llen(self.product_id) > 0: 43 | data = self.red.lpop(self.product_id) 44 | if data == None: 45 | continue 46 | 47 | info = json.loads(data) 48 | url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv' \ 49 | '{comment_version}&productId={product_id}&score=0&sortType={sort_type}&page={page}&pageSize=10' \ 50 | '&isShadowSku=0'. 
\ 51 | format(product_id = self.product_id, comment_version = info.get('comment_version'), 52 | sort_type = info.get('sort_type'), page = info.get('page')) 53 | 54 | yield Request( 55 | url = url, 56 | headers = { 57 | 'Accept': '*/*', 58 | 'Accept-Encoding': 'gzip, deflate, br', 59 | 'Accept-Language': 'en-US,en;q=0.5', 60 | 'Connection': 'keep-alive', 61 | 'Host': 'club.jd.com', 62 | 'Referer': 'https://item.jd.com/%s.html' % self.product_id, 63 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 ' 64 | 'Firefox/52.0', 65 | }, 66 | method = 'GET', 67 | meta = { 68 | 'page': info.get('page') 69 | }, 70 | dont_filter = True, 71 | callback = self.parse_comment 72 | ) 73 | 74 | def parse_comment(self, response): 75 | self.save_page('%s_%s.html' % (self.product_id, response.meta.get('page')), response.body) 76 | 77 | detect = chardet.detect(response.body) 78 | encoding = detect.get('encoding', '') 79 | body = response.body.decode(encoding, 'ignore') 80 | 81 | pattern = re.compile('\((.*?)\);', re.S) 82 | item = re.search(pattern, body) 83 | if item is not None and item.group(1) is not None: 84 | data = json.loads(item.group(1)) 85 | comments = data.get('comments', []) 86 | for comment in comments: 87 | id = comment.get('id') # comment id 88 | content = comment.get('content') # comment content 89 | creation_time = comment.get('creationTime', '') # comment creation time 90 | reply_count = comment.get('replyCount', '') # number of replies 91 | score = comment.get('score', '') # star rating 92 | useful_vote_count = comment.get('usefulVoteCount', '') # number of users who found it useful 93 | useless_vote_count = comment.get('uselessVoteCount', '') # number of users who found it useless 94 | user_level_id = comment.get('userLevelId', '') # id of the commenter's user level 95 | user_province = comment.get('userProvince', '') # commenter's province 96 | nickname = comment.get('nickname', '') # commenter's nickname 97 | product_color = comment.get('productColor', '') # product color 98 | product_size = comment.get('productSize', '') # product size 99 | user_level_name = comment.get('userLevelName', '') # commenter's user level name 100 | user_client = comment.get('userClient', '') # review platform 101 | user_client_show = comment.get('userClientShow', '') # review platform (displayed) 102 | is_mobile = comment.get('isMobile', '') # whether the review was posted from mobile 103 | days = comment.get('days', '') # days after purchase that the review was posted 104 | reference_time = comment.get('referenceTime', '') # purchase time 105 | after_days = comment.get('afterDays', '') # days after purchase that the follow-up review was posted 106 | images_count = len(comment.get('images', [])) # number of images attached to the comment 107 | after_user_comment = comment.get('afterUserComment', '') 108 | if after_user_comment != '' and after_user_comment is not None: 109 | ip = after_user_comment.get('ip', '') # ip address of the follow-up review 110 | 111 | h_after_user_comment = after_user_comment.get('hAfterUserComment', {}) # default to a dict so the .get() below cannot fail 112 | after_content = h_after_user_comment.get('content', '') # content of the follow-up review 113 | else: 114 | ip = '' 115 | after_content = '' 116 | 117 | content = content.replace('\'', '') 118 | after_content = after_content.replace('\'', '') 119 | 120 | msg = { 121 | 'id': id, 122 | 'content': content, 123 | 'creation_time': creation_time, 124 | 'reply_count': reply_count, 125 | 'score': score, 126 | 'useful_vote_count': useful_vote_count, 127 | 'useless_vote_count': useless_vote_count, 128 | 'user_level_id': user_level_id, 129 | 'user_province': user_province, 130 | 'nickname': nickname, 131 | 'product_color': product_color, 132 | 'product_size': product_size, 133 | 'user_level_name': user_level_name, 134 | 'user_client': user_client, 135 | 'user_client_show': user_client_show, 136 | 'is_mobile': is_mobile, 137 | 'days': days, 138 | 'reference_time':
reference_time, 139 | 'after_days': after_days, 140 | 'images_count': images_count, 141 | 'ip': ip, 142 | 'after_content': after_content, 143 | 'save_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 144 | } 145 | 146 | self.sql.insert_json(msg, self.item_table) 147 | 148 | self.sql.commit() 149 | 150 | # decrement the remaining-page counter 151 | page = self.red.get(self.product_page) 152 | if page is not None: 153 | self.red.set(self.product_page, int(page) - 1) 154 | 155 | def save_page(self, filename, data): 156 | if self.is_record_page: 157 | with open('%s/%s' % (self.log_dir, filename), 'w') as f: 158 | f.write(data) 159 | f.close() 160 | 161 | def close(spider, reason): 162 | # commit any rows still pending in the transaction 163 | spider.sql.commit() 164 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import json 4 | import os 5 | import logging 6 | import requests 7 | import redis 8 | import config 9 | import time 10 | import subprocess 11 | import utils 12 | 13 | if __name__ == '__main__': 14 | if not os.path.exists('log'): 15 | os.makedirs('log') 16 | 17 | logging.basicConfig( 18 | filename = 'log/%s.log' % 'main', 19 | format = '%(levelname)s %(asctime)s: %(message)s', 20 | level = logging.DEBUG 21 | ) 22 | 23 | url = '%sjd/register_spider' % config.domain 24 | r = requests.get(url = url) 25 | data = json.loads(r.text) 26 | utils.log('register_spider data:%s' % data) 27 | guid = data.get('guid', -1) 28 | if guid == -1: 29 | utils.log('register_spider ERROR not get guid') 30 | else: 31 | red = redis.StrictRedis(host = config.redis_host, port = config.redis_part, db = config.redis_db, 32 | password = config.redis_pass) 33 | process_list = [] 34 | product_ids = [] 35 | while True: 36 | product_id = red.lpop(guid) 37 | if product_id is None: 38 | time.sleep(0.5) 39 | continue 40 | 41 | product_ids.append(product_id) 42 | utils.log('start crawl spider product_id:%s' % product_id) 43 | for i in range(config.process_count): 44 | popen = subprocess.Popen('cd {dir};python runspider.py {param}'.format( 45 | dir = os.getcwd(), 46 | param = 'jd_comment %s' % product_id), 47 | shell = True) 48 | data = { 49 | 'product_id': product_id, 50 | 'popen': popen, 51 | } 52 | 53 | process_list.append(data) 54 | 55 | # unregister this spider's guid 56 | url = '%sjd/delete_spider?guid=%s' % (config.domain, guid) 57 | r = requests.get(url = url) 58 | utils.log(r.text) 59 | -------------------------------------------------------------------------------- /runspider.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import os 4 | import logging 5 | import sys 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from scrapy.utils.log import configure_logging 9 | from scrapy.utils.project import get_project_settings 10 | 11 | 12 | def runspider(name, product_id): 13 | configure_logging(install_root_handler = False) 14 | logging.basicConfig( 15 | filename = 'log/%s.log' % product_id, 16 | format = '%(levelname)s %(asctime)s: %(message)s', 17 | level = logging.DEBUG 18 | ) 19 | process = CrawlerProcess(get_project_settings()) 20 | try: 21 | logging.info('runscrapy start spider:%s' % name) 22 | data = { 23 | 'product_id': product_id 24 | } 25 | process.crawl(name, **data) 26 | process.start() 27 | except Exception, e: 28 | logging.error('runscrapy spider:%s exception:%s' % (name, e)) 29 | pass 30 | 31 | logging.info('finish this spider:%s\n\n' % name) 32 | 33 | 34
| if __name__ == '__main__': 35 | print(sys.argv) 36 | name = sys.argv[1] or 'jd_comment' 37 | product_id = sys.argv[2] or '-1' 38 | print('name:%s' % name) 39 | print ('project dir:%s' % os.getcwd()) 40 | if product_id == '-1': 41 | print('ERROR not get product_id') 42 | else: 43 | runspider(name, product_id) 44 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = jd.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = jd 12 | -------------------------------------------------------------------------------- /sqlhelper.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import logging 4 | import pymysql 5 | import utils 6 | import config 7 | 8 | 9 | class SqlHelper(object): 10 | def __init__(self): 11 | self.conn = pymysql.connect(**config.database_config) 12 | self.cursor = self.conn.cursor() 13 | 14 | try: 15 | self.conn.select_db(config.database) 16 | except: 17 | # self.conn.select_db(config.database) 18 | 19 | utils.log('ERROR select db error') 20 | 21 | def init(self): 22 | # create the product crawl-record table 23 | command = ( 24 | "CREATE TABLE IF NOT EXISTS {} (" 25 | "`id` BIGINT (15) NOT NULL AUTO_INCREMENT," # auto-increment row id 26 | "`name` CHAR(200) NOT NULL," # product name 27 | "`average_score` INT(2) DEFAULT NULL," # overall star rating 28 | "`good_count` INT(7) DEFAULT NULL ," # number of positive reviews 29 | "`good_rate` FLOAT DEFAULT NULL," # positive-review ratio 30 | "`general_count` INT(4) DEFAULT NULL," # number of neutral reviews 31 | "`general_rate` FLOAT DEFAULT NULL," # neutral-review ratio 32 | "`poor_count` INT(4) DEFAULT NULL," # number of negative reviews 33 | '`poor_rate` FLOAT DEFAULT NULL,' # negative-review ratio 34 | '`after_count` INT(5) DEFAULT NULL,' # number of follow-up reviews 35 | '`good_rate_style` INT(7) DEFAULT NULL,' # 36 | "`poor_rate_style` INT(5) DEFAULT NULL," # 37 | "`general_rate_style` INT(5) DEFAULT NULL," # 38 | "`comment_count` INT(7) DEFAULT NULL," # total number of comments 39 | "`product_id` BIGINT(15) DEFAULT NULL," # product id 40 | "`good_rate_show` INT(3) DEFAULT NULL," # displayed positive-review percentage 41 | "`poor_rate_show` INT(3) DEFAULT NULL," # displayed negative-review percentage 42 | "`general_rate_show` INT(7) DEFAULT NULL," # displayed neutral-review percentage 43 | "`url` TEXT NOT NULL," # product page url 44 | "`save_time` TIMESTAMP NOT NULL," # time the data was crawled 45 | "PRIMARY KEY(id)" 46 | ") ENGINE=InnoDB".format(config.jd_item_table)) 47 | self.create_table(command) 48 | 49 | # create the comment-analysis results table 50 | command = ( 51 | "CREATE TABLE IF NOT EXISTS {} (" 52 | "`id` INT(5) NOT NULL AUTO_INCREMENT," # auto-increment id 53 | "`product_id` BIGINT(15) DEFAULT NULL ," # product id 54 | "`info` CHAR(255) DEFAULT NULL," # analysis result details 55 | "`type` CHAR(10) DEFAULT NULL," # analysis result type 56 | "`guid` CHAR(40) NOT NULL," # guid 57 | "`save_time` TIMESTAMP NOT NULL," # time of the analysis 58 | "PRIMARY KEY(id)" 59 | ") ENGINE=InnoDB".format(config.analysis_item_table)) 60 | self.create_table(command) 61 | 62 | def create_database(self): 63 | try: 64 | command = 'CREATE DATABASE IF NOT EXISTS %s DEFAULT CHARACTER SET \'utf8\' ' % config.database 65 | # utils.log('sql helper create_database command:%s' % command) 66 | self.cursor.execute(command) 67 | except Exception, e: 68 | utils.log('sql helper create_database exception:%s' % str(e), logging.WARNING) 69 | 70 | def create_table(self, command): 71 | try: 72 | # utils.log('sql helper create_table command:%s' % command) 73 |
self.cursor.execute(command) 74 | self.conn.commit() 75 | except Exception, e: 76 | utils.log('sql helper create_table exception:%s' % str(e), logging.WARNING) 77 | 78 | def insert_data(self, command, data, commit = False): 79 | try: 80 | # utils.log('insert_data command:%s, data:%s' % (command, data)) 81 | 82 | self.cursor.execute(command, data) 83 | if commit: 84 | self.conn.commit() 85 | except Exception, e: 86 | utils.log('sql helper insert_data exception msg:%s' % e, logging.WARNING) 87 | 88 | def insert_json(self, data = {}, table_name = None, commit = False): 89 | try: 90 | keys = [] 91 | vals = [] 92 | for k, v in data.items(): 93 | keys.append(k) 94 | vals.append(v) 95 | val_str = ','.join(['%s'] * len(vals)) 96 | key_str = ','.join(keys) 97 | 98 | command = "INSERT IGNORE INTO {table} ({keys}) VALUES({values})". \ 99 | format(keys = key_str, values = val_str, table = table_name) 100 | # utils.log('insert_json data:%s' % data) 101 | self.cursor.execute(command, tuple(vals)) 102 | 103 | if commit: 104 | self.conn.commit() 105 | except Exception, e: 106 | utils.log('sql helper insert_json exception msg:%s' % e, logging.WARNING) 107 | 108 | def commit(self): 109 | self.conn.commit() 110 | 111 | def execute(self, command, commit = True): 112 | try: 113 | # utils.log('sql helper execute command:%s' % command) 114 | data = self.cursor.execute(command) 115 | self.conn.commit() 116 | return data 117 | except Exception, e: 118 | utils.log('sql helper execute exception msg:%s' % str(e)) 119 | return None 120 | 121 | def query(self, command, commit = False): 122 | try: 123 | utils.log('sql helper execute command:%s' % command) 124 | 125 | self.cursor.execute(command) 126 | data = self.cursor.fetchall() 127 | if commit: 128 | self.conn.commit() 129 | return data 130 | except Exception, e: 131 | utils.log('sql helper execute exception msg:%s' % str(e)) 132 | return None 133 | 134 | def query_one(self, command, commit = False): 135 | try: 136 | utils.log('sql helper execute command:%s' % command) 137 | 138 | self.cursor.execute(command) 139 | data = self.cursor.fetchone() 140 | if commit: 141 | self.conn.commit() 142 | return data 143 | except Exception, e: 144 | utils.log('sql helper execute exception msg:%s' % str(e)) 145 | return None 146 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import logging 4 | import os 5 | import re 6 | import subprocess 7 | import traceback 8 | import time 9 | import datetime 10 | 11 | 12 | # 自定义的日志输出 13 | def log(msg, level = logging.DEBUG): 14 | logging.log(level, msg) 15 | print('%s [%s], msg:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), level, msg)) 16 | 17 | if level == logging.WARNING or level == logging.ERROR: 18 | for line in traceback.format_stack(): 19 | print(line.strip()) 20 | 21 | for line in traceback.format_stack(): 22 | logging.log(level, line.strip()) 23 | 24 | 25 | # 服务器使用,清理端口占用 26 | def kill_ports(ports): 27 | for port in ports: 28 | log('kill %s start' % port) 29 | popen = subprocess.Popen('lsof -i:%s' % port, shell = True, stdout = subprocess.PIPE) 30 | (data, err) = popen.communicate() 31 | log('data:\n%s \nerr:\n%s' % (data, err)) 32 | 33 | pattern = re.compile(r'\b\d+\b', re.S) 34 | pids = re.findall(pattern, data) 35 | 36 | log('pids:%s' % str(pids)) 37 | 38 | for pid in pids: 39 | if pid != '' and pid != None: 40 | try: 41 | log('pid:%s' % pid) 42 | popen = 
subprocess.Popen('kill -9 %s' % pid, shell = True, stdout = subprocess.PIPE) 43 | (data, err) = popen.communicate() 44 | log('data:\n%s \nerr:\n%s' % (data, err)) 45 | except Exception, e: 46 | log('kill_ports exception:%s' % e) 47 | 48 | log('kill %s finish' % port) 49 | 50 | time.sleep(1) 51 | 52 | 53 | def make_dir(dir): 54 | log('make dir:%s' % dir) 55 | if not os.path.exists(dir): 56 | os.makedirs(dir) 57 | --------------------------------------------------------------------------------
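A closing note on the storage layer: SqlHelper.insert_json() builds a parameterised INSERT IGNORE statement from a flat dict, which is how parse_comment() persists each comment into the per-product item_<product_id> table. The sketch below shows standalone usage against the analysis table created by SqlHelper.init(); it is illustrative only — the row values are placeholders, the jd database is assumed to exist already, and init() reads config.analysis_item_table, which the config.py above does not define, so an assumed value is patched in here:

# storage_example.py -- hypothetical standalone use of sqlhelper.SqlHelper
import datetime

import config
from sqlhelper import SqlHelper

# SqlHelper.init() expects this attribute; it is absent from the shipped config.py,
# so an assumed table name is supplied for the sketch.
config.analysis_item_table = 'analysis'

sql = SqlHelper()   # connects with config.database_config and selects config.database ('jd')
sql.init()          # creates the crawl-record table and the analysis table if they do not exist

sql.insert_json(
    data = {
        'product_id': 1217500,  # placeholder product id
        'info': 'demo row',
        'type': 'test',
        'guid': '0' * 40,
        'save_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    },
    table_name = config.analysis_item_table,
    commit = True,
)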