├── tubatu
│   ├── __init__.py
│   ├── tubatu
│   │   ├── __init__.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   ├── design_topic_spider.py
│   │   │   └── design_picture_spider.py
│   │   ├── model
│   │   │   ├── design_topic.py
│   │   │   └── design_picture.py
│   │   ├── reset.py
│   │   ├── config.py
│   │   ├── middlewares.py
│   │   ├── items.py
│   │   ├── constants.py
│   │   ├── service
│   │   │   ├── design_service.py
│   │   │   ├── design_topic_service.py
│   │   │   ├── image_service.py
│   │   │   └── design_picture_service.py
│   │   ├── pipelines.py
│   │   └── settings.py
│   ├── run.bat
│   ├── scrapy.cfg
│   └── run.py
├── guju
│   ├── guju
│   │   ├── __init__.py
│   │   ├── model
│   │   │   ├── __init__.py
│   │   │   └── design_picture.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   └── design_strategy_spider.py
│   │   ├── pipelines.py
│   │   ├── items.py
│   │   ├── config.py
│   │   ├── middlewares.py
│   │   ├── constants.py
│   │   ├── run.py
│   │   ├── service
│   │   │   └── design_strategy_service.py
│   │   └── settings.py
│   └── scrapy.cfg
├── requirements.txt
├── test
│   ├── test_anything.py
│   ├── test_bloom_filter_service.py
│   ├── test_proxy_pool.py
│   └── test_design_topic_spider.py
├── .idea
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   └── codeStyleSettings.xml
├── setup.py
├── msic
│   ├── common
│   │   ├── constant.py
│   │   ├── utils.py
│   │   ├── log.py
│   │   └── agents.py
│   ├── config.py
│   ├── proxy
│   │   ├── proxy.py
│   │   ├── proxy_pool.py
│   │   └── proxy_strategy.py
│   ├── core
│   │   └── service
│   │       ├── mongodb_service.py
│   │       └── bloom_filter_service.py
│   └── scrapy
│       └── middlewares.py
├── README.md
└── .gitignore
/tubatu/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/guju/guju/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/guju/guju/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tubatu/tubatu/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tubatu/run.bat:
--------------------------------------------------------------------------------
1 | py -3 run.py
2 | pause
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scrapy
2 | selenium
3 | six
4 | pymongo
5 | pillow
6 | requests
7 | schedule
8 | beautifulsoup4
9 | redis
--------------------------------------------------------------------------------
/test/test_anything.py:
--------------------------------------------------------------------------------
1 | def foo(x, y):
2 |     print(x, y)
3 | 
4 | 
5 | alist = [1, 2]
6 | adict = {'x': 1, 'y': 2}
7 | foo(*alist)  # unpacks the list into positional args -> 1, 2
8 | foo(**adict)  # unpacks the dict into keyword args -> 1, 2
9 |
--------------------------------------------------------------------------------
/guju/guju/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/tubatu/tubatu/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/tubatu/tubatu/model/design_topic.py:
--------------------------------------------------------------------------------
1 | class DesignTopicModel(object):
2 |     def __init__(self):
3 |         self._id = ""
4 |         self.title = ""
5 |         self.description = ""
6 |         self.html_url = ""
7 |         self.article = {}
8 |         self.create_time = ""
9 |
--------------------------------------------------------------------------------
/guju/guju/model/design_picture.py:
--------------------------------------------------------------------------------
1 | class DesignStrategyModel(object):
2 |     def __init__(self):
3 |         self.id = ""
4 |         self.category = ""
5 |         self.title = ""
6 |         self.description = ""
7 |         self.html_url = ""
8 |         self.content = ""
9 |         self.create_time = ""
10 |
--------------------------------------------------------------------------------
/guju/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = guju.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = guju
12 |
--------------------------------------------------------------------------------
/tubatu/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = tubatu.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = tubatu
12 |
13 |
14 |
--------------------------------------------------------------------------------
/guju/guju/pipelines.py:
--------------------------------------------------------------------------------
1 | from guju.service.design_strategy_service import DesignStrategyService
2 | 
3 | 
4 | class DesignStrategyPipeline(object):
5 |     def __init__(self):
6 |         self.design_strategy_service = DesignStrategyService()
7 | 
8 |     def process_item(self, item, spider):
9 |         self.design_strategy_service.handle_item(item)
10 |         return item  # pipelines should return the item for any later pipelines
10 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | 
3 | setup(
4 |     name='decoration-design-crawler',
5 |     version='',
6 |     packages=['msic', 'tubatu', 'tubatu.tubatu', 'tubatu.tubatu.spiders'],
7 |     url='',
8 |     license='',
9 |     author='Flyn',
10 |     author_email='',
11 |     description='',
12 |     requires=['scrapy', 'six', 'selenium'],
13 | )
14 |
--------------------------------------------------------------------------------
/msic/common/constant.py:
--------------------------------------------------------------------------------
1 | PROTOCOL_HTTPS = "https://"
2 | PROTOCOL_HTTP = "http://"
3 |
4 | HEADERS = {
5 |     'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36',
6 |     'Connection': 'keep-alive',
7 |     'Content-Encoding': 'gzip',
8 |     'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}
9 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Blog posts (in Chinese):
2 | * [Python Crawler in Practice: Scraping Tubatu with the Scrapy Framework (Part 1)](http://www.jianshu.com/p/5355b467d414)
3 | * [Python Crawler in Practice: Scraping Tubatu with the Scrapy Framework (Part 2)](http://www.jianshu.com/p/95403d6c1305)
4 | * [Python Crawler in Practice: Scraping Tubatu with the Scrapy Framework (Part 3)](http://www.jianshu.com/p/d0462dc6a7e0)
5 | * [Python Crawler in Practice: Scraping Tubatu with the Scrapy Framework (Part 4)](http://www.jianshu.com/p/8c5bc23f4fec)
6 | * [Python Crawler in Practice: Scraping Tubatu with the Scrapy Framework (Part 5)](http://www.jianshu.com/p/6345dbb1ad41)
7 |
--------------------------------------------------------------------------------
/tubatu/tubatu/reset.py:
--------------------------------------------------------------------------------
1 | import shutil
2 |
3 | from config import mongodb, IMAGES_STORE
4 |
5 | from msic.config import redis_client
6 |
7 | mongodb.drop_collection("design_picture")
8 | mongodb.drop_collection("design_picture_summary")
9 | mongodb.drop_collection("design_topic")
10 |
11 | redis_client.delete('tubatu_design_topic_filter')
12 | redis_client.delete('tubatu_design_picture_filter')
13 |
14 | shutil.rmtree(IMAGES_STORE)
15 |
--------------------------------------------------------------------------------
/guju/guju/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class DesignStrategyItem(scrapy.Item):
12 |     title = scrapy.Field()
13 |     html_url = scrapy.Field()
14 |     description = scrapy.Field()
15 |     content = scrapy.Field()
16 |     category = scrapy.Field()
17 |
--------------------------------------------------------------------------------
/test/test_bloom_filter_service.py:
--------------------------------------------------------------------------------
1 | import redis
2 |
3 | from msic.core.service.bloom_filter_service import RedisBloomFilter
4 |
5 | REDIS_HOST = '127.0.0.1'
6 | REDIS_PORT = 6379
7 |
8 | REDIS_DATABASE_NAME = 0
9 |
10 | redis_client = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DATABASE_NAME)
11 |
12 | if __name__ == '__main__':
13 |     bf = RedisBloomFilter(redis_client)
14 |     print(bf.is_contains('http://xiaoguotu.to8to.com/p10482698.html', "room_design"))
15 |
--------------------------------------------------------------------------------
/msic/config.py:
--------------------------------------------------------------------------------
1 | import redis
2 |
3 | from msic.core.service import mongodb_service
4 |
5 | MONGODB_HOST = "127.0.0.1"
6 | MONGODB_PORT = 27017
7 |
8 | DATABASE_NAME = 'common'
9 | mongodb_client = mongodb_service.get_client(MONGODB_HOST, MONGODB_PORT)
10 | mongodb = mongodb_service.get_db(mongodb_client, DATABASE_NAME)
11 |
12 | REDIS_HOST = '127.0.0.1'
13 | REDIS_PORT = 6379
14 | REDIS_DATABASE_NAME = 0
15 |
16 | # Redis
17 | redis_client = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DATABASE_NAME)
18 |
--------------------------------------------------------------------------------
/guju/guju/config.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from requests.packages.urllib3.connectionpool import log as requests_log
4 | from selenium.webdriver.remote.remote_connection import LOGGER as selenium_log
5 |
6 | from msic import config
7 | from msic.core.service import mongodb_service
8 |
9 | selenium_log.setLevel(logging.WARNING)
10 | requests_log.setLevel(logging.WARNING)
11 |
12 | DATABASE_NAME = "guju"
13 |
14 | # MongoDB
15 | mongodb = mongodb_service.get_db(config.mongodb_client, DATABASE_NAME)
16 |
17 | IMAGES_STORE = 'D:/scrapy'
18 |
19 | USE_PROXY = False
20 |
--------------------------------------------------------------------------------
/tubatu/tubatu/config.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from requests.packages.urllib3.connectionpool import log as requests_log
4 | from selenium.webdriver.remote.remote_connection import LOGGER as selenium_log
5 |
6 | from msic import config
7 | from msic.core.service import mongodb_service
8 |
9 | selenium_log.setLevel(logging.WARNING)
10 | requests_log.setLevel(logging.WARNING)
11 |
12 | DATABASE_NAME = "tubatu"
13 |
14 | # MongoDB
15 | mongodb = mongodb_service.get_db(config.mongodb_client, DATABASE_NAME)
16 |
17 | IMAGES_STORE = 'C:/scrapy'
18 |
19 | USE_PROXY = True
20 |
--------------------------------------------------------------------------------
/guju/guju/middlewares.py:
--------------------------------------------------------------------------------
1 | from scrapy import Spider
2 |
3 |
4 | class RedirectionMiddleware(object):
5 |     ERROR_COUNT = 0
6 | 
7 |     def process_response(self, request, response, spider: Spider):
8 |         if response.status == 302 or response.status == 503:
9 |             self.ERROR_COUNT += 1
10 |             print('Error count: %s' % self.ERROR_COUNT)
11 |             if self.ERROR_COUNT > 100:
12 |                 spider.close(spider, 'http status error')
13 |         return response
14 | 
15 |     def process_exception(self, request, exception, spider):
16 |         pass
17 |
--------------------------------------------------------------------------------
/tubatu/tubatu/middlewares.py:
--------------------------------------------------------------------------------
1 | from scrapy import Spider
2 |
3 |
4 | class RedirectionMiddleware(object):
5 |     ERROR_COUNT = 0
6 | 
7 |     def process_response(self, request, response, spider: Spider):
8 |         if response.status == 302 or response.status == 503:
9 |             self.ERROR_COUNT += 1
10 |             print('Error count: %s' % self.ERROR_COUNT)
11 |             if self.ERROR_COUNT > 100:
12 |                 spider.close(spider, 'http status error')
13 |         return response
14 | 
15 |     def process_exception(self, request, exception, spider):
16 |         pass
17 |
--------------------------------------------------------------------------------
/guju/guju/constants.py:
--------------------------------------------------------------------------------
1 | PROJECT_NAME = "guju"
2 |
3 | ZONE_TYPE = {'19': '验房须知',
4 |              '18': '装修合同',
5 |              '17': '装修预算',
6 |              '16': '装修风水',
7 |              '15': '装修设计',
8 |              '14': '装修要点',
9 |              '20': '装修灵感',
10 |              '13': '装修选材',
11 |              '12': '建材安装',
12 |              '11': '改拆工程',
13 |              '10': '水电工程',
14 |              '9': '防水工程',
15 |              '8': '泥瓦工程',
16 |              '7': '土木工程',
17 |              '6': '油漆工程',
18 |              '5': '装修污染',
19 |              '4': '装修验收',
20 |              '3': '家居护理',
21 |              '2': '家居配饰',
22 |              '1': '家电家私',
23 |              }
24 |
--------------------------------------------------------------------------------
/msic/proxy/proxy.py:
--------------------------------------------------------------------------------
1 | from msic.common import utils
2 |
3 |
4 | class Proxy(object):
5 |     def __init__(self):
6 |         self.ip = ''
7 |         self.response_speed = -1
8 |         self.validity = False
9 |         self.origin = ''
10 |         self.create_time = ''
11 |         self.update_time = ''
12 |         self.failed_count = 0
13 | 
14 |     @staticmethod
15 |     def create(ip, origin):
16 |         proxy = Proxy()
17 |         proxy.ip = ip
18 |         proxy.origin = origin
19 |         proxy.create_time = utils.get_utc_time()
20 |         proxy.update_time = proxy.create_time
21 |         proxy.failed_count = 0
22 |         proxy.response_speed = -1
23 |         proxy.validity = False
24 |         return proxy
25 |
--------------------------------------------------------------------------------
/tubatu/tubatu/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class DesignPictureItem(scrapy.Item):
12 |     fid = scrapy.Field()
13 |     title = scrapy.Field()
14 |     sub_title = scrapy.Field()
15 |     html_url = scrapy.Field()
16 |     tags = scrapy.Field()
17 |     description = scrapy.Field()
18 |     img_url = scrapy.Field()
19 |     img_width = scrapy.Field()
20 |     img_height = scrapy.Field()
21 |     img_name = scrapy.Field()
22 | 
23 | 
24 | class DesignTopicItem(scrapy.Item):
25 |     title = scrapy.Field()
26 |     description = scrapy.Field()
27 |     html_url = scrapy.Field()
28 |     article = scrapy.Field()
29 |     create_time = scrapy.Field()
30 |
--------------------------------------------------------------------------------
/tubatu/tubatu/model/design_picture.py:
--------------------------------------------------------------------------------
1 | class DesignPictureModel(object):
2 |     def __init__(self):
3 |         self.id = ""
4 |         self.fid = ""
5 |         self.title = ""
6 |         self.sub_title = ""
7 |         self.html_url = ""
8 |         self.tags = []
9 |         self.description = ""
10 |         self.img_url = ""
11 |         self.img_width = 0
12 |         self.img_height = 0
13 |         self.img_name = ""  # e.g. /tubatu/2016-09-01/ff5e6d6e5abafbaeb56af2b5034d83e9
14 |         self.create_time = ""
15 | 
16 | 
17 | class DesignPictureSummaryModel(object):
18 |     def __init__(self):
19 |         self.id = ""
20 |         self.cid = []
21 |         self.title = ""
22 |         self.description = ""
23 |         self.tags = []
24 |         self.html_url = ""
25 |         self.create_time = ""
26 |         self.update_time = ""
27 |         self.cover_img_url = ""
28 |         self.cover_img_width = 0
29 |         self.cover_img_height = 0
30 |         self.cover_img_name = ""
31 |
--------------------------------------------------------------------------------
/tubatu/tubatu/constants.py:
--------------------------------------------------------------------------------
1 | PROJECT_NAME = "tubatu"
2 |
3 | ZONE_TYPE = {'1': '客厅', '2': '卧室', '3': '餐厅', '4': '厨房', '5': '卫生间', '6': '阳台', '7': '书房', '8': '玄关', '10': '儿童房', '11': '衣帽间', '12': '花园'}
4 | STYLE_ID = {'13': '简约', '15': '现代', '4': '中式', '2': '欧式', '9': '美式', '11': '田园', '6': '新古典', '0': '混搭', '12': '地中海', '8': '东南亚', '17': '日式',
5 |             '18': '宜家', '19': '北欧', '20': '简欧'}
6 | COLOR_ID = {'1': '白色', '2': '黑色', '3': '红色', '4': '黑色', '5': '绿色', '6': '橙色', '7': '粉色', '8': '蓝色', '9': '灰色', '10': '紫色', '11': '棕色', '12': '米色',
7 |             '13': '彩色', '14': '原木色'}
8 | PART_ID = {'336': '背景墙', '16': '吊顶', '14': '隔断', '9': '窗帘', '340': '飘窗', '33': '榻榻米', '17': '橱柜', '343': '博古架', '333': '阁楼', '249': '隐形门', '21': '吧台',
9 |            '22': '酒柜', '23': '鞋柜', '24': '衣柜', '19': '窗户', '20': '相片墙', '18': '楼梯', '359': '其他'}
10 | AREA = {'1': '60㎡以下', '2': '60-80㎡', '3': '80-100㎡', '4': '100-120㎡', '5': '120-150㎡', '6': '150㎡以上'}
11 | HX_ID = {'1': '小户型', '7': '一居', '2': '二居', '3': '三居', '4': '四居', '5': '复式', '6': '别墅', '8': '公寓', '9': 'loft'}
13 |
--------------------------------------------------------------------------------
/test/test_proxy_pool.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from msic.proxy.proxy_pool import ProxyPool
4 |
5 |
6 | class TestProxyPool(unittest.TestCase):
7 |     def setUp(self):
8 |         self.proxy_pool = ProxyPool()
9 | 
10 |     def test_random_choice_proxy(self):
11 |         ip = self.proxy_pool.random_choice_proxy()
12 |         assert ip is not None
13 |         assert not ip.strip() == ''
14 |         print(ip)
15 | 
16 |     def test_add_failed_time(self):
17 |         ip = self.proxy_pool.random_choice_proxy()
18 |         # ip = '211.65.37.125:8118'
19 |         self.proxy_pool.add_failed_time(ip)
20 |         proxy = self.proxy_pool.collection.find_one({'ip': ip})
21 |         print(proxy)
22 |         print("Failed count: %s" % proxy['failed_count'])
23 | 
24 |     def test_check_ip_availability_task(self):
25 |         self.proxy_pool.check_ip_availability_task()
26 | 
27 |     def test_crawl_proxy_task(self):
28 |         self.proxy_pool.crawl_proxy_task()
29 | 
30 |     def test_start(self):
31 |         self.proxy_pool.start()
32 | 
33 | 
34 | if __name__ == '__main__':
35 |     unittest.main()
36 |
--------------------------------------------------------------------------------
/msic/core/service/mongodb_service.py:
--------------------------------------------------------------------------------
1 | from pymongo import MongoClient, errors
2 | from pymongo.collection import Collection
3 | from pymongo.database import Database
4 |
5 | from msic.common import log
6 |
7 | MAX_POOL_SIZE = 5
8 |
9 |
10 | def get_client(host: str, port: int) -> MongoClient:
11 |     try:
12 |         client = MongoClient(host, port, maxPoolSize=MAX_POOL_SIZE)
13 |         log.info("Connected successfully!!!")
14 |         return client
15 |     except errors.ConnectionFailure as e:
16 |         log.error(e)
17 | 
18 | 
19 | def get_db(client: MongoClient, db_name: str) -> Database:
20 |     try:
21 |         db = Database(client, db_name)
22 |         return db
23 |     except Exception as e:
24 |         log.error(e)
25 | 
26 | 
27 | def get_collection(db: Database, name: str) -> Collection:
28 |     collection = Collection(db, name)
29 |     return collection
30 | 
31 | 
32 | def insert(collection: Collection, data):
33 |     collection.insert_one(data)
34 | 
35 | 
36 | if __name__ == '__main__':
37 |     mongo_client = get_client(MongoClient.HOST, MongoClient.PORT)
38 |     db = get_db(mongo_client, "test")
39 |     collection = get_collection(db, "test1")
40 |     insert(collection, {"test": "helloworld"})
41 |
--------------------------------------------------------------------------------
/msic/common/utils.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import hashlib
3 | import os
4 | import uuid
5 |
6 | import requests
7 | from requests import Response
8 | from requests.adapters import HTTPAdapter
9 |
10 | from msic.common.constant import HEADERS
11 |
12 |
13 | # 2a47d8b6-6f5b-11e6-ac9d-64006a0b51ab
14 | def get_uuid() -> str:
15 |     return str(uuid.uuid1())
16 | 
17 | 
18 | # 2016-08-31T09:13:22.434Z
19 | def get_utc_time() -> str:
20 |     return datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
21 | 
22 | 
23 | def get_md5(content: str) -> str:
24 |     md5 = hashlib.md5()
25 |     md5.update(content.encode('utf-8'))
26 |     return md5.hexdigest()
27 | 
28 | 
29 | def make_dirs(path: str):
30 |     if not os.path.exists(path):
31 |         os.makedirs(path, exist_ok=True)
32 | 
33 | 
34 | def http_request(url: str, timeout=30) -> Response:
35 |     session = requests.Session()
36 |     session.mount('https://', HTTPAdapter(max_retries=5))
37 |     session.mount('http://', HTTPAdapter(max_retries=5))
38 |     response = session.get(url, headers=HEADERS, timeout=timeout)
39 |     return response
40 | 
41 | 
42 | def log(content: str):
43 |     print("============================= {content} ==========================".format(content=(get_utc_time() + " " + content)))
44 |
--------------------------------------------------------------------------------
/msic/common/log.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 | from os.path import dirname
4 |
5 | SAVE_PATH = dirname(dirname(dirname(__file__)))
6 |
7 | logger = logging.getLogger()
8 | formatter = logging.Formatter('\n%(asctime)s - %(name)s - %(levelname)s \n%(message)s')
9 |
10 | error_handler = logging.FileHandler(SAVE_PATH + '/error.log', encoding='utf-8')
11 | error_handler.setLevel(logging.ERROR)
12 | error_handler.setFormatter(formatter)
13 | logger.addHandler(error_handler)
14 |
15 | warn_handler = logging.FileHandler(SAVE_PATH + '/warn.log', encoding='utf-8')
16 | warn_handler.setLevel(logging.WARNING)
17 | warn_handler.setFormatter(formatter)
18 | logger.addHandler(warn_handler)
19 |
20 |
21 | def handle_exception(exc_type, exc_value, exc_traceback):
22 |     if issubclass(exc_type, KeyboardInterrupt):
23 |         sys.__excepthook__(exc_type, exc_value, exc_traceback)
24 |         return
25 |     logger.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))
26 | 
27 | 
28 | sys.excepthook = handle_exception
29 | 
30 | 
31 | def warn(msg):
32 |     logger.warning(msg)
33 | 
34 | 
35 | def info(msg):
36 |     logger.info(msg)
37 | 
38 | 
39 | def debug(msg):
40 |     logger.debug(msg)
41 | 
42 | 
43 | def error(e: Exception):
44 |     logger.error("Exception %s" % e)
45 |
--------------------------------------------------------------------------------
/msic/core/service/bloom_filter_service.py:
--------------------------------------------------------------------------------
1 | from redis import StrictRedis
2 |
3 |
4 | class SimpleHash(object):
5 |     def __init__(self, cap, seed):
6 |         self.cap = cap
7 |         self.seed = seed
8 | 
9 |     def hash(self, value):
10 |         ret = 0
11 |         for i in range(len(value)):
12 |             ret += self.seed * ret + ord(value[i])
13 |         return (self.cap - 1) & ret
14 | 
15 | 
16 | class RedisBloomFilter(object):
17 |     def __init__(self, redis_client: StrictRedis):
18 |         self.bit_size = 1 << 25  # ~33.5M bits (about 4 MB) per filter key
19 |         self.seeds = [5, 7, 11, 13, 31, 37, 61]
20 |         self.redis = redis_client
21 |         self.hash_dict = []
22 |         for seed in self.seeds:
23 |             self.hash_dict.append(SimpleHash(self.bit_size, seed))
24 | 
25 |     def is_contains(self, value, key):
26 |         if value is None:
27 |             return False
28 |         if len(value) == 0:
29 |             return False
30 |         ret = True
31 |         for f in self.hash_dict:
32 |             loc = f.hash(value)
33 |             ret = ret & self.redis.getbit(key, loc)
34 |         return ret
35 | 
36 |     def insert(self, value, key):
37 |         for f in self.hash_dict:
38 |             loc = f.hash(value)
39 |             self.redis.setbit(key, loc, 1)
40 |
--------------------------------------------------------------------------------
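A minimal usage sketch for RedisBloomFilter above (not a file in this repo; assumes a local Redis on 127.0.0.1:6379, as in msic/config.py). Each value is hashed by the seven SimpleHash functions and the corresponding bits are set or tested on a single Redis bitmap key:

    from redis import StrictRedis
    from msic.core.service.bloom_filter_service import RedisBloomFilter

    bf = RedisBloomFilter(StrictRedis(host='127.0.0.1', port=6379, db=0))
    bf.insert('http://example.com/a.html', 'demo_filter')              # sets 7 bits on key 'demo_filter'
    print(bf.is_contains('http://example.com/a.html', 'demo_filter'))  # 1 (truthy): probably seen
    print(bf.is_contains('http://example.com/b.html', 'demo_filter'))  # 0 (falsy): definitely not seen
--------------------------------------------------------------------------------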
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
--------------------------------------------------------------------------------
/tubatu/tubatu/service/design_service.py:
--------------------------------------------------------------------------------
1 | from msic.common import log
2 | from msic.config import redis_client
3 | from msic.core.service import mongodb_service
4 | from msic.core.service.bloom_filter_service import RedisBloomFilter
5 | from tubatu import config
6 |
7 |
8 | class DesignService(object):
9 |     TABLE_NAME = ''
10 |     REDIS_KEY = ''
11 | 
12 |     def __init__(self):
13 |         self.collection = mongodb_service.get_collection(config.mongodb, self.TABLE_NAME)
14 |         self.redis_bloom_filter = RedisBloomFilter(redis_client)
15 | 
16 |     def get_model(self, design_item):
17 |         pass
18 | 
19 |     def save_to_database(self, collection, item):
20 |         try:
21 |             mongodb_service.insert(collection, item.__dict__)
22 |         except Exception as e:
23 |             log.error(e)
24 | 
25 |     def find_one(self, collection, condition: dict):
26 |         try:
27 |             return collection.find_one(condition)
28 |         except Exception as e:
29 |             log.error(e)
30 | 
31 |     def update_one(self, collection, condition: dict, value: dict):
32 |         try:
33 |             return collection.update_one(condition, {"$set": value})
34 |         except Exception as e:
35 |             log.error(e)
36 | 
37 |     def is_duplicate_url(self, value: str) -> bool:
38 |         return self.redis_bloom_filter.is_contains(value, self.REDIS_KEY)
39 | 
40 |     def insert_to_redis(self, value: str):
41 |         self.redis_bloom_filter.insert(value, self.REDIS_KEY)
42 | 
43 |     def handle_item(self, design_item):
44 |         pass
45 |
--------------------------------------------------------------------------------
/tubatu/tubatu/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | from tubatu.service.design_picture_service import DesignPictureService
9 | from tubatu.service.design_topic_service import DesignTopicService
10 | from tubatu.service.image_service import ImageService
11 |
12 |
13 | class DesignPicturePipeline(object):
14 |     def __init__(self):
15 |         self.design_picture_service = DesignPictureService()
16 | 
17 |     def process_item(self, item, spider):
18 |         img_url = item['img_url']
19 |         img_name = ImageService.generate_name(img_url)
20 |         file_path = ImageService.file_path(img_name)
21 |         thumb_path = ImageService.thumb_path(img_name)
22 |         ImageService.download_img(img_url, file_path)
23 |         ImageService.save_thumbnail(file_path, thumb_path)
24 |         item['img_name'] = img_name
25 |         self.design_picture_service.handle_item(item)
26 |         return item
27 | 
28 | 
29 | class DesignTopicPipeline(object):
30 |     def __init__(self):
31 |         self.design_topic_service = DesignTopicService()
32 | 
33 |     def process_item(self, item, spider):
34 |         article = item['article']
35 |         for part in article:
36 |             img_url = part['img_url']
37 |             img_name = ImageService.generate_name(img_url)
38 |             file_path = ImageService.file_path(img_name)
39 |             thumb_path = ImageService.thumb_path(img_name)
40 |             ImageService.download_img(img_url, file_path)
41 |             ImageService.save_thumbnail(file_path, thumb_path)
42 |             part['img_name'] = img_name
43 |         self.design_topic_service.handle_item(item)
44 |         return item
43 |
--------------------------------------------------------------------------------
/tubatu/tubatu/service/design_topic_service.py:
--------------------------------------------------------------------------------
1 | from tubatu.items import DesignTopicItem
2 | from tubatu.model.design_topic import DesignTopicModel
3 | from tubatu.service.design_service import DesignService
4 |
5 | from msic.common import log
6 | from msic.common import utils
7 |
8 |
9 | class DesignTopicService(DesignService):
10 |     TABLE_NAME = "design_topic"
11 |     REDIS_KEY = "tubatu_design_topic_filter"
12 | 
13 |     def __init__(self):
14 |         super(DesignTopicService, self).__init__()
15 | 
16 |     def get_model(self, design_topic_item: DesignTopicItem) -> DesignTopicModel:
17 |         design_topic_model = DesignTopicModel()
18 |         design_topic_model._id = utils.get_uuid()
19 |         design_topic_model.title = design_topic_item['title']
20 |         design_topic_model.description = design_topic_item['description']
21 |         design_topic_model.html_url = design_topic_item['html_url']
22 |         design_topic_model.article = design_topic_item['article']
23 |         design_topic_model.create_time = utils.get_utc_time()
24 |         return design_topic_model
25 | 
26 |     def handle_item(self, design_topic_item: DesignTopicItem):
27 |         if self.is_duplicate_url(design_topic_item['html_url']):
28 |             return
29 |         design_topic_model = self.get_model(design_topic_item)
30 |         self.save_to_database(self.collection, design_topic_model)
31 |         self.insert_to_redis(design_topic_model.html_url)
32 | 
33 |         log.info("=========================================================================================")
34 |         log.info("html_url:" + design_topic_item['html_url'])
35 |         log.info("title:" + design_topic_item['title'])
36 |         log.info("description:" + design_topic_item['description'])
37 |         log.info("=========================================================================================")
38 |
--------------------------------------------------------------------------------
/guju/guju/run.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import threading
4 | import time
5 | from os.path import dirname
6 |
7 | from guju.spiders.design_strategy_spider import DesignStrategySpider
8 | from schedule import Scheduler
9 | from twisted.internet import reactor
10 |
11 | from guju import config
12 |
13 | path = dirname(os.path.abspath(os.path.dirname(__file__)))
14 | sys.path.append(path)
15 |
16 | from scrapy.crawler import CrawlerProcess
17 | from scrapy.utils.project import get_project_settings
18 | from scrapy import signals
19 | from pydispatch import dispatcher
20 |
21 |
22 | class Runner(object):
23 |     def __init__(self):
24 |         self.is_running = False
25 |         dispatcher.connect(self.pause_crawler, signals.engine_stopped)
26 |         self.setting = get_project_settings()
27 |         self.process = None
28 | 
29 |     def start_scrapy(self):
30 |         self.process = CrawlerProcess(self.setting)
31 |         self.crawl()
32 |         reactor.run()
33 | 
34 |     def pause_crawler(self):
35 |         self.is_running = False
36 |         print("============ Crawler stopped ===================")
37 | 
38 |     def crawl(self):
39 |         self.is_running = True
40 |         # pass the spider class itself; CrawlerProcess.crawl instantiates it
41 |         self.process.crawl(DesignStrategySpider)
42 | 
43 |     def start_proxy_pool(self):
44 |         from msic.proxy.proxy_pool import proxy_pool
45 |         if config.USE_PROXY:
46 |             proxy_pool.start()
47 |         else:
48 |             proxy_pool.drop_proxy()
49 | 
50 |     def run(self):
51 |         self.start_proxy_pool()
52 |         self.start_scrapy()
53 | 
54 | 
55 | if __name__ == '__main__':
56 |     runner = Runner()
57 | 
58 |     def thread_task():
59 |         def task():
60 |             if not runner.is_running:
61 |                 print("============ Restarting crawl ===================")
62 |                 runner.crawl()
63 | 
64 |         schedule = Scheduler()
65 |         schedule.every(30).minutes.do(task)
66 | 
67 |         while True:
68 |             schedule.run_pending()
69 |             time.sleep(1)
70 | 
71 |     thread = threading.Thread(target=thread_task)
72 |     thread.start()
73 | 
74 |     runner.run()
76 |
--------------------------------------------------------------------------------
/tubatu/run.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import threading
4 | import time
5 | from os.path import dirname
6 |
7 | from schedule import Scheduler
8 | from twisted.internet import reactor
9 |
10 | from tubatu import config
11 |
12 | path = dirname(os.path.abspath(os.path.dirname(__file__)))
13 | sys.path.append(path)
14 |
15 | from scrapy.crawler import CrawlerProcess
16 | from scrapy.utils.project import get_project_settings
17 | from scrapy import signals
18 | from pydispatch import dispatcher
19 | from tubatu.spiders.design_picture_spider import DesignPictureSpider
20 | from tubatu.spiders.design_topic_spider import DesignTopicSpider
21 |
22 |
23 | class Runner(object):
24 |     def __init__(self):
25 |         self.is_running = False
26 |         dispatcher.connect(self.pause_crawler, signals.engine_stopped)
27 |         self.setting = get_project_settings()
28 |         self.process = None
29 | 
30 |     def start_scrapy(self):
31 |         self.process = CrawlerProcess(self.setting)
32 |         self.crawl()
33 |         reactor.run()
34 | 
35 |     def pause_crawler(self):
36 |         self.is_running = False
37 |         print("============ Crawler stopped ===================")
38 | 
39 |     def crawl(self):
40 |         self.is_running = True
41 |         self.process.crawl(DesignPictureSpider)
42 |         self.process.crawl(DesignTopicSpider)
43 | 
44 |     def start_proxy_pool(self):
45 |         from msic.proxy.proxy_pool import proxy_pool
46 |         if config.USE_PROXY:
47 |             proxy_pool.start()
48 |         else:
49 |             proxy_pool.drop_proxy()
50 | 
51 |     def run(self):
52 |         self.start_proxy_pool()
53 |         self.start_scrapy()
54 | 
55 | 
56 | if __name__ == '__main__':
57 |     runner = Runner()
58 | 
59 |     def thread_task():
60 |         def task():
61 |             if not runner.is_running:
62 |                 print("============ Restarting crawl ===================")
63 |                 runner.crawl()
64 | 
65 |         schedule = Scheduler()
66 |         schedule.every(30).minutes.do(task)
67 | 
68 |         while True:
69 |             schedule.run_pending()
70 |             time.sleep(1)
71 | 
72 |     thread = threading.Thread(target=thread_task)
73 |     thread.start()
74 | 
75 |     runner.run()
78 |
--------------------------------------------------------------------------------
/test/test_design_topic_spider.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import unittest
4 | from os.path import dirname
5 |
6 | import requests
7 | from scrapy import Selector
8 | from scrapy.http import HtmlResponse
9 | 
10 | from tubatu.tubatu.items import DesignTopicItem
11 | 
12 | path = dirname(os.path.abspath(os.path.dirname(__file__)))
13 | sys.path.append(path)
14 | 
15 | 
16 | class TestDesignTopicSpider(unittest.TestCase):
17 |     def test_parse_content(self):
18 |         content = requests.get('http://xiaoguotu.to8to.com/topic/11.html')
19 |         # scrapy's Response.text is read-only, so build an HtmlResponse
20 |         # from the downloaded body instead of assigning to .text
21 |         response = HtmlResponse('http://xiaoguotu.to8to.com/topic/11.html', body=content.content, encoding='utf-8')
22 |         selector = Selector(response)
23 |         title = selector.xpath('//div[@class="xdb_title"]/h1/text()').extract()[0]
24 |         description = selector.xpath('//div[@class="xdbc_description"]//div//p/text()').extract()[0]
25 |         items_selector = selector.xpath('//div[@class="xdbc_main_content"]//p')
26 |         article = []
27 |         text = ''
28 |         for index, item_selector in enumerate(items_selector):
29 |             try:
30 |                 text = item_selector.xpath('span/text()').extract()[0]
31 |             except IndexError:
32 |                 try:
33 |                     img_url = item_selector.xpath('img/@src').extract()[0]
34 |                     img_width = 0
35 |                     try:
36 |                         img_width = item_selector.xpath('img/@width').extract()[0]
37 |                     except IndexError:
38 |                         pass
39 |                     img_height = 0
40 |                     try:
41 |                         img_height = item_selector.xpath('img/@height').extract()[0]
42 |                     except IndexError:
43 |                         pass
44 |                     article.append({'content': text, 'img_url': img_url, 'img_width': img_width, 'img_height': img_height})
45 |                 except IndexError:
46 |                     continue
47 |         design_topic_item = DesignTopicItem()
48 |         design_topic_item['title'] = title
49 |         design_topic_item['description'] = description
50 |         design_topic_item['article'] = article
51 |         design_topic_item['html_url'] = response.url
52 |         return design_topic_item
53 | 
54 | 
55 | if __name__ == '__main__':
56 |     unittest.main()
56 |
--------------------------------------------------------------------------------
/tubatu/tubatu/service/image_service.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from PIL import Image
3 | from tubatu.constants import PROJECT_NAME
4 |
5 | from msic.common import utils
6 | from msic.proxy.proxy_pool import proxy_pool
7 | from tubatu import config
8 |
9 | IMAGE_SIZE = 500, 500
10 |
11 |
12 | class ImageService(object):
13 |     @staticmethod
14 |     def generate_name(key):
15 |         create_time = utils.get_utc_time()
16 |         img_name = "/" + PROJECT_NAME + "/" + create_time[0:10] + "/" + utils.get_md5(create_time + key)
17 |         return img_name
18 | 
19 |     @staticmethod
20 |     def get_file_name(image_name) -> str:
21 |         name_data = image_name[1:].split("/")
22 |         project_name = name_data[0]
23 |         date = name_data[1]
24 |         file_name = name_data[2]
25 |         return "/" + project_name + "/" + date + "/" + file_name
26 | 
27 |     @staticmethod
28 |     def file_path(image_name):
29 |         file_path = ImageService.get_file_name(image_name)
30 |         dir_name = file_path[0:file_path.rfind("/")]
31 |         utils.make_dirs(config.IMAGES_STORE + dir_name)
32 |         path = config.IMAGES_STORE + '%s_original.jpg' % file_path
33 |         return path
34 | 
35 |     @staticmethod
36 |     def thumb_path(image_name):
37 |         file_path = ImageService.get_file_name(image_name)
38 |         dir_name = file_path[0:file_path.rfind("/")]
39 |         utils.make_dirs(config.IMAGES_STORE + dir_name)
40 |         path = config.IMAGES_STORE + '%s_thumb.jpg' % file_path
41 |         return path
42 | 
43 |     @staticmethod
44 |     def download_img(img_url, file_path):
45 |         proxies = None
46 |         proxy = ''
47 |         if config.USE_PROXY:
48 |             proxy = proxy_pool.random_choice_proxy()
49 |             proxies = {
50 |                 'http': "http://%s" % proxy,
51 |             }
52 |         try:
53 |             response = requests.get(img_url, stream=True, proxies=proxies)
54 |             if response.status_code == 200:
55 |                 with open(file_path, 'wb') as f:
56 |                     for chunk in response.iter_content(1024):
57 |                         f.write(chunk)
58 |             else:
59 |                 if config.USE_PROXY:
60 |                     proxy_pool.add_failed_time(proxy)
61 |         except Exception:
62 |             if config.USE_PROXY:
63 |                 proxy_pool.add_failed_time(proxy)
64 | 
65 |     @staticmethod
66 |     def save_thumbnail(file_path, thumb_path):
67 |         image = Image.open(file_path)
68 |         if thumb_path is not None:
69 |             image.thumbnail(IMAGE_SIZE)
70 |             image.save(thumb_path)
71 |         del image
72 |
--------------------------------------------------------------------------------
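A sketch of how ImageService's naming scheme above fits together (not a file in this repo; the URL is hypothetical). generate_name() yields "/<project>/<utc-date>/<md5>", and file_path()/thumb_path() map that name under config.IMAGES_STORE ('C:/scrapy' in tubatu's config), creating the directory as a side effect:

    from tubatu.service.image_service import ImageService

    name = ImageService.generate_name('http://example.com/room.jpg')
    # name -> "/tubatu/2016-09-01/ff5e6d6e5abafbaeb56af2b5034d83e9" (for example)
    original = ImageService.file_path(name)   # "C:/scrapy/tubatu/2016-09-01/<md5>_original.jpg"
    thumb = ImageService.thumb_path(name)     # "C:/scrapy/tubatu/2016-09-01/<md5>_thumb.jpg"
    ImageService.download_img('http://example.com/room.jpg', original)
    ImageService.save_thumbnail(original, thumb)  # thumbnail bounded to 500x500 (IMAGE_SIZE)
--------------------------------------------------------------------------------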
/msic/scrapy/middlewares.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | from scrapy.http import HtmlResponse
4 | from selenium import webdriver
5 | from selenium.webdriver import DesiredCapabilities
6 |
7 | from msic.common import log, agents
8 | from msic.proxy.proxy_pool import proxy_pool
9 |
10 | JAVASCRIPT = 'JAVASCRIPT'
11 |
12 |
13 | class CatchExceptionMiddleware(object):
14 |     def process_response(self, request, response, spider):
15 |         if response.status < 200 or response.status >= 400:
16 |             try:
17 |                 proxy_pool.add_failed_time(request.meta['proxy'].replace('http://', ''))
18 |             except KeyError:
19 |                 pass
20 |         return response
21 | 
22 |     def process_exception(self, request, exception, spider):
23 |         try:
24 |             proxy_pool.add_failed_time(request.meta['proxy'].replace('http://', ''))
25 |         except Exception:
26 |             pass
27 | 
28 | 
29 | class CustomHttpProxyMiddleware(object):
30 |     def process_request(self, request, spider):
31 |         try:
32 |             request.meta['proxy'] = "http://%s" % proxy_pool.random_choice_proxy()
33 |         except Exception as e:
34 |             log.error(e)
35 | 
36 | 
37 | class CustomUserAgentMiddleware(object):
38 |     def process_request(self, request, spider):
39 |         agent = random.choice(agents.AGENTS_ALL)
40 |         request.headers['User-Agent'] = agent
41 | 
42 | 
43 | class JavaScriptMiddleware(object):
44 |     def process_request(self, request, spider):
45 |         if JAVASCRIPT in request.meta and request.meta[JAVASCRIPT] is True:
46 |             driver = self.phantomjs_opened()
47 |             try:
48 |                 driver.get(request.url)
49 |                 body = driver.page_source
50 |                 return HtmlResponse(request.url, body=body, encoding='utf-8', request=request)
51 |             finally:
52 |                 self.phantomjs_closed(driver)
53 | 
54 |     def phantomjs_opened(self):
55 |         capabilities = DesiredCapabilities.PHANTOMJS.copy()
56 |         proxy = proxy_pool.random_choice_proxy()
57 |         capabilities['proxy'] = {
58 |             'proxyType': 'MANUAL',
59 |             'ftpProxy': proxy,
60 |             'sslProxy': proxy,
61 |             'httpProxy': proxy,
62 |             'noProxy': None
63 |         }
64 |         # capabilities['phantomjs.cli.args'] = [
65 |         #     '--proxy-auth=' + evar.get('WONDERPROXY_USER') + ':' + evar.get('WONDERPROXY_PASS')
66 |         # ]
67 |         driver = webdriver.PhantomJS(desired_capabilities=capabilities)
68 |         driver.set_page_load_timeout(120)
69 |         return driver
70 | 
71 |     def phantomjs_closed(self, driver):
72 |         driver.quit()
73 |
--------------------------------------------------------------------------------
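JavaScriptMiddleware above only renders requests that opt in through request.meta (the JAVASCRIPT key); everything else falls through to the normal downloader. A sketch of opting in from inside a spider callback (hypothetical URL; note that the settings.py files in this repo register only the user-agent, redirection, and optional proxy middlewares, so JavaScriptMiddleware would also need its own DOWNLOADER_MIDDLEWARES entry):

    import scrapy

    def parse(self, response):
        # ask JavaScriptMiddleware to fetch this page through PhantomJS
        yield scrapy.Request('http://example.com/js-heavy-page',
                             callback=self.parse_content,
                             meta={'JAVASCRIPT': True})
--------------------------------------------------------------------------------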
/guju/guju/service/design_strategy_service.py:
--------------------------------------------------------------------------------
1 | from guju.items import DesignStrategyItem
2 | from guju.model.design_picture import DesignStrategyModel
3 |
4 | from guju import config
5 | from msic.common import log
6 | from msic.common import utils
7 | from msic.config import redis_client
8 | from msic.core.service import mongodb_service
9 | from msic.core.service.bloom_filter_service import RedisBloomFilter
10 |
11 |
12 | class DesignStrategyService(object):
13 |     TABLE_NAME = "design_strategy"
14 |     REDIS_KEY = "guju_design_strategy_filter"
15 | 
16 |     def __init__(self):
17 |         self.collection = mongodb_service.get_collection(config.mongodb, self.TABLE_NAME)
18 |         self.redis_bloom_filter = RedisBloomFilter(redis_client)
19 | 
20 |     def is_duplicate_url(self, value: str) -> bool:
21 |         return self.redis_bloom_filter.is_contains(value, self.REDIS_KEY)
22 | 
23 |     def insert_to_redis(self, value: str):
24 |         self.redis_bloom_filter.insert(value, self.REDIS_KEY)
25 | 
26 |     def save_to_database(self, collection, item):
27 |         try:
28 |             mongodb_service.insert(collection, item.__dict__)
29 |         except Exception as e:
30 |             log.error(e)
31 | 
32 |     def handle_item(self, design_strategy_item: DesignStrategyItem):
33 |         if self.is_duplicate_url(design_strategy_item['html_url']):
34 |             return
35 |         design_strategy_model = self.get_design_strategy_model(design_strategy_item)
36 |         self.save_to_database(self.collection, design_strategy_model)
37 |         self.insert_to_redis(design_strategy_model.html_url)
38 |         log.info("=========================================================================================")
39 |         log.info("title:" + design_strategy_item['title'])
40 |         log.info("description:" + design_strategy_item['description'])
41 |         log.info("category:" + design_strategy_item['category'])
42 |         log.info("html_url:" + design_strategy_item['html_url'])
43 |         log.info("=========================================================================================")
44 | 
45 |     def get_design_strategy_model(self, design_strategy_item: DesignStrategyItem) -> DesignStrategyModel:
46 |         design_strategy_model = DesignStrategyModel()
47 |         design_strategy_model.id = utils.get_uuid()
48 |         design_strategy_model.title = design_strategy_item['title']
49 |         design_strategy_model.html_url = design_strategy_item['html_url']
50 |         design_strategy_model.description = design_strategy_item['description']
51 |         design_strategy_model.content = design_strategy_item['content']
52 |         design_strategy_model.category = design_strategy_item['category']
53 |         design_strategy_model.create_time = utils.get_utc_time()
54 |         return design_strategy_model
55 |
--------------------------------------------------------------------------------
/guju/guju/spiders/design_strategy_spider.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import scrapy
4 | from guju.items import DesignStrategyItem
5 | from guju.service.design_strategy_service import DesignStrategyService
6 | from scrapy.linkextractors import LinkExtractor
7 | from scrapy.selector import Selector
8 | from scrapy.spiders import CrawlSpider
9 | from scrapy.spiders import Rule
10 |
11 | from guju import config
12 | from msic.common import constant
13 | from msic.common import log
14 | from msic.proxy.proxy_pool import proxy_pool
15 |
16 |
17 | class DesignStrategySpider(CrawlSpider):
18 |     start_url_domain = 'guju.com.cn'
19 |     name = 'design_strategy'
20 |     allowed_domains = ['guju.com.cn']
21 |     start_urls = ['http://guju.com.cn/strategy/new']
22 |     rules = (
23 |         Rule(LinkExtractor(allow=r"/strategy/new/p-\d+"), follow=True, callback='parse_list'),
24 |     )
25 |     custom_settings = {
26 |         'ITEM_PIPELINES': {
27 |             'guju.pipelines.DesignStrategyPipeline': 302,
28 |         }
29 |     }
30 |     design_strategy_service = DesignStrategyService()
31 | 
32 |     def parse_list(self, response):
33 |         selector = Selector(response)
34 |         items_selector = selector.xpath('//div[@id="listITme"]//div[@class="gl-listItem"]')
35 |         for item_selector in items_selector:
36 |             id = item_selector.xpath('a/@href').extract()[0].replace('/strategy/', '')
37 |             # http://guju.com.cn/strategy/strategy_getStrategyInfo_ajax?strategyModel.id=4498
38 |             next_url = (constant.PROTOCOL_HTTP + self.start_url_domain + '/strategy/strategy_getStrategyInfo_ajax?strategyModel.id={id}').format(id=id)
39 |             if self.design_strategy_service.is_duplicate_url(next_url):
40 |                 log.info("================= Filtered duplicate: " + next_url + " ===========")
41 |                 continue
42 |             yield scrapy.Request(next_url, self.parse_content, meta={'id': id})
43 | 
44 |     def parse_content(self, response):
45 |         try:
46 |             data = json.loads(response.text)
47 |         except ValueError:
48 |             print("----------------------- Received json: " + response.text + " ------------------------------")
49 |             return
50 |         try:
51 |             model = data['strategyModel']
52 |             category = model['categoryName']
53 |             title = model['title']
54 |             description = model['description']
55 |             content = model['context']
56 | 
57 |             design_strategy_item = DesignStrategyItem()  # type: DesignStrategyItem
58 |             design_strategy_item['category'] = category
59 |             design_strategy_item['title'] = title
60 |             design_strategy_item['description'] = description
61 |             design_strategy_item['content'] = content
62 |             design_strategy_item['html_url'] = response.url
63 |             yield design_strategy_item
64 |         except Exception as e:
65 |             print("----------------------- Received json: " + response.text + " ------------------------------")
66 |             log.warn("%s ( refer: %s )" % (e, response.url))
67 |             if config.USE_PROXY:
68 |                 proxy_pool.add_failed_time(response.meta['proxy'].replace('http://', ''))
70 |
--------------------------------------------------------------------------------
/tubatu/tubatu/spiders/design_topic_spider.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from scrapy.linkextractors import LinkExtractor
3 | from scrapy.selector import Selector
4 | from scrapy.spiders import CrawlSpider
5 | from scrapy.spiders import Rule
6 | from tubatu.items import DesignTopicItem
7 | from tubatu.service.design_topic_service import DesignTopicService
8 |
9 | from msic.common import constant
10 |
11 |
12 | class DesignTopicSpider(CrawlSpider):
13 |     start_url_domain = 'xiaoguotu.to8to.com'
14 |     name = 'design_topic'
15 |     allowed_domains = ['to8to.com']
16 |     start_urls = ['http://xiaoguotu.to8to.com/topic/']
17 |     rules = (
18 |         Rule(LinkExtractor(allow=r'/topic/p_\d+.html'), follow=True, callback='parse_list', process_links='process_links'),
19 |     )
20 |     custom_settings = {
21 |         'ITEM_PIPELINES': {
22 |             'tubatu.pipelines.DesignTopicPipeline': 301,
23 |         }
24 |     }
25 |     design_topic_service = DesignTopicService()
26 | 
27 |     def process_links(self, links):
28 |         for link in links:
29 |             link.url = link.url.replace('%20', '')
30 |         return links
31 | 
32 |     def parse_list(self, response):
33 |         selector = Selector(response)
34 |         items_selector = selector.xpath('//div[@class="xgt_topic"]')
35 |         for item_selector in items_selector:
36 |             # /topic/7334.html
37 |             href = item_selector.xpath('div//a/@href').extract()[0]
38 |             href = href.strip()
39 |             # http://xiaoguotu.to8to.com/topic/7334.html
40 |             next_url = (constant.PROTOCOL_HTTP + self.start_url_domain + href)
41 |             if self.design_topic_service.is_duplicate_url(next_url):
42 |                 continue
43 |             yield scrapy.Request(next_url, self.parse_content)
44 | 
45 |     def parse_content(self, response):
46 |         selector = Selector(response)
47 |         title = selector.xpath('//div[@class="xdb_title"]/h1/text()').extract()[0]
48 |         description = selector.xpath('//div[@class="xdbc_description"]//div//p/text()').extract()[0]
49 |         items_selector = selector.xpath('//div[@class="xdbc_main_content"]//p')
50 |         article = []
51 |         text = ''
52 |         for index, item_selector in enumerate(items_selector):
53 |             try:
54 |                 text = item_selector.xpath('span/text()').extract()[0]
55 |             except IndexError:
56 |                 try:
57 |                     img_url = item_selector.xpath('img/@src').extract()[0]
58 |                     img_width = 0
59 |                     try:
60 |                         img_width = item_selector.xpath('img/@width').extract()[0]
61 |                     except IndexError:
62 |                         pass
63 |                     img_height = 0
64 |                     try:
65 |                         img_height = item_selector.xpath('img/@height').extract()[0]
66 |                     except IndexError:
67 |                         pass
68 |                     article.append({'content': text, 'img_url': img_url, 'img_width': img_width, 'img_height': img_height})
69 |                 except IndexError:
70 |                     continue
71 |         design_topic_item = DesignTopicItem()
72 |         design_topic_item['title'] = title
73 |         design_topic_item['description'] = description
74 |         design_topic_item['article'] = article
75 |         design_topic_item['html_url'] = response.url
76 |         return design_topic_item
77 |
--------------------------------------------------------------------------------
/guju/guju/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for guju project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 | import os
12 | import sys
13 | from os.path import dirname
14 |
15 | from guju.config import USE_PROXY
16 |
17 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
18 | sys.path.append(path)
19 |
20 | BOT_NAME = 'guju'
21 |
22 | SPIDER_MODULES = ['guju.spiders']
23 | NEWSPIDER_MODULE = 'guju.spiders'
24 |
25 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
26 | # USER_AGENT = 'guju (+http://www.yourdomain.com)'
27 |
28 | # Obey robots.txt rules
29 | ROBOTSTXT_OBEY = False
30 |
31 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
32 | CONCURRENT_REQUESTS = 12
33 | REACTOR_THREADPOOL_MAXSIZE = 8
34 |
35 | # Configure a delay for requests for the same website (default: 0)
36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
37 | # See also autothrottle settings and docs
38 | DOWNLOAD_DELAY = 1
39 | # The download delay setting will honor only one of:
40 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
41 | # CONCURRENT_REQUESTS_PER_IP = 16
42 |
43 | # Disable cookies (enabled by default)
44 | COOKIES_ENABLED = False
45 |
46 | # Disable Telnet Console (enabled by default)
47 | # TELNETCONSOLE_ENABLED = False
48 |
49 | # Override the default request headers:
50 | # DEFAULT_REQUEST_HEADERS = {
51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
52 | # 'Accept-Language': 'en',
53 | # }
54 |
55 | # Enable or disable spider middlewares
56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
57 | # SPIDER_MIDDLEWARES = {
58 | # 'guju.middlewares.MyCustomSpiderMiddleware': 543,
59 | # }
60 |
61 | # Enable or disable downloader middlewares
62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
63 | DOWNLOADER_MIDDLEWARES = {
64 |     'msic.scrapy.middlewares.CustomUserAgentMiddleware': 2,
65 |     'guju.middlewares.RedirectionMiddleware': 998,
66 | }
67 | if USE_PROXY:
68 |     DOWNLOADER_MIDDLEWARES['msic.scrapy.middlewares.CustomHttpProxyMiddleware'] = 1
69 |     DOWNLOADER_MIDDLEWARES['msic.scrapy.middlewares.CatchExceptionMiddleware'] = 999
70 |
71 | # Enable or disable extensions
72 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
73 | # EXTENSIONS = {
74 | # 'scrapy.extensions.telnet.TelnetConsole': None,
75 | # }
76 |
77 | # Configure item pipelines
78 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
79 | ITEM_PIPELINES = {
80 | }
81 |
82 | # Enable and configure the AutoThrottle extension (disabled by default)
83 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
84 | # AUTOTHROTTLE_ENABLED = True
85 | # The initial download delay
86 | # AUTOTHROTTLE_START_DELAY = 5
87 | # The maximum download delay to be set in case of high latencies
88 | # AUTOTHROTTLE_MAX_DELAY = 60
89 | # The average number of requests Scrapy should be sending in parallel to
90 | # each remote server
91 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
92 | # Enable showing throttling stats for every response received:
93 | # AUTOTHROTTLE_DEBUG = False
94 |
95 | # Enable and configure HTTP caching (disabled by default)
96 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
97 | # HTTPCACHE_ENABLED = True
98 | # HTTPCACHE_EXPIRATION_SECS = 0
99 | # HTTPCACHE_DIR = 'httpcache'
100 | # HTTPCACHE_IGNORE_HTTP_CODES = []
101 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
102 |
103 | AJAXCRAWL_ENABLED = False
104 | IMAGES_STORE = 'D:/scrapy'
105 |
106 | LOG_ENABLED = True
107 | LOG_FORMAT = '%(asctime)s,%(msecs)d [%(name)s] %(levelname)s: %(message)s'
108 |
--------------------------------------------------------------------------------
/tubatu/tubatu/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for tubatu project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 | import os
12 | import sys
13 | from os.path import dirname
14 |
15 | from tubatu.config import USE_PROXY
16 |
17 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
18 | sys.path.append(path)
19 |
20 | BOT_NAME = 'tubatu'
21 |
22 | SPIDER_MODULES = ['tubatu.spiders']
23 | NEWSPIDER_MODULE = 'tubatu.spiders'
24 |
25 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
26 | # USER_AGENT = 'tubatu (+http://www.yourdomain.com)'
27 |
28 | # Obey robots.txt rules
29 | ROBOTSTXT_OBEY = False
30 |
31 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
32 | CONCURRENT_REQUESTS = 12
33 | REACTOR_THREADPOOL_MAXSIZE = 8
34 |
35 | # Configure a delay for requests for the same website (default: 0)
36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
37 | # See also autothrottle settings and docs
38 | DOWNLOAD_DELAY = 0
39 | # The download delay setting will honor only one of:
40 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
41 | # CONCURRENT_REQUESTS_PER_IP = 16
42 |
43 | # Disable cookies (enabled by default)
44 | COOKIES_ENABLED = False
45 |
46 | # Disable Telnet Console (enabled by default)
47 | # TELNETCONSOLE_ENABLED = False
48 |
49 | # Override the default request headers:
50 | # DEFAULT_REQUEST_HEADERS = {
51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
52 | # 'Accept-Language': 'en',
53 | # }
54 |
55 | # Enable or disable spider middlewares
56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
57 | # SPIDER_MIDDLEWARES = {
58 | # 'tubatu.middlewares.MyCustomSpiderMiddleware': 543,
59 | # }
60 |
61 | # Enable or disable downloader middlewares
62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
63 | DOWNLOADER_MIDDLEWARES = {
64 | 'msic.scrapy.middlewares.CustomUserAgentMiddleware': 2,
65 | 'tubatu.middlewares.RedirectionMiddleware': 998,
66 | }
67 |
68 | if USE_PROXY:
69 | DOWNLOADER_MIDDLEWARES['msic.scrapy.middlewares.CustomHttpProxyMiddleware'] = 1
70 | DOWNLOADER_MIDDLEWARES['msic.scrapy.middlewares.CatchExceptionMiddleware'] = 999
71 |
72 | # Enable or disable extensions
73 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
74 | # EXTENSIONS = {
75 | # 'scrapy.extensions.telnet.TelnetConsole': None,
76 | # }
77 |
78 | # Configure item pipelines
79 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
80 | ITEM_PIPELINES = {
81 | }
82 |
83 | # Enable and configure the AutoThrottle extension (disabled by default)
84 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
85 | # AUTOTHROTTLE_ENABLED = True
86 | # The initial download delay
87 | # AUTOTHROTTLE_START_DELAY = 5
88 | # The maximum download delay to be set in case of high latencies
89 | # AUTOTHROTTLE_MAX_DELAY = 60
90 | # The average number of requests Scrapy should be sending in parallel to
91 | # each remote server
92 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
93 | # Enable showing throttling stats for every response received:
94 | # AUTOTHROTTLE_DEBUG = False
95 |
96 | # Enable and configure HTTP caching (disabled by default)
97 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
98 | # HTTPCACHE_ENABLED = True
99 | # HTTPCACHE_EXPIRATION_SECS = 0
100 | # HTTPCACHE_DIR = 'httpcache'
101 | # HTTPCACHE_IGNORE_HTTP_CODES = []
102 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
103 |
104 |
105 |
106 | AJAXCRAWL_ENABLED = False
107 | IMAGES_STORE = 'C:/scrapy'
108 |
109 | LOG_ENABLED = True
110 | LOG_FORMAT = '%(asctime)s,%(msecs)d [%(name)s] %(levelname)s: %(message)s'
111 |
112 | # MEMDEBUG_ENABLED = True
113 | # MEMDEBUG_NOTIFY = ['imflyn@163.com']
114 |
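115 | # Illustrative only (values are assumptions, not project settings): with
116 | # DOWNLOAD_DELAY = 0 and CONCURRENT_REQUESTS = 12, politeness depends entirely
117 | # on proxy rotation; the AutoThrottle block above could be enabled instead, e.g.:
118 | # AUTOTHROTTLE_ENABLED = True
119 | # AUTOTHROTTLE_START_DELAY = 1
120 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 4.0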
--------------------------------------------------------------------------------
/msic/proxy/proxy_pool.py:
--------------------------------------------------------------------------------
1 | import threading
2 | import time
3 | from datetime import datetime, timezone
4 |
5 | import pymongo
6 | from schedule import Scheduler
7 |
8 | from msic import config
9 | from msic.common import utils
10 | from msic.core.service import mongodb_service
11 | from msic.proxy import proxy_strategy
12 |
13 | TASK_INTERVAL = 60  # minutes between availability checks, enforced via the redis timestamp below
14 | FAILED_COUNT_BORDER = 3
15 | MIN_PROXY_COUNT = 10
16 |
17 | REDIS_KEY_LAST_CHECK_IP_TIME = "last_check_ip_time"
18 |
19 |
20 | class ProxyPool(object):
21 | TABLE_NAME = 'proxy_pool'
22 |
23 | def __init__(self):
24 | self.redis_client = config.redis_client
25 | self.collection = mongodb_service.get_collection(config.mongodb, self.TABLE_NAME)
26 | self.collection.create_index([('ip', pymongo.ASCENDING)], unique=True, sparse=True)
27 |
28 | # Singleton
29 | def __new__(cls, *args, **kwargs):
30 | if not hasattr(cls, '_instance'):
31 | org = super(ProxyPool, cls)
32 |             cls._instance = org.__new__(cls)  # object.__new__ takes no extra args in Python 3
33 | return cls._instance
34 |
35 | def random_choice_proxy(self) -> str:
36 | proxy = self.collection.find().sort(
37 | [("failed_count", pymongo.ASCENDING), ("validity", pymongo.DESCENDING), ("response_speed", pymongo.ASCENDING),
38 | ("update_time", pymongo.DESCENDING)])
39 |         return proxy[0]['ip']  # top-ranked entry, not random; raises IndexError if the pool is empty
40 |
41 | def add_failed_time(self, ip):
42 | proxy = self.collection.find_one({'ip': ip})
43 | if proxy is not None:
44 | failed_count = proxy['failed_count'] + 1
45 | utils.log("ip: %s 失败次数+1 已失败次数%s次" % (ip, failed_count))
46 | if failed_count <= FAILED_COUNT_BORDER:
47 | try:
48 | self.collection.update_one({'ip': ip}, {"$set": {'update_time': utils.get_utc_time(), 'failed_count': failed_count}})
49 |                 except Exception:
50 | pass
51 | else:
52 | try:
53 | self.collection.delete_one({'ip': ip})
54 |                 except Exception:
55 | pass
56 | self.crawl_proxy_task()
57 |
58 | def crawl_proxy_task(self, check_num: bool = True):
59 | if check_num:
60 |             count = self.collection.count()  # deprecated in newer pymongo; count_documents({}) is the modern equivalent
61 | if count > MIN_PROXY_COUNT:
62 | return
63 |         utils.log("Start crawling proxies")
64 |         proxy_list = proxy_strategy.crawl_proxy()
65 |         utils.log("Start saving proxies")
66 |         for proxy in proxy_list:
67 |             if not self.collection.find_one({'ip': proxy.ip}):
68 |                 self.collection.insert_one(proxy.__dict__)
69 |                 utils.log('Saved: ' + proxy.ip)
70 |         utils.log("Finished saving")
71 |
72 | def check_ip_availability_task(self):
73 | last_check_time = self.redis_client.get(REDIS_KEY_LAST_CHECK_IP_TIME)
74 |         now_time = datetime.now(timezone.utc).timestamp()  # utcnow().timestamp() would mis-apply the local timezone
75 | if last_check_time is not None and (now_time - float(last_check_time)) < (TASK_INTERVAL * 60):
76 | return
77 | self.redis_client.set(REDIS_KEY_LAST_CHECK_IP_TIME, now_time)
78 |
79 | proxy_list = self.collection.find()
80 | for proxy in proxy_list:
81 | ip = proxy['ip']
82 | start_time = time.time()
83 |             response = utils.http_request('http://www.baidu.com', timeout=10)  # NOTE: 'ip' is never applied here, so this measures baidu reachability rather than the proxy itself
84 | is_success = response.status_code == 200
85 | response.close()
86 | if not is_success:
87 | try:
88 | self.collection.delete_one({'ip': ip})
89 |                 except Exception:
90 | pass
91 | utils.log('Check ip %s FAILED' % ip)
92 | else:
93 | elapsed = round(time.time() - start_time, 4)
94 | try:
95 | self.collection.update_one({'ip': ip},
96 | {"$set": {'update_time': utils.get_utc_time(), 'response_speed': elapsed, 'validity': True}})
97 |                 except Exception:
98 | pass
99 | utils.log('Check ip %s SUCCESS' % ip)
100 |
101 | def start(self):
102 | self.crawl_proxy_task(False)
103 |
104 | def task():
105 | self.check_ip_availability_task()
106 | schedule = Scheduler()
107 | schedule.every(10).minutes.do(self.check_ip_availability_task)
108 |
109 | while True:
110 | schedule.run_pending()
111 | time.sleep(1)
112 |
113 |         thread = threading.Thread(target=task, daemon=True)  # daemon so it never blocks interpreter exit
114 | thread.start()
115 |
116 | def drop_proxy(self):
117 | self.collection.delete_many({})
118 |
119 |
120 | proxy_pool = ProxyPool()
121 |
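122 | # Usage sketch (an illustrative addition, not original to this module): seed the
123 | # pool, take the current best-ranked proxy, and report failures back so the
124 | # ranking stays honest. add_failed_time() should be called whenever a request
125 | # made through that proxy fails.
126 | if __name__ == '__main__':
127 |     proxy_pool.start()  # seeds the pool and spawns the background checker thread
128 |     ip = proxy_pool.random_choice_proxy()  # 'host:port'; raises IndexError if the pool is empty
129 |     utils.log('picked proxy %s' % ip)
130 |     proxy_pool.add_failed_time(ip)  # pretend a request through it failed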
--------------------------------------------------------------------------------
/tubatu/tubatu/service/design_picture_service.py:
--------------------------------------------------------------------------------
1 | from msic.common import log
2 | from msic.common import utils
3 | from msic.core.service import mongodb_service
4 |
5 | from tubatu import config
6 | from tubatu.items import DesignPictureItem
7 | from tubatu.model.design_picture import DesignPictureModel, DesignPictureSummaryModel
8 | from tubatu.service.design_service import DesignService
9 |
10 |
11 | class DesignPictureService(DesignService):
12 | TABLE_NAME = "design_picture"
13 | TABLE_NAME_SUMMARY = "design_picture_summary"
14 | REDIS_KEY = "tubatu_design_picture_filter"
15 |
16 | def __init__(self):
17 | super(DesignPictureService, self).__init__()
18 | self.summary_collection = mongodb_service.get_collection(config.mongodb, self.TABLE_NAME_SUMMARY)
19 |
20 | def get_design_picture_model(self, design_picture_item: DesignPictureItem) -> DesignPictureModel:
21 | design_picture_model = DesignPictureModel()
22 | design_picture_model.id = utils.get_uuid()
23 | design_picture_model.fid = design_picture_item['fid']
24 | design_picture_model.title = design_picture_item['title']
25 | design_picture_model.sub_title = design_picture_item['sub_title']
26 | design_picture_model.html_url = design_picture_item['html_url']
27 | design_picture_model.tags = design_picture_item['tags']
28 | design_picture_model.description = design_picture_item['description']
29 | design_picture_model.img_url = design_picture_item['img_url']
30 | design_picture_model.img_width = design_picture_item['img_width']
31 | design_picture_model.img_height = design_picture_item['img_height']
32 | design_picture_model.img_name = design_picture_item['img_name']
33 | design_picture_model.create_time = utils.get_utc_time()
34 | return design_picture_model
35 |
36 | def create_design_picture_summary_model(self, design_picture_model: DesignPictureModel) -> DesignPictureSummaryModel:
37 | design_picture_summary_model = DesignPictureSummaryModel()
38 | design_picture_summary_model.id = design_picture_model.fid
39 | design_picture_summary_model.cid = [design_picture_model.id]
40 | design_picture_summary_model.title = design_picture_model.title
41 | design_picture_summary_model.description = design_picture_model.description
42 | design_picture_summary_model.tags = design_picture_model.tags
43 | design_picture_summary_model.html_url = design_picture_model.html_url
44 | design_picture_summary_model.create_time = utils.get_utc_time()
45 | design_picture_summary_model.update_time = design_picture_summary_model.create_time
46 | design_picture_summary_model.cover_img_url = design_picture_model.img_url
47 | design_picture_summary_model.cover_img_width = design_picture_model.img_width
48 | design_picture_summary_model.cover_img_height = design_picture_model.img_height
49 | design_picture_summary_model.cover_img_name = design_picture_model.img_name
50 | return design_picture_summary_model
51 |
52 | def handle_item(self, design_picture_item: DesignPictureItem):
53 | if self.is_duplicate_url(design_picture_item['img_url']):
54 | return
55 | design_picture_model = self.get_design_picture_model(design_picture_item)
56 | self.save_to_database(self.collection, design_picture_model)
57 |
58 | summary_model = self.find_one(self.summary_collection, {'id': design_picture_model.fid})
59 | if summary_model is None:
60 | summary_model = self.create_design_picture_summary_model(design_picture_model)
61 | self.save_to_database(self.summary_collection, summary_model)
62 | else:
63 | tags = list(set(summary_model['tags']).union(set(design_picture_model.tags)))
64 | summary_model['cid'].append(design_picture_model.id)
65 | self.update_one(self.summary_collection, {'id': summary_model['id']},
66 | {'update_time': utils.get_utc_time(), 'tags': tags, 'cid': summary_model['cid']})
67 | self.insert_to_redis(design_picture_model.img_url)
68 |
69 | log.info("=========================================================================================")
70 | log.info("title:" + design_picture_item['title'])
71 | log.info("sub_title:" + design_picture_item['sub_title'])
72 | log.info("original_width:" + design_picture_item['img_width'])
73 | log.info("original_height:" + design_picture_item['img_height'])
74 | log.info("html_url:" + design_picture_item['html_url'])
75 | log.info("img_url:" + design_picture_item['img_url'])
76 | log.info("description:" + design_picture_item['description'])
77 | log.info("tags:%s" % ','.join(map(str, design_picture_item['tags'])))
78 | log.info("=========================================================================================")
79 |
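80 | # Worked sketch of the merge branch above (illustrative, standalone): when another
81 | # picture of an already-summarized case arrives, its tags are unioned with the
82 | # stored tags and its id is appended to the summary's cid list.
83 | if __name__ == '__main__':
84 |     stored_tags = ['modern', 'living room']   # hypothetical summary tags
85 |     new_tags = ['living room', 'white']       # tags on the incoming picture
86 |     merged = list(set(stored_tags).union(set(new_tags)))
87 |     print(sorted(merged))  # ['living room', 'modern', 'white']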
--------------------------------------------------------------------------------
/msic/proxy/proxy_strategy.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | from bs4 import BeautifulSoup
4 | from msic.common import utils
5 | from msic.proxy.proxy import Proxy
6 |
7 |
8 | class GetProxyStrategy(object):
9 | URL = ''
10 |
11 | def __init__(self):
12 | self.content = ''
13 |
14 |     def execute(self):  # fetch URL; subclasses parse self.content and return a list of 'ip:port' strings
15 | self.content = utils.http_request(self.URL).text
16 |
17 |
18 | class GetXiciProxyStrategy(GetProxyStrategy):
19 | SPEED = 100
20 | NAME = 'Xici'
21 |
22 | def execute(self):
23 | super(GetXiciProxyStrategy, self).execute()
24 | ip = []
25 | soup = BeautifulSoup(self.content, 'html.parser')
26 | ip_list = soup.find('table', id='ip_list')
27 | ip_tr_list = ip_list.find_all('tr', limit=101)
28 | for index, ip_tr in enumerate(ip_tr_list):
29 | if index == 0:
30 | continue
31 | ip_td = ip_tr.find_all('td')
32 | address = ''
33 | port = ''
34 | is_high_quality = True
35 | for num, data in enumerate(ip_td):
36 | if num == 1:
37 | address = data.getText()
38 | elif num == 2:
39 | port = data.getText()
40 | elif num == 6 or num == 7:
41 | try:
42 | value = data.find('div', class_='bar').find('div').attrs['style'] # type:str
43 | is_high_quality = is_high_quality and int(value.replace('width:', '').replace('%', '')) > self.SPEED
44 |                     except Exception:
45 | break
46 | elif num > 7:
47 | break
48 | if is_high_quality:
49 | ip.append(address + ':' + port)
50 | return ip
51 |
52 |
53 | class GetXiciChinaProxyStrategy(GetXiciProxyStrategy):
54 | URL = 'http://www.xicidaili.com/nn/'
55 | SPEED = 85
56 |
57 |
58 | class GetXiciForeignProxyStrategy(GetXiciProxyStrategy):
59 | URL = 'http://www.xicidaili.com/wn/'
60 | SPEED = 80
61 |
62 |
63 | class Get66ipProxyStrategy(GetProxyStrategy):
64 | NAME = '66ip'
65 | URL = 'http://www.66ip.cn/nmtq.php?getnum=800&isp=0&anonymoustype=4&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip'
66 |
67 | def execute(self):
68 | super(Get66ipProxyStrategy, self).execute()
69 | soup = BeautifulSoup(self.content, 'html.parser')
70 | ip = []
71 | for br in soup.findAll('br'):
72 | ip.append(br.next.strip())
73 | return ip
74 |
75 |
76 | class GetKuaidailiProxyStrategy(GetProxyStrategy):
77 | NAME = 'Kuaidaili'
78 | URL = 'http://www.kuaidaili.com/free/inha/%s/'
79 | SPEED = 5
80 |
81 | def execute(self):
82 | ip = []
83 | for num in range(1, 10):
84 | url = self.URL % num
85 | context = utils.http_request(url).text
86 | ip = ip + self.parse(context)
87 | time.sleep(3)
88 | return ip
89 |
90 |     def parse(self, content) -> list:
91 | ip = []
92 | soup = BeautifulSoup(content, 'html.parser')
93 | ip_table = soup.find('tbody')
94 | ip_tr_list = ip_table.find_all('tr')
95 | for ip_tr in ip_tr_list:
96 | ip_td = ip_tr.find_all('td')
97 | address = ''
98 | port = ''
99 | is_high_quality = True
100 | for num, data in enumerate(ip_td):
101 | if num == 0:
102 | address = data.getText()
103 | elif num == 1:
104 | port = data.getText()
105 | elif num == 2:
106 |                     is_high_quality = data.getText() == '高匿名'  # '高匿名' means "high anonymity"
107 | if not is_high_quality:
108 | break
109 | elif num == 6:
110 | try:
111 | is_high_quality = is_high_quality and float(data.getText()[:-1]) < self.SPEED
112 | break
113 |                     except Exception:
114 | break
115 | if is_high_quality:
116 | ip.append(address + ':' + port)
117 | return ip
118 |
119 |
120 | def crawl_proxy() -> list:
121 | proxy_list = []
122 |
123 | def get_proxy_list(_strategy):
124 | _proxy_list = []
125 | _ip_list = _strategy.execute()
126 | for ip in _ip_list:
127 | if ip.strip() == '':
128 | continue
129 | _proxy = Proxy.create(ip, _strategy.NAME)
130 | _proxy_list.append(_proxy)
131 | return _proxy_list
132 |
133 | proxy_list += get_proxy_list(GetKuaidailiProxyStrategy())
134 | # proxy_list += get_proxy_list(Get66ipProxyStrategy())
135 | # proxy_list += get_proxy_list(GetXiciChinaProxyStrategy())
136 | # proxy_list += get_proxy_list(GetXiciForeignProxyStrategy())
137 | return proxy_list
138 |
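139 | # Extension sketch (hypothetical source, shown for illustration only): a new
140 | # provider plugs in by subclassing GetProxyStrategy, defining NAME/URL, and
141 | # returning a list of 'ip:port' strings from execute(); crawl_proxy() could then
142 | # call get_proxy_list(GetExampleProxyStrategy()).
143 | class GetExampleProxyStrategy(GetProxyStrategy):
144 |     NAME = 'Example'
145 |     URL = 'http://proxies.example.com/list'  # hypothetical endpoint
146 |
147 |     def execute(self):
148 |         super(GetExampleProxyStrategy, self).execute()  # fetches self.content
149 |         # assumes the response carries one 'ip:port' per line
150 |         return [line.strip() for line in self.content.splitlines() if line.strip()]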
--------------------------------------------------------------------------------
/tubatu/tubatu/spiders/design_picture_spider.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import scrapy
4 | from scrapy.linkextractors import LinkExtractor
5 | from scrapy.selector import Selector
6 | from scrapy.spiders import CrawlSpider, Rule
7 |
8 | from msic.common import log, constant
9 | from msic.common import utils
10 | from msic.proxy.proxy_pool import proxy_pool
11 |
12 | from tubatu import config
13 | from tubatu.constants import ZONE_TYPE, STYLE_ID, AREA, COLOR_ID, HX_ID, PART_ID
14 | from tubatu.items import DesignPictureItem
15 | from tubatu.service.design_picture_service import DesignPictureService
16 |
17 |
18 | class DesignPictureSpider(CrawlSpider):
19 | start_url_domain = 'xiaoguotu.to8to.com'
20 | name = 'design_picture'
21 | allowed_domains = ['to8to.com']
22 | start_urls = ['http://xiaoguotu.to8to.com/tuce/']
23 | rules = (
24 |         Rule(LinkExtractor(allow=r"/tuce/p_\d+.html"), follow=True, callback='parse_list'),
25 | )
26 | custom_settings = {
27 | 'ITEM_PIPELINES': {
28 | 'tubatu.pipelines.DesignPicturePipeline': 302,
29 | }
30 | }
31 | design_picture_service = DesignPictureService()
32 |
33 | def parse_list(self, response):
34 | selector = Selector(response)
35 | items_selector = selector.xpath('//div[@class="xmp_container"]//div[@class="item"]')
36 | for item_selector in items_selector:
37 | # http://xiaoguotu.to8to.com/c10037052.html
38 | cid = item_selector.xpath('div//a/@href').extract()[0][23:-6]
39 | title = item_selector.xpath('div//a/@title').extract()[0]
40 | # http://xiaoguotu.to8to.com/getxgtjson.php?a2=0&a12=&a11=10037052&a1=0
41 | next_url = (constant.PROTOCOL_HTTP + self.start_url_domain + '/case/list?a2=0&a12=&a11={cid}&a1=0').format(cid=cid)
42 | yield scrapy.Request(next_url, self.parse_content, meta={'cid': cid, 'title': title})
43 |
44 | def parse_content(self, response):
45 | uuid = utils.get_uuid()
46 | cid = response.meta['cid']
47 | title = response.meta['title']
48 |         try:
49 |             data = json.loads(response.text)
50 |         except ValueError:
51 |             log.warn("Response is not valid JSON: %s ( refer: %s )" % (response.text, response.url))
52 |             return
53 | data_img_list = data['dataImg']
54 | for _data_img in data_img_list:
55 | data_album_list = _data_img['album']
56 | for data_album in data_album_list:
57 | data_img = data_album['l']
58 | # http://pic.to8to.com/case/1605/05/20160505_f0af86a239d0b02e9635a47ih5l1riuq_sp.jpg
59 | img_url = 'http://pic.to8to.com/case/{short_name}'.format(short_name=data_img['s'])
60 | if self.design_picture_service.is_duplicate_url(img_url):
61 | break
62 | sub_title = data_img['t']
63 | original_width = data_img['w']
64 | original_height = data_img['h']
65 | tags = []
66 | try:
67 | zoom_type = ZONE_TYPE[data_img['zid']]
68 |                     if zoom_type is not None and zoom_type.strip() != '':
69 | tags.append(zoom_type)
70 | except KeyError:
71 | pass
72 | try:
73 | style_id = STYLE_ID[data_img['sid']]
74 |                     if style_id is not None and style_id.strip() != '':
75 | tags.append(style_id)
76 | except KeyError:
77 | pass
78 | try:
79 | area = AREA[data_img['a']]
80 |                     if area is not None and area.strip() != '':
81 | tags.append(area)
82 | except KeyError:
83 | pass
84 | try:
85 | color_id = COLOR_ID[data_img['coid']]
86 |                     if color_id is not None and color_id.strip() != '':
87 | tags.append(color_id)
88 | except KeyError:
89 | pass
90 | try:
91 | house_type = HX_ID[data_img['hxid']]
92 |                     if house_type is not None and house_type.strip() != '':
93 | tags.append(house_type)
94 | except KeyError:
95 | pass
96 | try:
97 | part = PART_ID[data_img['pid']]
98 |                     if part is not None and part.strip() != '':
99 | tags.append(part)
100 | except KeyError:
101 | pass
102 | try:
103 | design_picture_item = DesignPictureItem() # type: DesignPictureItem
104 | design_picture_item['fid'] = uuid
105 | design_picture_item['html_url'] = response.url
106 | design_picture_item['img_url'] = img_url
107 | design_picture_item['tags'] = tags
108 | design_picture_item['title'] = title
109 | design_picture_item['sub_title'] = sub_title
110 | design_picture_item['img_width'] = str(original_width)
111 | design_picture_item['img_height'] = str(original_height)
112 | design_picture_item['description'] = design_picture_item['title']
113 | yield design_picture_item
114 | except Exception as e:
115 |                     log.warn("Raw JSON response: %s" % response.text)
116 | log.warn("%s ( refer: %s )" % (e, response.url))
117 | if config.USE_PROXY:
118 | proxy_pool.add_failed_time(response.meta['proxy'].replace('http://', ''))
119 |
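120 | # Run sketch (illustrative; the project's run.py presumably does something
121 | # similar): launch this spider programmatically with the project settings,
122 | # assuming it is executed from the directory containing scrapy.cfg.
123 | if __name__ == '__main__':
124 |     from scrapy.crawler import CrawlerProcess
125 |     from scrapy.utils.project import get_project_settings
126 |
127 |     process = CrawlerProcess(get_project_settings())
128 |     process.crawl(DesignPictureSpider)
129 |     process.start()  # blocks until the crawl finishes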
--------------------------------------------------------------------------------
/msic/common/agents.py:
--------------------------------------------------------------------------------
1 | AGENTS_ALL = [
2 | "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
3 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
4 | "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
5 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
6 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
7 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
8 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
9 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
10 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
11 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
12 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
13 | "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
14 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
15 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
16 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
17 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
18 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
19 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
20 | "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
21 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2",
22 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
23 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre",
24 | "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )",
25 | "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)",
26 | "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a",
27 | "Mozilla/2.02E (Win95; U)",
28 | "Mozilla/3.01Gold (Win95; I)",
29 | "Mozilla/4.8 [en] (Windows NT 5.1; U)",
30 | "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)",
31 | "Opera/7.50 (Windows XP; U)",
32 | "Opera/7.50 (Windows ME; U) [en]",
33 | "Opera/7.51 (Windows NT 5.1; U) [en]",
34 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0",
35 | "Mozilla/5.0 (Windows; U; WinNT4.0; en-US; rv:1.2b) Gecko/20021001 Phoenix/0.2",
36 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.23) Gecko/20090825 SeaMonkey/1.1.18",
37 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/527 (KHTML, like Gecko, Safari/419.3) Arora/0.6 (Change: )",
38 | "Mozilla/5.0 (Windows; U; ; en-NZ) AppleWebKit/527 (KHTML, like Gecko, Safari/419.3) Arora/0.8.0",
39 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser; Avant Browser; .NET CLR 1.0.3705; .NET CLR 1.1.4322; Media Center PC 4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)",
40 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.8 (KHTML, like Gecko) Beamrise/17.2.0.9 Chrome/17.0.939.0 Safari/535.8",
41 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/18.6.872.0 Safari/535.2 UNTRUSTED/1.0 3gpp-gba UNTRUSTED/1.0",
42 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
43 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
44 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
45 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
46 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.1) Gecko/20100101 Firefox/10.0.1",
47 | "Mozilla/5.0 (Windows NT 6.1; rv:12.0) Gecko/20120403211507 Firefox/12.0",
48 | "Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1",
49 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1",
50 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0",
51 | "iTunes/9.0.2 (Windows; N)",
52 | "Mozilla/5.0 (compatible; Konqueror/4.5; Windows) KHTML/4.5.4 (like Gecko)",
53 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.1 (KHTML, like Gecko) Maxthon/3.0.8.2 Safari/533.1",
54 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
55 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
56 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)",
57 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/4.0)",
58 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
59 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)",
60 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
61 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.2; Trident/5.0)",
62 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.2; WOW64; Trident/5.0)",
63 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
64 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)",
65 | "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0",
66 | "Opera/9.25 (Windows NT 6.0; U; en)",
67 | "Opera/9.80 (Windows NT 5.2; U; en) Presto/2.2.15 Version/10.10",
68 | "Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.7.39 Version/11.00",
69 | "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.7.62 Version/11.01",
70 | "Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10",
71 | "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00",
72 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
73 | "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
74 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
75 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
76 | "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
77 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
78 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
79 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
80 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
81 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
82 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
83 | "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
84 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
85 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
86 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
87 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
88 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
89 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0",
90 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0",
91 | "Mozilla/4.0 (compatible; MSIE 5.15; Mac_PowerPC)",
92 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-US) AppleWebKit/125.4 (KHTML, like Gecko, Safari) OmniWeb/v563.15",
93 | "Opera/9.0 (Macintosh; PPC Mac OS X; U; en)",
94 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/125.2 (KHTML, like Gecko) Safari/85.8",
95 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/125.2 (KHTML, like Gecko) Safari/125.8",
96 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X; fr-fr) AppleWebKit/312.5 (KHTML, like Gecko) Safari/312.3",
97 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418.8 (KHTML, like Gecko) Safari/419.3",
98 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1 Camino/2.2.1",
99 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0b6pre) Gecko/20100907 Firefox/4.0b6pre Camino/2.2a1pre",
100 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
101 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.4 (KHTML like Gecko) Chrome/22.0.1229.79 Safari/537.4",
102 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1",
103 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0",
104 | "iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)",
105 | "iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)",
106 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US) AppleWebKit/528.16 (KHTML, like Gecko, Safari/528.16) OmniWeb/v622.8.0.112941",
107 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/528.16 (KHTML, like Gecko, Safari/528.16) OmniWeb/v622.8.0",
108 | "Opera/9.20 (Macintosh; Intel Mac OS X; U; en)",
109 | "Opera/9.64 (Macintosh; PPC Mac OS X; U; en) Presto/2.1.1",
110 | "Opera/9.80 (Macintosh; Intel Mac OS X; U; en) Presto/2.6.30 Version/10.61",
111 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.4.11; U; en) Presto/2.7.62 Version/11.00",
112 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
113 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; en-us) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
114 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
115 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-us) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
116 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_7; en-us) AppleWebKit/534.20.8 (KHTML, like Gecko) Version/5.1 Safari/534.20.8",
117 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10",
118 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.5; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
119 | "ELinks (0.4pre5; Linux 2.6.10-ac7 i686; 80x33)",
120 | "ELinks/0.9.3 (textmode; Linux 2.6.9-kanotix-8 i686; 127x41)",
121 | "ELinks/0.12~pre5-4",
122 | "Links/0.9.1 (Linux 2.4.24; i386;)",
123 | "Links (2.1pre15; Linux 2.4.26 i686; 158x61)",
124 | "Links (2.3pre1; Linux 2.6.38-8-generic x86_64; 170x48)",
125 | "Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/0.8.12",
126 | "w3m/0.5.1",
127 | "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.4 (KHTML, like Gecko) Chrome/4.0.237.0 Safari/532.4 Debian",
128 | "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.277.0 Safari/532.8",
129 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.309.0 Safari/532.9",
130 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
131 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/540.0 (KHTML, like Gecko) Ubuntu/10.10 Chrome/9.1.0.0 Safari/540.0",
132 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.15 (KHTML, like Gecko) Chrome/10.0.613.0 Safari/534.15",
133 | "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.15 (KHTML, like Gecko) Ubuntu/10.10 Chromium/10.0.613.0 Chrome/10.0.613.0 Safari/534.15",
134 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.24 (KHTML, like Gecko) Ubuntu/10.10 Chromium/12.0.703.0 Chrome/12.0.703.0 Safari/534.24",
135 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.20 Safari/535.1",
136 | "Mozilla/5.0 Slackware/13.37 (X11; U; Linux x86_64; en-US) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41",
137 | "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/11.04 Chromium/14.0.825.0 Chrome/14.0.825.0 Safari/535.1",
138 | "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Ubuntu/11.10 Chromium/15.0.874.120 Chrome/15.0.874.120 Safari/535.2",
139 | "Mozilla/5.0 (X11; U; Linux; i686; en-US; rv:1.6) Gecko Epiphany/1.2.5",
140 | "Mozilla/5.0 (X11; U; Linux i586; en-US; rv:1.7.3) Gecko/20040924 Epiphany/1.4.4 (Ubuntu)",
141 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.6) Gecko/20040614 Firefox/0.8",
142 | "Mozilla/5.0 (X11; U; Linux x86_64; sv-SE; rv:1.8.1.12) Gecko/20080207 Ubuntu/7.10 (gutsy) Firefox/2.0.0.12",
143 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.11) Gecko/2009060309 Ubuntu/9.10 (karmic) Firefox/3.0.11",
144 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.2) Gecko/20090803 Ubuntu/9.04 (jaunty) Shiretoko/3.5.2",
145 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.5) Gecko/20091107 Firefox/3.5.5",
146 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.3) Gecko/20091020 Linux Mint/8 (Helena) Firefox/3.5.3",
147 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.9) Gecko/20100915 Gentoo Firefox/3.6.9",
148 | "Mozilla/5.0 (X11; U; Linux i686; pl-PL; rv:1.9.0.2) Gecko/20121223 Ubuntu/9.25 (jaunty) Firefox/3.8",
149 | "Mozilla/5.0 (X11; Linux i686; rv:2.0b6pre) Gecko/20100907 Firefox/4.0b6pre",
150 | "Mozilla/5.0 (X11; Linux i686 on x86_64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
151 | "Mozilla/5.0 (X11; Linux i686; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
152 | "Mozilla/5.0 (X11; Linux x86_64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
153 | "Mozilla/5.0 (X11; Linux x86_64; rv:2.2a1pre) Gecko/20100101 Firefox/4.2a1pre",
154 | "Mozilla/5.0 (X11; Linux i686; rv:5.0) Gecko/20100101 Firefox/5.0",
155 | "Mozilla/5.0 (X11; Linux i686; rv:6.0) Gecko/20100101 Firefox/6.0",
156 | "Mozilla/5.0 (X11; Linux x86_64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1",
157 | "Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0",
158 | "Mozilla/5.0 (X11; Linux x86_64; rv:10.0.1) Gecko/20100101 Firefox/10.0.1",
159 | "Mozilla/5.0 (X11; U; Linux; i686; en-US; rv:1.6) Gecko Galeon/1.3.14",
160 | "Mozilla/5.0 (X11; U; Linux ppc; en-US; rv:1.8.1.13) Gecko/20080313 Iceape/1.1.9 (Debian-1.1.9-5)",
161 | "Mozilla/5.0 (X11; U; Linux i686; pt-PT; rv:1.9.2.3) Gecko/20100402 Iceweasel/3.6.3 (like Firefox/3.6.3) GTB7.0",
162 | "Mozilla/5.0 (X11; Linux x86_64; rv:5.0) Gecko/20100101 Firefox/5.0 Iceweasel/5.0",
163 | "Mozilla/5.0 (X11; Linux i686; rv:6.0a2) Gecko/20110615 Firefox/6.0a2 Iceweasel/6.0a2",
164 | "Konqueror/3.0-rc4; (Konqueror/3.0-rc4; i686 Linux;;datecode)",
165 | "Mozilla/5.0 (compatible; Konqueror/3.3; Linux 2.6.8-gentoo-r3; X11;",
166 | "Mozilla/5.0 (compatible; Konqueror/3.5; Linux 2.6.30-7.dmz.1-liquorix-686; X11) KHTML/3.5.10 (like Gecko) (Debian package 4:3.5.10.dfsg.1-1 b1)",
167 | "Mozilla/5.0 (compatible; Konqueror/3.5; Linux; en_US) KHTML/3.5.6 (like Gecko) (Kubuntu)",
168 | "Mozilla/5.0 (X11; Linux x86_64; en-US; rv:2.0b2pre) Gecko/20100712 Minefield/4.0b2pre",
169 | "Mozilla/5.0 (X11; U; Linux; i686; en-US; rv:1.6) Gecko Debian/1.6-7",
170 | "MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23",
171 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1) Gecko/20061024 Firefox/2.0 (Swiftfox)",
172 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527 (KHTML, like Gecko, Safari/419.3) Arora/0.10.1",
173 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
174 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
175 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.4 (KHTML like Gecko) Chrome/22.0.1229.56 Safari/537.4",
176 | "Mozilla/4.0 (compatible; Dillo 3.0)",
177 | "Mozilla/5.0 (X11; U; Linux i686; en-us) AppleWebKit/528.5 (KHTML, like Gecko, Safari/528.5 ) lt-GtkLauncher",
178 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.16) Gecko/20120421 Gecko Firefox/11.0",
179 | "Mozilla/5.0 (X11; Linux i686; rv:12.0) Gecko/20100101 Firefox/12.0 ",
180 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:14.0) Gecko/20100101 Firefox/14.0.1",
181 | "Mozilla/5.0 (X11; Linux i686; rv:16.0) Gecko/20100101 Firefox/16.0",
182 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Galeon/2.0.6 (Ubuntu 2.0.6-2)",
183 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.16) Gecko/20080716 (Gentoo) Galeon/2.0.6",
184 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.13) Gecko/20100916 Iceape/2.0.8",
185 | "Mozilla/5.0 (X11; Linux i686; rv:14.0) Gecko/20100101 Firefox/14.0.1 Iceweasel/14.0.1",
186 | "Mozilla/5.0 (X11; Linux x86_64; rv:15.0) Gecko/20120724 Debian Iceweasel/15.02",
187 | "Mozilla/5.0 (compatible; Konqueror/4.2; Linux) KHTML/4.2.4 (like Gecko) Slackware/13.0",
188 | "Mozilla/5.0 (compatible; Konqueror/4.3; Linux) KHTML/4.3.1 (like Gecko) Fedora/4.3.1-3.fc11",
189 | "Mozilla/5.0 (compatible; Konqueror/4.4; Linux) KHTML/4.4.1 (like Gecko) Fedora/4.4.1-1.fc12",
190 | "Mozilla/5.0 (compatible; Konqueror/4.4; Linux 2.6.32-22-generic; X11; en_US) KHTML/4.4.3 (like Gecko) Kubuntu",
191 | "Midori/0.1.10 (X11; Linux i686; U; en-us) WebKit/(531).(2) ",
192 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.0.3) Gecko/2008092814 (Debian-3.0.1-1)",
193 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9a3pre) Gecko/20070330",
194 | "Opera/9.64 (X11; Linux i686; U; Linux Mint; nb) Presto/2.1.1",
195 | "Opera/9.80 (X11; Linux i686; U; en) Presto/2.2.15 Version/10.10",
196 | "Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00",
197 | "Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.34 (KHTML, like Gecko) QupZilla/1.2.0 Safari/534.34",
198 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.17) Gecko/20110123 SeaMonkey/2.0.12",
199 | "Mozilla/5.0 (X11; Linux i686; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
200 | "Mozilla/5.0 (X11; U; Linux x86_64; us; rv:1.9.1.19) Gecko/20110430 shadowfox/7.0 (like Firefox/7.0",
201 | "Mozilla/5.0 (X11; U; Linux i686; it; rv:1.9.2.3) Gecko/20100406 Firefox/3.6.3 (Swiftfox)",
202 | "Uzbl (Webkit 1.3) (Linux i686 [i686])",
203 | "ELinks (0.4.3; NetBSD 3.0.2PATCH sparc64; 141x19)",
204 | "Links (2.1pre15; FreeBSD 5.3-RELEASE i386; 196x84)",
205 | "Lynx/2.8.7dev.4 libwww-FM/2.14 SSL-MM/1.4.1 OpenSSL/0.9.8d",
206 | "w3m/0.5.1",
207 | "Mozilla/5.0 (X11; U; FreeBSD i386; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.207.0 Safari/532.0",
208 | "Mozilla/5.0 (X11; U; OpenBSD i386; en-US) AppleWebKit/533.3 (KHTML, like Gecko) Chrome/5.0.359.0 Safari/533.3",
209 | "Mozilla/5.0 (X11; U; FreeBSD x86_64; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.204 Safari/534.16",
210 | "Mozilla/5.0 (X11; U; SunOS sun4m; en-US; rv:1.4b) Gecko/20030517 Mozilla Firebird/0.6",
211 | "Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.6) Gecko/20040406 Galeon/1.3.15",
212 | "Mozilla/5.0 (compatible; Konqueror/3.5; NetBSD 4.0_RC3; X11) KHTML/3.5.7 (like Gecko)",
213 | "Mozilla/5.0 (compatible; Konqueror/3.5; SunOS) KHTML/3.5.1 (like Gecko)",
214 | "Mozilla/5.0 (X11; U; FreeBSD; i386; en-US; rv:1.7) Gecko",
215 | "Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)",
216 | "Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)",
217 | "Mozilla/5.0 (Unknown; U; UNIX BSD/SYSV system; C -) AppleWebKit/527 (KHTML, like Gecko, Safari/419.3) Arora/0.10.2",
218 | "Mozilla/5.0 (X11; FreeBSD amd64) AppleWebKit/536.5 (KHTML like Gecko) Chrome/19.0.1084.56 Safari/536.5",
219 | "Mozilla/5.0 (X11; FreeBSD amd64) AppleWebKit/537.4 (KHTML like Gecko) Chrome/22.0.1229.79 Safari/537.4",
220 | "Mozilla/5.0 (X11; U; OpenBSD arm; en-us) AppleWebKit/531.2 (KHTML, like Gecko) Safari/531.2 Epiphany/2.30.0",
221 | "Mozilla/5.0 (X11; U; FreeBSD amd64; en-us) AppleWebKit/531.2 (KHTML, like Gecko) Safari/531.2 Epiphany/2.30.0",
222 | "Mozilla/5.0 (X11; U; SunOS i86pc; en-US; rv:1.9.1b3) Gecko/20090429 Firefox/3.1b3",
223 | "Mozilla/5.0 (X11; U; OpenBSD i386; en-US; rv:1.9.1) Gecko/20090702 Firefox/3.5",
224 | "Mozilla/5.0 (X11; U; FreeBSD i386; de-CH; rv:1.9.2.8) Gecko/20100729 Firefox/3.6.8",
225 | "Mozilla/5.0 (X11; FreeBSD amd64; rv:5.0) Gecko/20100101 Firefox/5.0",
226 | "Mozilla/5.0 (compatible; Konqueror/4.1; DragonFly) KHTML/4.1.4 (like Gecko)",
227 | "Mozilla/5.0 (compatible; Konqueror/4.1; OpenBSD) KHTML/4.1.4 (like Gecko)",
228 | "Mozilla/5.0 (compatible; Konqueror/4.5; NetBSD 5.0.2; X11; amd64; en_US) KHTML/4.5.4 (like Gecko)",
229 | "Mozilla/5.0 (compatible; Konqueror/4.5; FreeBSD) KHTML/4.5.4 (like Gecko)",
230 | "Mozilla/5.0 (X11; U; NetBSD amd64; en-US; rv:1.9.2.15) Gecko/20110308 Namoroka/3.6.15",
231 | "NetSurf/1.2 (NetBSD; amd64)",
232 | "Opera/9.80 (X11; FreeBSD 8.1-RELEASE i386; Edition Next) Presto/2.12.388 Version/12.10",
233 | "Mozilla/5.0 (X11; U; SunOS i86pc; en-US; rv:1.8.1.12) Gecko/20080303 SeaMonkey/1.1.8",
234 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; BOLT/2.800) AppleWebKit/534.6 (KHTML, like Gecko) Version/5.0 Safari/534.6.3",
235 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows CE; IEMobile 6.12; Microsoft ZuneHD 4.3)",
236 | "Mozilla/1.22 (compatible; MSIE 5.01; PalmOS 3.0) EudoraWeb 2.1",
237 | "Mozilla/5.0 (WindowsCE 6.0; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
238 | "Mozilla/5.0 (X11; U; Linux armv61; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1",
239 | "Mozilla/5.0 (Maemo; Linux armv7l; rv:2.0.1) Gecko/20100101 Firefox/4.0.1 Fennec/2.0.1",
240 | "Mozilla/5.0 (Maemo; Linux armv7l; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 Fennec/10.0.1",
241 | "Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016",
242 | "Mozilla/5.0 (X11; U; Linux armv6l; rv 1.8.1.5pre) Gecko/20070619 Minimo/0.020",
243 | "Mozilla/5.0 (X11; U; Linux arm7tdmi; rv:1.8.1.11) Gecko/20071130 Minimo/0.025",
244 | "Mozilla/4.0 (PDA; PalmOS/sony/model prmr/Revision:1.1.54 (en)) NetFront/3.0",
245 | "Opera/9.51 Beta (Microsoft Windows; PPC; Opera Mobi/1718; U; en)",
246 | "Opera/9.60 (J2ME/MIDP; Opera Mini/4.1.11320/608; U; en) Presto/2.2.0",
247 | "Opera/9.60 (J2ME/MIDP; Opera Mini/4.2.14320/554; U; cs) Presto/2.2.0",
248 | "Opera/9.80 (S60; SymbOS; Opera Mobi/499; U; ru) Presto/2.4.18 Version/10.00",
249 | "Opera/10.61 (J2ME/MIDP; Opera Mini/5.1.21219/19.999; en-US; rv:1.9.3a5) WebKit/534.5 Presto/2.6.30",
250 | "POLARIS/6.01 (BREW 3.1.5; U; en-us; LG; LX265; POLARIS/6.01/WAP) MMP/2.0 profile/MIDP-2.1 Configuration/CLDC-1.1",
251 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
252 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; ja-jp) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
253 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_2_1 like Mac OS X; da-dk) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
254 | "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25",
255 | "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
256 | "Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10",
257 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; ja-jp) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
258 | "Mozilla/5.0 (iPad; U; CPU OS 4_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8F190 Safari/6533.18.5",
259 | "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25",
260 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_7;en-us) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Safari/530.17",
261 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
262 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
263 | "Mozilla/5.0 (Linux; U; Android 1.5; de-de; Galaxy Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
264 | "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
265 | "Mozilla/5.0 (Linux; U; Android 3.0.1; en-us; GT-P7100 Build/HRI83) AppleWebkit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
266 | "Mozilla/4.0 (compatible; Linux 2.6.22) NetFront/3.4 Kindle/2.0 (screen 600x800)",
267 | "Mozilla/5.0 (Linux U; en-US) AppleWebKit/528.5 (KHTML, like Gecko, Safari/528.5 ) Version/4.0 Kindle/3.0 (screen 600x800; rotate)",
268 | "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
269 | "Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10",
270 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; ja-jp) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
271 | "Mozilla/5.0 (iPad; U; CPU OS 4_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8F190 Safari/6533.18.5",
272 | "Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420 (KHTML, like Gecko) Version/3.0 Mobile/1A543a Safari/419.3",
273 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 2_0 like Mac OS X; en-us) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5A347 Safari/525.200",
274 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16",
275 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_0 like Mac OS X; en-us) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A293 Safari/531.22.7",
276 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_2_1 like Mac OS X; da-dk) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
277 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 5_1_1 like Mac OS X; da-dk) AppleWebKit/534.46.0 (KHTML, like Gecko) CriOS/19.0.1084.60 Mobile/9B206 Safari/7534.48.3",
278 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 2_2_1 like Mac OS X; en-us) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5H11a Safari/525.20",
279 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 3_1_1 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Mobile/7C145",
280 | "nook browser/1.0",
281 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_7;en-us) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Safari/530.17",
282 | "Mozilla/5.0 (Linux; U; Android 2.3.4; en-us; BNTV250 Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Safari/533.1",
283 | "BlackBerry7100i/4.1.0 Profile/MIDP-2.0 Configuration/CLDC-1.1 VendorID/103",
284 | "BlackBerry8300/4.2.2 Profile/MIDP-2.0 Configuration/CLDC-1.1 VendorID/107 UP.Link/6.2.3.15.0",
285 | "BlackBerry8320/4.2.2 Profile/MIDP-2.0 Configuration/CLDC-1.1 VendorID/100",
286 | "BlackBerry8330/4.3.0 Profile/MIDP-2.0 Configuration/CLDC-1.1 VendorID/105",
287 | "BlackBerry9000/4.6.0.167 Profile/MIDP-2.0 Configuration/CLDC-1.1 VendorID/102",
288 | "BlackBerry9530/4.7.0.167 Profile/MIDP-2.0 Configuration/CLDC-1.1 VendorID/102 UP.Link/6.3.1.20.0",
289 | "BlackBerry9700/5.0.0.351 Profile/MIDP-2.1 Configuration/CLDC-1.1 VendorID/123",
290 | "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1 (KHTML, Like Gecko) Version/6.0.0.141 Mobile Safari/534.1",
291 | "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
292 | "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
293 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
294 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
295 | "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
296 | "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
297 | "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
298 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
299 | "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
300 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
301 | "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
302 | "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3",
303 | "Mozilla/5.0 (Linux; U; Android 4.0.3; de-ch; HTC Sensation Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
304 | "HTC-ST7377/1.59.502.3 (67150) Opera/9.50 (Windows NT 5.1; U; en) UP.Link/6.3.1.17.0",
305 | "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
306 | "LG-LX550 AU-MIC-LX550/2.0 MMP/2.0 Profile/MIDP-2.0 Configuration/CLDC-1.1",
307 | "POLARIS/6.01(BREW 3.1.5;U;en-us;LG;LX265;POLARIS/6.01/WAP;)MMP/2.0 profile/MIDP-201 Configuration /CLDC-1.1",
308 | "LG-GC900/V10a Obigo/WAP2.0 Profile/MIDP-2.1 Configuration/CLDC-1.1",
309 | "Mozilla/4.0 (compatible; MSIE 4.01; Windows CE; PPC; MDA Pro/1.0 Profile/MIDP-2.0 Configuration/CLDC-1.1)",
310 | "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
311 | "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1",
312 | "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
313 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
314 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
315 | "MOT-L7v/08.B7.5DR MIB/2.2.1 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Link/6.3.0.0.0",
316 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
317 | "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
318 | "MOT-V9mm/00.62 UP.Browser/6.2.3.4.c.1.123 (GUI) MMP/2.0",
319 | "MOTORIZR-Z8/46.00.00 Mozilla/4.0 (compatible; MSIE 6.0; Symbian OS; 356) Opera 8.65 [it] UP.Link/6.3.0.0.0",
320 | "MOT-V177/0.1.75 UP.Browser/6.2.3.9.c.12 (GUI) MMP/2.0 UP.Link/6.3.1.13.0",
321 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
322 | "portalmmm/2.0 N410i(c20;TB) ",
323 | "Nokia3230/2.0 (5.0614.0) SymbianOS/7.0s Series60/2.1 Profile/MIDP-2.0 Configuration/CLDC-1.0",
324 | "Mozilla/5.0 (SymbianOS/9.2; U; Series60/3.1 Nokia5700/3.27; Profile/MIDP-2.0 Configuration/CLDC-1.1) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
325 | "Mozilla/5.0 (SymbianOS/9.2; U; Series60/3.1 Nokia6120c/3.70; Profile/MIDP-2.0 Configuration/CLDC-1.1) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
326 | "Nokia6230/2.0 (04.44) Profile/MIDP-2.0 Configuration/CLDC-1.1",
327 | "Nokia6230i/2.0 (03.80) Profile/MIDP-2.0 Configuration/CLDC-1.1",
328 | "Mozilla/4.1 (compatible; MSIE 5.0; Symbian OS; Nokia 6600;452) Opera 6.20 [en-US]",
329 | "Nokia6630/1.0 (2.39.15) SymbianOS/8.0 Series60/2.6 Profile/MIDP-2.0 Configuration/CLDC-1.1",
330 | "Nokia7250/1.0 (3.14) Profile/MIDP-1.0 Configuration/CLDC-1.0",
331 | "Mozilla/4.0 (compatible; MSIE 5.0; Series80/2.0 Nokia9500/4.51 Profile/MIDP-2.0 Configuration/CLDC-1.1)",
332 | "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaC6-01/011.010; Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/525 (KHTML, like Gecko) Version/3.0 BrowserNG/7.2.7.2 3gpp-gba",
333 | "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaC7-00/012.003; Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/525 (KHTML, like Gecko) Version/3.0 BrowserNG/7.2.7.3 3gpp-gba",
334 | "Mozilla/5.0 (SymbianOS/9.1; U; en-us) AppleWebKit/413 (KHTML, like Gecko) Safari/413 es50",
335 | "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaE6-00/021.002; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/533.4 (KHTML, like Gecko) NokiaBrowser/7.3.1.16 Mobile Safari/533.4 3gpp-gba",
336 | "Mozilla/5.0 (SymbianOS/9.1; U; en-us) AppleWebKit/413 (KHTML, like Gecko) Safari/413 es65",
337 | "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaE7-00/010.016; Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/525 (KHTML, like Gecko) Version/3.0 BrowserNG/7.2.7.3 3gpp-gba",
338 | "Mozilla/5.0 (SymbianOS/9.1; U; en-us) AppleWebKit/413 (KHTML, like Gecko) Safari/413 es70",
339 | "Mozilla/5.0 (SymbianOS/9.2; U; Series60/3.1 NokiaE90-1/07.24.0.3; Profile/MIDP-2.0 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413 UP.Link/6.2.3.18.0",
340 | "NokiaN70-1/5.0609.2.0.1 Series60/2.8 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Link/6.3.1.13.0",
341 | "Mozilla/5.0 (SymbianOS/9.1; U; en-us) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
342 | "NokiaN73-1/3.0649.0.0.1 Series60/3.0 Profile/MIDP2.0 Configuration/CLDC-1.1",
343 | "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaN8-00/014.002; Profile/MIDP-2.1 Configuration/CLDC-1.1; en-us) AppleWebKit/525 (KHTML, like Gecko) Version/3.0 BrowserNG/7.2.6.4 3gpp-gba",
344 | "Mozilla/5.0 (SymbianOS/9.1; U; en-us) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
345 | "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13",
346 | "Mozilla/5.0 (SymbianOS/9.1; U; de) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
347 | "Mozilla/5.0 (SymbianOS/9.2; U; Series60/3.1 NokiaN95/10.0.018; Profile/MIDP-2.0 Configuration/CLDC-1.1) AppleWebKit/413 (KHTML, like Gecko) Safari/413 UP.Link/6.3.0.0.0",
348 | "Mozilla/5.0 (MeeGo; NokiaN950-00/00) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13",
349 | "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/10.0.012; Profile/MIDP-2.1 Configuration/CLDC-1.1; en-us) AppleWebKit/525 (KHTML, like Gecko) WicKed/7.1.12344",
350 | "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaX7-00/021.004; Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/533.4 (KHTML, like Gecko) NokiaBrowser/7.3.1.21 Mobile Safari/533.4 3gpp-gba",
351 | "Mozilla/5.0 (webOS/1.3; U; en-US) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/1.0 Safari/525.27.1 Desktop/1.0",
352 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; PalmSource/hspr-H102; Blazer/4.0) 16;320x320",
353 | "SEC-SGHE900/1.0 NetFront/3.2 Profile/MIDP-2.0 Configuration/CLDC-1.1 Opera/8.01 (J2ME/MIDP; Opera Mini/2.0.4509/1378; nl; U; ssr)",
354 | "Mozilla/5.0 (Linux; U; Android 1.5; de-de; Galaxy Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
355 | "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
356 | "Mozilla/5.0 (Linux; U; Android 4.0.3; de-de; Galaxy S II Build/GRJ22) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
357 | "Mozilla/5.0 (Linux; U; Android 3.0.1; en-us; GT-P7100 Build/HRI83) AppleWebkit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
358 | "SAMSUNG-S8000/S8000XXIF3 SHP/VPP/R5 Jasmine/1.0 Nextreaming SMM-MMS/1.2.0 profile/MIDP-2.1 configuration/CLDC-1.1 FirePHP/0.3",
359 | "Mozilla/5.0 (Linux; U; Android 1.5; en-us; SPH-M900 Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
360 | "SAMSUNG-SGH-A867/A867UCHJ3 SHP/VPP/R5 NetFront/35 SMM-MMS/1.2.0 profile/MIDP-2.0 configuration/CLDC-1.1 UP.Link/6.3.0.0.0",
361 | "SEC-SGHX210/1.0 UP.Link/6.3.1.13.0",
362 | "Mozilla/5.0 (Linux; U; Android 1.5; fr-fr; GT-I5700 Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
363 | "SEC-SGHX820/1.0 NetFront/3.2 Profile/MIDP-2.0 Configuration/CLDC-1.1",
364 | "SonyEricssonK310iv/R4DA Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Link/6.3.1.13.0",
365 | "SonyEricssonK550i/R1JD Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1",
366 | "SonyEricssonK610i/R1CB Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1",
367 | "SonyEricssonK750i/R1CA Browser/SEMC-Browser/4.2 Profile/MIDP-2.0 Configuration/CLDC-1.1",
368 | "Opera/9.80 (J2ME/MIDP; Opera Mini/5.0.16823/1428; U; en) Presto/2.2.0",
369 | "SonyEricssonK800i/R1CB Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Link/6.3.0.0.0",
370 | "SonyEricssonK810i/R1KG Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1",
371 | "Opera/8.01 (J2ME/MIDP; Opera Mini/1.0.1479/HiFi; SonyEricsson P900; no; U; ssr)",
372 | "SonyEricssonS500i/R6BC Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1",
373 | "Mozilla/5.0 (SymbianOS/9.4; U; Series60/5.0 SonyEricssonP100/01; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) Version/3.0 Safari/525",
374 | "SonyEricssonT68/R201A",
375 | "SonyEricssonT100/R101",
376 | "SonyEricssonT610/R201 Profile/MIDP-1.0 Configuration/CLDC-1.0",
377 | "SonyEricssonT650i/R7AA Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1",
378 | "SonyEricssonW580i/R6BC Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1",
379 | "SonyEricssonW660i/R6AD Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1",
380 | "SonyEricssonW810i/R4EA Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Link/6.3.0.0.0",
381 | "SonyEricssonW850i/R1ED Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1",
382 | "SonyEricssonW950i/R100 Mozilla/4.0 (compatible; MSIE 6.0; Symbian OS; 323) Opera 8.60 [en-US]",
383 | "SonyEricssonW995/R1EA Profile/MIDP-2.1 Configuration/CLDC-1.1 UNTRUSTED/1.0",
384 | "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
385 | "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
386 | "Opera/9.5 (Microsoft Windows; PPC; Opera Mobi; U) SonyEricssonX1i/R2AA Profile/MIDP-2.0 Configuration/CLDC-1.1",
387 | "SonyEricssonZ800/R1Y Browser/SEMC-Browser/4.1 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Link/6.3.0.0.0",
388 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; winfx; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Zune 2.0) ",
389 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows CE; IEMobile 6.12; Microsoft ZuneHD 4.3)",
390 | "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3",
391 | "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
392 | "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
393 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
394 | "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
395 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
396 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
397 | "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
398 | "Mozilla/5.0 (Android; Linux armv7l; rv:2.0.1) Gecko/20100101 Firefox/4.0.1 Fennec/2.0.1",
399 | "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
400 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
401 | "Mozilla/5.0 (Linux; U; Android 4.0.3; de-ch; HTC Sensation Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
402 | "Mozilla/5.0 (Linux; U; Android 4.0.3; de-de; Galaxy S II Build/GRJ22) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
403 | "Opera/9.80 (Android 4.0.4; Linux; Opera Mobi/ADR-1205181138; U; pl) Presto/2.10.254 Version/12.00",
404 | "Mozilla/5.0 (Android; Linux armv7l; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 Fennec/10.0.1",
405 | "Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420 (KHTML, like Gecko) Version/3.0 Mobile/1A543a Safari/419.3",
406 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 2_0 like Mac OS X; en-us) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5A347 Safari/525.200",
407 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 2_2_1 like Mac OS X; en-us) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5H11a Safari/525.20",
408 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16",
409 | "Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10",
410 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; ja-jp) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
411 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_2_1 like Mac OS X; da-dk) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
412 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3 like Mac OS X; de-de) AppleWebKit/533.17.9 (KHTML, like Gecko) Mobile/8F190",
413 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 5_1_1 like Mac OS X; da-dk) AppleWebKit/534.46.0 (KHTML, like Gecko) CriOS/19.0.1084.60 Mobile/9B206 Safari/7534.48.3",
414 | "Mozilla/5.0 (X11; Linux i686 on x86_64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1 Fennec/2.0.1",
415 | "Mozilla/5.0 (Maemo; Linux armv7l; rv:2.0.1) Gecko/20100101 Firefox/4.0.1 Fennec/2.0.1",
416 | "Mozilla/5.0 (webOS/1.3; U; en-US) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/1.0 Safari/525.27.1 Desktop/1.0",
417 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; PalmSource/hspr-H102; Blazer/4.0) 16;320x320",
418 | "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaN8-00/014.002; Profile/MIDP-2.1 Configuration/CLDC-1.1; en-us) AppleWebKit/525 (KHTML, like Gecko) Version/3.0 BrowserNG/7.2.6.4 3gpp-gba",
419 | "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaX7-00/021.004; Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/533.4 (KHTML, like Gecko) NokiaBrowser/7.3.1.21 Mobile Safari/533.4 3gpp-gba",
420 | "Mozilla/5.0 (SymbianOS/9.2; U; Series60/3.1 NokiaE90-1/07.24.0.3; Profile/MIDP-2.0 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413 UP.Link/6.2.3.18.0",
421 | "Mozilla/5.0 (SymbianOS 9.4; Series60/5.0 NokiaN97-1/10.0.012; Profile/MIDP-2.1 Configuration/CLDC-1.1; en-us) AppleWebKit/525 (KHTML, like Gecko) WicKed/7.1.12344",
422 | "Opera/9.80 (S60; SymbOS; Opera Mobi/499; U; ru) Presto/2.4.18 Version/10.00",
423 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows CE; IEMobile 6.12; Microsoft ZuneHD 4.3)",
424 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows Phone OS 7.0; Trident/3.1; IEMobile/7.0) Asus;Galaxy6",
425 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0)",
426 | "DoCoMo/2.0 SH901iC(c100;TB;W24H12)",
427 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.7) Gecko/20060909 Firefox/1.5.0.7 MG(Novarra-Vision/6.9)",
428 | "Mozilla/4.0 (compatible; MSIE 6.0; j2me) ReqwirelessWeb/3.5",
429 | "Vodafone/1.0/V802SE/SEJ001 Browser/SEMC-Browser/4.1",
430 | "BlackBerry7520/4.0.0 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Browser/5.0.3.3 UP.Link/5.1.2.12 (Google WAP Proxy/1.0)",
431 | "Nokia6100/1.0 (04.01) Profile/MIDP-1.0 Configuration/CLDC-1.0",
432 | "Nokia6630/1.0 (2.3.129) SymbianOS/8.0 Series60/2.6 Profile/MIDP-2.0 Configuration/CLDC-1.1",
433 | "Mozilla/2.0 (compatible; Ask Jeeves/Teoma)",
434 | "Baiduspider ( http://www.baidu.com/search/spider.htm)",
435 | "Mozilla/5.0 (compatible; bingbot/2.0 http://www.bing.com/bingbot.htm)",
436 | "Mozilla/5.0 (compatible; Exabot/3.0; http://www.exabot.com/go/robot) ",
437 | "FAST-WebCrawler/3.8 (crawler at trd dot overture dot com; http://www.alltheweb.com/help/webmaster/crawler)",
438 | "AdsBot-Google ( http://www.google.com/adsbot.html)",
439 | "Mozilla/5.0 (compatible; Googlebot/2.1; http://www.google.com/bot.html)",
440 | "Googlebot/2.1 ( http://www.googlebot.com/bot.html)",
441 | "Googlebot-Image/1.0",
442 | "Mediapartners-Google",
443 | "DoCoMo/2.0 N905i(c100;TB;W24H16) (compatible; Googlebot-Mobile/2.1; http://www.google.com/bot.html)",
444 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS) (compatible; Googlebot-Mobile/2.1; http://www.google.com/bot.html)",
445 | "SAMSUNG-SGH-E250/1.0 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Browser/6.2.3.3.c.1.101 (GUI) MMP/2.0 (compatible; Googlebot-Mobile/2.1; http://www.google.com/bot.html)",
446 | "Googlebot-News",
447 | "Googlebot-Video/1.0",
448 | "Mozilla/4.0 (compatible; GoogleToolbar 4.0.1019.5266-big; Windows XP 5.1; MSIE 6.0.2900.2180)",
449 | "Mozilla/5.0 (en-us) AppleWebKit/525.13 (KHTML, like Gecko; Google Web Preview) Version/3.1 Safari/525.13",
450 | "msnbot/1.0 ( http://search.msn.com/msnbot.htm)",
451 | "msnbot/1.1 ( http://search.msn.com/msnbot.htm)",
452 | "msnbot/0.11 ( http://search.msn.com/msnbot.htm)",
453 | "msnbot-media/1.1 ( http://search.msn.com/msnbot.htm)",
454 | "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
455 | "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
456 | "EmailWolf 1.00",
457 | "Gaisbot/3.0 (robot@gais.cs.ccu.edu.tw; http://gais.cs.ccu.edu.tw/robot.php)",
458 | "grub-client-1.5.3; (grub-client-1.5.3; Crawl your own stuff with http://grub.org)",
459 | "Gulper Web Bot 0.2.4 (www.ecsl.cs.sunysb.edu/~maxim/cgi-bin/Link/GulperBot)",
460 | "Mozilla/3.0 (compatible; NetPositive/2.1.1; BeOS)",
461 | "Mozilla/5.0 (BeOS; U; BeOS BePC; en-US; rv:1.9a1) Gecko/20060702 SeaMonkey/1.5a",
462 | "Download Demon/3.5.0.11",
463 | "Offline Explorer/2.5",
464 | "SuperBot/4.4.0.60 (Windows XP)",
465 | "WebCopier v4.6",
466 | "Web Downloader/6.9",
467 | "WebZIP/3.5 (http://www.spidersoft.com)",
468 | "Wget/1.9 cvs-stable (Red Hat modified)",
469 | "Wget/1.9.1",
470 | "Bloglines/3.1 (http://www.bloglines.com)",
471 | "everyfeed-spider/2.0 (http://www.everyfeed.com)",
472 | "FeedFetcher-Google; ( http://www.google.com/feedfetcher.html)",
473 | "Gregarius/0.5.2 ( http://devlog.gregarius.net/docs/ua)",
474 | "Mozilla/5.0 (PLAYSTATION 3; 2.00)",
475 | "Mozilla/5.0 (PLAYSTATION 3; 1.10)",
476 | "Mozilla/4.0 (PSP (PlayStation Portable); 2.00)",
477 | "Opera/9.30 (Nintendo Wii; U; ; 2047-7; en)",
478 | "wii libnup/1.0",
479 | "Java/1.6.0_13",
480 | "libwww-perl/5.820",
481 | "Peach/1.01 (Ubuntu 8.04 LTS; U; en)",
482 | "Python-urllib/2.5",
483 | "HTMLParser/1.6",
484 | "Jigsaw/2.2.5 W3C_CSS_Validator_JFouffa/2.0",
485 | "W3C_Validator/1.654",
486 | "W3C_Validator/1.305.2.12 libwww-perl/5.64",
487 | "P3P Validator",
488 | "CSSCheck/1.2.2",
489 | "WDG_Validator/1.6.2",
490 | "facebookscraper/1.0( http://www.facebook.com/sharescraper_help.php)",
491 | "grub-client-1.5.3; (grub-client-1.5.3; Crawl your own stuff with http://grub.org)",
492 | "iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)",
493 | "Microsoft URL Control - 6.00.8862",
494 | "SearchExpress",
495 | ]
496 | # Pool for requests that should identify as Google's image crawler.
497 | AGENT_GOOGLE_IMAGE = ["Googlebot-Image/1.0"]
498 | # Alternative default kept for reference:
499 | # AGENTS = ["Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"]
500 | # Default pool: a single, stable desktop Firefox identity; widen this list
501 | # to rotate identities per request.
502 | AGENTS = ["Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0"]
503 |
--------------------------------------------------------------------------------
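The pools above are typically consumed by a per-request user-agent rotator. Below is a minimal sketch of such a Scrapy downloader middleware; the import path follows this repository's layout (msic/common/agents.py), but the class name and the settings wiring are illustrative assumptions, not necessarily what msic/scrapy/middlewares.py actually defines.

import random

# Assumed import path, inferred from the repository tree; adjust if the
# module lives elsewhere on your PYTHONPATH.
from msic.common.agents import AGENTS


class RandomUserAgentMiddleware(object):
    """Illustrative middleware: stamp a random User-Agent onto each request."""

    def process_request(self, request, spider):
        # setdefault preserves any User-Agent a spider has set explicitly.
        request.headers.setdefault('User-Agent', random.choice(AGENTS))

To take effect, the middleware is registered in the project's settings.py while the built-in one is disabled (the built-in path below assumes Scrapy >= 1.0):

DOWNLOADER_MIDDLEWARES = {
    'msic.scrapy.middlewares.RandomUserAgentMiddleware': 400,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}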