├── tubatu
│   ├── __init__.py
│   ├── tubatu
│   │   ├── __init__.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   ├── design_topic_spider.py
│   │   │   └── design_picture_spider.py
│   │   ├── model
│   │   │   ├── design_topic.py
│   │   │   └── design_picture.py
│   │   ├── reset.py
│   │   ├── config.py
│   │   ├── middlewares.py
│   │   ├── items.py
│   │   ├── constants.py
│   │   ├── service
│   │   │   ├── design_service.py
│   │   │   ├── design_topic_service.py
│   │   │   ├── image_service.py
│   │   │   └── design_picture_service.py
│   │   ├── pipelines.py
│   │   └── settings.py
│   ├── run.bat
│   ├── scrapy.cfg
│   └── run.py
├── guju
│   ├── guju
│   │   ├── __init__.py
│   │   ├── model
│   │   │   ├── __init__.py
│   │   │   └── design_picture.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   └── design_strategy_spider.py
│   │   ├── pipelines.py
│   │   ├── items.py
│   │   ├── config.py
│   │   ├── middlewares.py
│   │   ├── constants.py
│   │   ├── run.py
│   │   ├── service
│   │   │   └── design_strategy_service.py
│   │   └── settings.py
│   └── scrapy.cfg
├── requirements.txt
├── test
│   ├── test_anything.py
│   ├── test_bloom_filter_service.py
│   ├── test_proxy_pool.py
│   └── test_design_topic_spider.py
├── .idea
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   └── codeStyleSettings.xml
├── setup.py
├── msic
│   ├── common
│   │   ├── constant.py
│   │   ├── utils.py
│   │   ├── log.py
│   │   └── agents.py
│   ├── config.py
│   ├── proxy
│   │   ├── proxy.py
│   │   ├── proxy_pool.py
│   │   └── proxy_strategy.py
│   ├── core
│   │   └── service
│   │       ├── mongodb_service.py
│   │       └── bloom_filter_service.py
│   └── scrapy
│       └── middlewares.py
├── README.md
└── .gitignore
/tubatu/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /guju/guju/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /guju/guju/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tubatu/tubatu/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tubatu/run.bat: -------------------------------------------------------------------------------- 1 | py -3 run.py 2 | pause -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy 2 | selenium 3 | six 4 | pymongo 5 | pillow 6 | requests 7 | schedule 8 | beautifulsoup4 9 | redis 10 | PyDispatcher  # both run.py entry points do "from pydispatch import dispatcher" -------------------------------------------------------------------------------- /test/test_anything.py: -------------------------------------------------------------------------------- 1 | def foo(x, y): 2 | print(x, y) 3 | 4 | 5 | alist = [1, 2] 6 | adict = {'x': 1, 'y': 2} 7 | foo(*alist) # 1, 2 8 | foo(**adict) # 1, 2 9 | -------------------------------------------------------------------------------- /guju/guju/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
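# A minimal spider skeleton, for orientation only (an illustrative sketch, not a file
# in this project; the spider name and URL are hypothetical):
#
#     import scrapy
#
#     class ExampleSpider(scrapy.Spider):
#         name = 'example'
#         start_urls = ['http://example.com']
#
#         def parse(self, response):
#             yield {'title': response.xpath('//title/text()').extract_first()}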
5 | -------------------------------------------------------------------------------- /tubatu/tubatu/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /tubatu/tubatu/model/design_topic.py: -------------------------------------------------------------------------------- 1 | class DesignTopicModel(object): 2 | def __init__(self): 3 | self._id = "" 4 | self.title = "" 5 | self.description = "" 6 | self.html_url = "" 7 | self.article = {} 8 | self.create_time = "" 9 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- /guju/guju/model/design_picture.py: -------------------------------------------------------------------------------- 1 | class DesignStrategyModel(object): 2 | def __init__(self): 3 | self.id = "" 4 | self.category = "" 5 | self.title = "" 6 | self.description = "" 7 | self.html_url = "" 8 | self.content = "" 9 | self.create_time = "" 10 | -------------------------------------------------------------------------------- /guju/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = guju.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = guju 12 | -------------------------------------------------------------------------------- /.idea/codeStyleSettings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 9 | -------------------------------------------------------------------------------- /tubatu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tubatu.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tubatu 12 | 13 | 14 | -------------------------------------------------------------------------------- /guju/guju/pipelines.py: -------------------------------------------------------------------------------- 1 | from guju.service.design_strategy_service import DesignStrategyService 2 | 3 | 4 | class DesignStrategyPipeline(object): 5 | def __init__(self): 6 | self.design_strategy_service = DesignStrategyService() 7 | 8 | def process_item(self, item, spider): 9 | self.design_strategy_service.handle_item(item) 10 | # Scrapy expects process_item to return the item (or raise DropItem); any later pipeline would otherwise receive None 11 | return item 12 | --------------------------------------------------------------------------------
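Note that guju's project-wide ITEM_PIPELINES (in guju/guju/settings.py below) is deliberately left empty; the pipeline above is switched on per spider. For orientation, this is the wiring that design_strategy_spider.py, shown later in this dump, already uses:

    class DesignStrategySpider(CrawlSpider):
        custom_settings = {
            'ITEM_PIPELINES': {
                'guju.pipelines.DesignStrategyPipeline': 302,
            }
        }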
/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name='decoration-design-crawler', 5 | version='', 6 | packages=['msic', 'tubatu', 'tubatu.tubatu', 'tubatu.tubatu.spiders'], 7 | url='', 8 | license='', 9 | author='Flyn', 10 | author_email='', 11 | description='', 12 | requires=['scrapy', 'six', 'selenium'], 13 | ) 14 | -------------------------------------------------------------------------------- /msic/common/constant.py: -------------------------------------------------------------------------------- 1 | PROTOCOL_HTTPS = "https://" 2 | PROTOCOL_HTTP = "http://" 3 | 4 | HEADERS = { 5 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36', 6 | 'Connection': 'keep-alive', 7 | 'Accept-Encoding': 'gzip',  # a request header advertises the encodings the client accepts; Content-Encoding describes a body and belongs on responses 8 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'} 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Companion blog posts (in Chinese): 2 | * [Python Crawler in Practice: Scraping Tubatu with the Scrapy Framework (Part 1)](http://www.jianshu.com/p/5355b467d414) 3 | * [Python Crawler in Practice: Scraping Tubatu with the Scrapy Framework (Part 2)](http://www.jianshu.com/p/95403d6c1305) 4 | * [Python Crawler in Practice: Scraping Tubatu with the Scrapy Framework (Part 3)](http://www.jianshu.com/p/d0462dc6a7e0) 5 | * [Python Crawler in Practice: Scraping Tubatu with the Scrapy Framework (Part 4)](http://www.jianshu.com/p/8c5bc23f4fec) 6 | * [Python Crawler in Practice: Scraping Tubatu with the Scrapy Framework (Part 5)](http://www.jianshu.com/p/6345dbb1ad41) 7 | -------------------------------------------------------------------------------- /tubatu/tubatu/reset.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | 3 | from config import mongodb, IMAGES_STORE 4 | 5 | from msic.config import redis_client 6 | 7 | mongodb.drop_collection("design_picture") 8 | mongodb.drop_collection("design_picture_summary") 9 | mongodb.drop_collection("design_topic") 10 | 11 | redis_client.delete('tubatu_design_topic_filter') 12 | redis_client.delete('tubatu_design_picture_filter') 13 | 14 | shutil.rmtree(IMAGES_STORE) 15 | -------------------------------------------------------------------------------- /guju/guju/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DesignStrategyItem(scrapy.Item): 12 | title = scrapy.Field() 13 | html_url = scrapy.Field() 14 | description = scrapy.Field() 15 | content = scrapy.Field() 16 | category = scrapy.Field() 17 | -------------------------------------------------------------------------------- /test/test_bloom_filter_service.py: -------------------------------------------------------------------------------- 1 | import redis 2 | 3 | from msic.core.service.bloom_filter_service import RedisBloomFilter 4 | 5 | REDIS_HOST = '127.0.0.1' 6 | REDIS_PORT = 6379 7 | 8 | REDIS_DATABASE_NAME = 0 9 | 10 | redis_client = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DATABASE_NAME) 11 | 12 | if __name__ == '__main__': 13 | bf = RedisBloomFilter(redis_client) 14 | print(bf.is_contains('http://xiaoguotu.to8to.com/p10482698.html', "room_design")) 15 | -------------------------------------------------------------------------------- /msic/config.py: -------------------------------------------------------------------------------- 1 | import redis 2 | 3 | from msic.core.service import mongodb_service 4 | 5 | MONGODB_HOST = "127.0.0.1" 6 | MONGODB_PORT = 27017 7 | 8 | DATABASE_NAME = 'common' 9 | mongodb_client = mongodb_service.get_client(MONGODB_HOST, MONGODB_PORT) 10 | mongodb = mongodb_service.get_db(mongodb_client, DATABASE_NAME) 11 | 12 | REDIS_HOST = 
'127.0.0.1' 13 | REDIS_PORT = 6379 14 | REDIS_DATABASE_NAME = 0 15 | 16 | # Redis 17 | redis_client = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DATABASE_NAME) 18 | -------------------------------------------------------------------------------- /guju/guju/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from requests.packages.urllib3.connectionpool import log as requests_log 4 | from selenium.webdriver.remote.remote_connection import LOGGER as selenium_log 5 | 6 | from msic import config 7 | from msic.core.service import mongodb_service 8 | 9 | selenium_log.setLevel(logging.WARNING) 10 | requests_log.setLevel(logging.WARNING) 11 | 12 | DATABASE_NAME = "guju" 13 | 14 | # MongoDB 15 | mongodb = mongodb_service.get_db(config.mongodb_client, DATABASE_NAME) 16 | 17 | IMAGES_STORE = 'D:/scrapy' 18 | 19 | USE_PROXY = False 20 | -------------------------------------------------------------------------------- /tubatu/tubatu/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from requests.packages.urllib3.connectionpool import log as requests_log 4 | from selenium.webdriver.remote.remote_connection import LOGGER as selenium_log 5 | 6 | from msic import config 7 | from msic.core.service import mongodb_service 8 | 9 | selenium_log.setLevel(logging.WARNING) 10 | requests_log.setLevel(logging.WARNING) 11 | 12 | DATABASE_NAME = "tubatu" 13 | 14 | # MongoDB 15 | mongodb = mongodb_service.get_db(config.mongodb_client, DATABASE_NAME) 16 | 17 | IMAGES_STORE = 'C:/scrapy' 18 | 19 | USE_PROXY = True 20 | -------------------------------------------------------------------------------- /guju/guju/middlewares.py: -------------------------------------------------------------------------------- 1 | from scrapy import Spider 2 | 3 | 4 | class RedirectionMiddleware(object): 5 | ERROR_COUNT = 0 6 | 7 | def process_response(self, request, response, spider: Spider): 8 | if response.status == 302 or response.status == 503: 9 | self.ERROR_COUNT += 1 10 | print('错误次数%s' % self.ERROR_COUNT) 11 | if self.ERROR_COUNT > 100: 12 | spider.close(spider, 'http status error') 13 | return response 14 | 15 | def process_exception(self, request, exception, spider): 16 | pass 17 | -------------------------------------------------------------------------------- /tubatu/tubatu/middlewares.py: -------------------------------------------------------------------------------- 1 | from scrapy import Spider 2 | 3 | 4 | class RedirectionMiddleware(object): 5 | ERROR_COUNT = 0 6 | 7 | def process_response(self, request, response, spider: Spider): 8 | if response.status == 302 or response.status == 503: 9 | self.ERROR_COUNT += 1 10 | print('错误次数%s' % self.ERROR_COUNT) 11 | if self.ERROR_COUNT > 100: 12 | spider.close(spider, 'http status error') 13 | return response 14 | 15 | def process_exception(self, request, exception, spider): 16 | pass 17 | -------------------------------------------------------------------------------- /guju/guju/constants.py: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = "guju" 2 | 3 | ZONE_TYPE = {'19': '验房须知', 4 | '18': '装修合同', 5 | '17': '装修预算', 6 | '16': '装修风水', 7 | '15': '装修设计', 8 | '14': '装修要点', 9 | '20': '装修灵感', 10 | '13': '装修选材', 11 | '12': '建材安装', 12 | '11': '改拆工程', 13 | '10': '水电工程', 14 | '9': '防水工程', 15 | '8': '泥瓦工程', 16 | '7': '土木工程', 17 | '6': '油漆工程', 18 | '5': '装修污染', 19 | '4': '装修验收', 20 | '3': 
'家居护理', 21 | '2': '家居配饰', 22 | '1': '家电家私', 23 | } 24 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | -------------------------------------------------------------------------------- /msic/proxy/proxy.py: -------------------------------------------------------------------------------- 1 | from msic.common import utils 2 | 3 | 4 | class Proxy(object): 5 | def __init__(self): 6 | self.ip = '' 7 | self.response_speed = -1 8 | self.validity = False 9 | self.origin = '' 10 | self.create_time = '' 11 | self.update_time = '' 12 | self.failed_count = 0 13 | 14 | @staticmethod 15 | def create(ip, origin): 16 | proxy = Proxy() 17 | proxy.ip = ip 18 | proxy.origin = origin 19 | proxy.create_time = utils.get_utc_time() 20 | proxy.update_time = proxy.create_time 21 | proxy.failed_count = 0 22 | proxy.response_speed = -1 23 | proxy.validity = False 24 | return proxy 25 | -------------------------------------------------------------------------------- /tubatu/tubatu/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DesignPictureItem(scrapy.Item): 12 | fid = scrapy.Field() 13 | title = scrapy.Field() 14 | sub_title = scrapy.Field() 15 | html_url = scrapy.Field() 16 | tags = scrapy.Field() 17 | description = scrapy.Field() 18 | img_url = scrapy.Field() 19 | img_width = scrapy.Field() 20 | img_height = scrapy.Field() 21 | img_name = scrapy.Field() 22 | 23 | 24 | class DesignTopicItem(scrapy.Item): 25 | title = scrapy.Field() 26 | description = scrapy.Field() 27 | html_url = scrapy.Field() 28 | article = scrapy.Field() 29 | create_time = scrapy.Field() 30 | -------------------------------------------------------------------------------- /tubatu/tubatu/model/design_picture.py: -------------------------------------------------------------------------------- 1 | class DesignPictureModel(object): 2 | def __init__(self): 3 | self.id = "" 4 | self.fid = "" 5 | self.title = "" 6 | self.sub_title = "" 7 | self.html_url = "" 8 | self.tags = [] 9 | self.description = "" 10 | self.img_url = "" 11 | self.img_width = 0 12 | self.img_height = 0 13 | self.img_name = "" # /tubatu/2016-09-01/ff5e6d6e5abafbaeb56af2b5034d83e9 14 | self.create_time = "" 15 | 16 | 17 | class DesignPictureSummaryModel(object): 18 | def __init__(self): 19 | self.id = "" 20 | self.cid = [] 21 | self.title = "" 22 | self.description = "" 23 | self.tags = [] 24 | self.html_url = "" 25 | self.create_time = "" 26 | self.update_time = "" 27 | self.cover_img_url = "" 28 | self.cover_img_width = 0 29 | self.cover_img_height = 0 30 | self.cover_img_name = "" 31 | -------------------------------------------------------------------------------- /tubatu/tubatu/constants.py: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = "tubatu" 2 | 3 | ZONE_TYPE = {'1': '客厅', '2': '卧室', '3': '餐厅', '4': '厨房', '5': '卫生间', '6': '阳台', '7': '书房', '8': '玄关', '10': '儿童房', '11': '衣帽间', '12': '花园'} 4 | STYLE_ID = {'13': '简约', '15': '现代', '4': '中式', '2': '欧式', '9': '美式', '11': '田园', '6': '新古典', '0': '混搭', '12': '地中海', '8': '东南亚', '17': '日式', 5 | '18': '宜家', 6 | '19': '北欧', '20': '简欧'} 7 | COLOR_ID = 
{'1': '白色', '2': '黑色', '3': '红色', '4': '黑色', '5': '绿色', '6': '橙色', '7': '粉色', '8': '蓝色', '9': '灰色', '10': '紫色', '11': '棕色', '12': '米色', 8 | '13': '彩色', '14': '原木色'} 9 | PART_ID = {'336': '背景墙', '16': '吊顶', '14': '隔断', '9': '窗帘', '340': '飘窗', '33': '榻榻米', '17': '橱柜', '343': '博古架', '333': '阁楼', '249': '隐形门', '21': '吧台', 10 | '22': '酒柜', '23': '鞋柜', '24': '衣柜', '19': '窗户', '20': '相片墙', '18': '楼梯', '359': '其他'} 11 | AREA = {'1': '60㎡以下', '2': '60-80㎡', '3': '80-100㎡', '4': '100-120㎡', '5': '120-150㎡', '6': '150㎡以上'} 12 | HX_ID = {'1': '小户型', '7': '一居', '2': '二居', '3': '三居', '4': '四居', '5': '复式', '6': '别墅', '8': '公寓', '9': 'loft'} 13 | -------------------------------------------------------------------------------- /test/test_proxy_pool.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from msic.proxy.proxy_pool import ProxyPool 4 | 5 | 6 | class TestProxyPool(unittest.TestCase): 7 | def setUp(self): 8 | self.proxy_pool = ProxyPool() 9 | 10 | def test_random_choice_proxy(self): 11 | ip = self.proxy_pool.random_choice_proxy() 12 | assert ip is not None 13 | assert not ip.strip() == '' 14 | print(ip) 15 | 16 | def test_add_failed_time(self): 17 | ip = self.proxy_pool.random_choice_proxy() 18 | # ip = '211.65.37.125:8118' 19 | self.proxy_pool.add_failed_time(ip) 20 | proxy = self.proxy_pool.collection.find_one({'ip': ip}) 21 | print(proxy) 22 | print("失败次数:%s" % proxy['failed_count']) 23 | 24 | def test_check_ip_availability_task(self): 25 | self.proxy_pool.check_ip_availability_task() 26 | 27 | def test_crawl_proxy_task(self): 28 | self.proxy_pool.crawl_proxy_task() 29 | 30 | def test_start(self): 31 | self.proxy_pool.start() 32 | 33 | 34 | if __name__ == '__main__': 35 | unittest.main() 36 | -------------------------------------------------------------------------------- /msic/core/service/mongodb_service.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient, errors 2 | from pymongo.collection import Collection 3 | from pymongo.database import Database 4 | 5 | from msic.common import log 6 | 7 | MAX_POOL_SIZE = 5 8 | 9 | 10 | def get_client(host: str, port: int) -> MongoClient: 11 | try: 12 | client = MongoClient(host, port, maxPoolSize=MAX_POOL_SIZE) 13 | log.info("Connected successfully!!!") 14 | return client 15 | except errors.ConnectionFailure as e: 16 | log.error(e) 17 | 18 | 19 | def get_db(client: MongoClient, db_name: str) -> Database: 20 | try: 21 | db = Database(client, db_name) 22 | return db 23 | except Exception as e: 24 | log.error(e) 25 | 26 | 27 | def get_collection(db: Database, name: str) -> Collection: 28 | collection = Collection(db, name) 29 | return collection 30 | 31 | 32 | def insert(collection: Collection, data): 33 | collection.insert_one(data) 34 | 35 | 36 | if __name__ == '__main__': 37 | mongo_client = get_client(MongoClient.HOST, MongoClient.PORT) 38 | db = get_db(mongo_client, "test") 39 | collection = get_collection(db, "test1") 40 | insert(collection, {"test": "helloworld"}) 41 | -------------------------------------------------------------------------------- /msic/common/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import hashlib 3 | import os 4 | import uuid 5 | 6 | import requests 7 | from requests import Response 8 | from requests.adapters import HTTPAdapter 9 | 10 | from msic.common.constant import HEADERS 11 | 12 | 13 | # 2a47d8b6-6f5b-11e6-ac9d-64006a0b51ab 
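# Note: uuid1 embeds the host MAC address and a timestamp (which is why sample values share
# a suffix on one machine); uuid.uuid4() would be the drop-in alternative if fully random,
# host-independent IDs were preferred.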
14 | def get_uuid() -> str: 15 | return str(uuid.uuid1()) 16 | 17 | 18 | # 2016-08-31T09:13:22.434Z 19 | def get_utc_time() -> str: 20 | return datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z" 21 | 22 | 23 | def get_md5(content: str) -> str: 24 | md5 = hashlib.md5() 25 | md5.update(content.encode('utf-8')) 26 | return md5.hexdigest() 27 | 28 | 29 | def make_dirs(path: str): 30 | if not os.path.exists(path): 31 | os.makedirs(path, exist_ok=True) 32 | 33 | 34 | def http_request(url: str, timeout=30) -> Response: 35 | session = requests.Session() 36 | session.mount('https://', HTTPAdapter(max_retries=5)) 37 | session.mount('http://', HTTPAdapter(max_retries=5)) 38 | response = session.get(url, headers=HEADERS, timeout=timeout) 39 | return response 40 | 41 | 42 | def log(content: str): 43 | print("============================= {content} ==========================".format(content=(get_utc_time() + " " + content))) 44 | -------------------------------------------------------------------------------- /msic/common/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from os.path import dirname 4 | 5 | SAVE_PATH = dirname(dirname(dirname(__file__))) 6 | 7 | logger = logging.getLogger() 8 | formatter = logging.Formatter('\n%(asctime)s - %(name)s - %(levelname)s \n%(message)s') 9 | # the root logger defaults to WARNING, which would silently drop the info()/debug() helpers below; 10 | # an INFO-level console handler makes them visible (the file handlers still capture only WARNING and up) 11 | logger.setLevel(logging.INFO) 12 | console_handler = logging.StreamHandler() 13 | console_handler.setFormatter(formatter) 14 | logger.addHandler(console_handler) 15 | 16 | error_handler = logging.FileHandler(SAVE_PATH + '/error.log', encoding='utf-8') 17 | error_handler.setLevel(logging.ERROR) 18 | error_handler.setFormatter(formatter) 19 | logger.addHandler(error_handler) 20 | 21 | warn_handler = logging.FileHandler(SAVE_PATH + '/warn.log', encoding='utf-8') 22 | warn_handler.setLevel(logging.WARNING) 23 | warn_handler.setFormatter(formatter) 24 | logger.addHandler(warn_handler) 25 | 26 | 27 | def handle_exception(exc_type, exc_value, exc_traceback): 28 | if issubclass(exc_type, KeyboardInterrupt): 29 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 30 | return 31 | logger.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback)) 32 | 33 | 34 | sys.excepthook = handle_exception 35 | 36 | 37 | def warn(msg): 38 | logger.warning(msg) 39 | 40 | 41 | def info(msg): 42 | logger.info(msg) 43 | 44 | 45 | def debug(msg): 46 | logger.debug(msg) 47 | 48 | 49 | def error(e: Exception): 50 | logger.error("Exception %s" % e) 51 | -------------------------------------------------------------------------------- /msic/core/service/bloom_filter_service.py: -------------------------------------------------------------------------------- 1 | from redis import StrictRedis 2 | 3 | 4 | class SimpleHash(object): 5 | def __init__(self, cap, seed): 6 | self.cap = cap 7 | self.seed = seed 8 | 9 | def hash(self, value): 10 | ret = 0 11 | for i in range(value.__len__()): 12 | ret += self.seed * ret + ord(value[i]) 13 | return (self.cap - 1) & ret 14 | 15 | 16 | class RedisBloomFilter(object): 17 | def __init__(self, redis_client: StrictRedis): 18 | self.bit_size = 1 << 25 19 | self.seeds = [5, 7, 11, 13, 31, 37, 61] 20 | self.redis = redis_client 21 | self.hash_dict = [] 22 | for i in range(self.seeds.__len__()): 23 | self.hash_dict.append(SimpleHash(self.bit_size, self.seeds[i])) 24 | 25 | def is_contains(self, value, key): 26 | if value is None: 27 | return False 28 | if value.__len__() == 0: 29 | return False 30 | ret = True 31 | for f in self.hash_dict: 32 | loc = f.hash(value) 33 | ret = ret & self.redis.getbit(key, loc) 34 | return ret 35 | 36 | def insert(self, value, key): 37 | for f in 
self.hash_dict: 38 | loc = f.hash(value) 39 | self.redis.setbit(key, loc, 1) 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /tubatu/tubatu/service/design_service.py: -------------------------------------------------------------------------------- 1 | from msic.common import log 2 | from msic.config import redis_client 3 | from msic.core.service import mongodb_service 4 | from msic.core.service.bloom_filter_service import RedisBloomFilter 5 | from tubatu import config 6 | 7 | 8 | class DesignService(object): 9 | TABLE_NAME = '' 10 | REDIS_KEY = '' 11 | 12 | def __init__(self): 13 | self.collection = mongodb_service.get_collection(config.mongodb, self.TABLE_NAME) 14 | self.redis_bloom_filter = RedisBloomFilter(redis_client) 15 | 16 | def get_model(self, design_item): 17 | pass 18 | 19 | def save_to_database(self, collection, item): 20 | try: 21 | mongodb_service.insert(collection, item.__dict__) 22 | except Exception as e: 23 | log.error(e) 24 | 25 | def find_one(self, collection, condition: dict): 26 | try: 27 | return collection.find_one(condition) 28 | except Exception as e: 29 | log.error(e) 30 | 31 | def update_one(self, collection, condition: dict, value: dict): 32 | try: 33 | return collection.update_one(condition, {"$set": value}) 34 | except Exception as e: 35 | log.error(e) 36 | 37 | def is_duplicate_url(self, value: str) -> bool: 38 | return self.redis_bloom_filter.is_contains(value, self.REDIS_KEY) 39 | 40 | def insert_to_redis(self, value: str): 41 | self.redis_bloom_filter.insert(value, self.REDIS_KEY) 42 | 43 | def handle_item(self, design_item): 44 | pass 45 | -------------------------------------------------------------------------------- /tubatu/tubatu/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # 
Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from tubatu.service.design_picture_service import DesignPictureService 9 | from tubatu.service.design_topic_service import DesignTopicService 10 | from tubatu.service.image_service import ImageService 11 | 12 | 13 | class DesignPicturePipeline(object): 14 | def __init__(self): 15 | self.design_picture_service = DesignPictureService() 16 | 17 | def process_item(self, item, spider): 18 | img_url = item['img_url'] 19 | img_name = ImageService.generate_name(img_url) 20 | file_path = ImageService.file_path(img_name) 21 | thumb_path = ImageService.thumb_path(img_name) 22 | ImageService.download_img(img_url, file_path) 23 | ImageService.save_thumbnail(file_path, thumb_path) 24 | item['img_name'] = img_name 25 | self.design_picture_service.handle_item(item) 26 | return item  # hand the item back so later pipelines and Scrapy's item logging still see it 27 | 28 | 29 | class DesignTopicPipeline(object): 30 | def __init__(self): 31 | self.design_topic_service = DesignTopicService() 32 | 33 | def process_item(self, item, spider): 34 | article = item['article'] 35 | for part in article: 36 | img_url = part['img_url'] 37 | img_name = ImageService.generate_name(img_url) 38 | file_path = ImageService.file_path(img_name) 39 | thumb_path = ImageService.thumb_path(img_name) 40 | ImageService.download_img(img_url, file_path) 41 | ImageService.save_thumbnail(file_path, thumb_path) 42 | part['img_name'] = img_name 43 | self.design_topic_service.handle_item(item) 44 | return item 45 | -------------------------------------------------------------------------------- /tubatu/tubatu/service/design_topic_service.py: -------------------------------------------------------------------------------- 1 | from tubatu.items import DesignTopicItem 2 | from tubatu.model.design_topic import DesignTopicModel 3 | from tubatu.service.design_service import DesignService 4 | 5 | from msic.common import log 6 | from msic.common import utils 7 | 8 | 9 | class DesignTopicService(DesignService): 10 | TABLE_NAME = "design_topic" 11 | REDIS_KEY = "tubatu_design_topic_filter" 12 | 13 | def __init__(self): 14 | super(DesignTopicService, self).__init__() 15 | 16 | def get_model(self, design_topic_item: DesignTopicItem) -> DesignTopicModel: 17 | design_topic_model = DesignTopicModel() 18 | design_topic_model._id = utils.get_uuid() 19 | design_topic_model.title = design_topic_item['title'] 20 | design_topic_model.description = design_topic_item['description'] 21 | design_topic_model.html_url = design_topic_item['html_url'] 22 | design_topic_model.article = design_topic_item['article'] 23 | design_topic_model.create_time = utils.get_utc_time() 24 | return design_topic_model 25 | 26 | def handle_item(self, design_topic_item: DesignTopicItem): 27 | if self.is_duplicate_url(design_topic_item['html_url']): 28 | return 29 | design_topic_model = self.get_model(design_topic_item) 30 | self.save_to_database(self.collection, design_topic_model) 31 | self.insert_to_redis(design_topic_model.html_url) 32 | 33 | log.info("=========================================================================================") 34 | log.info("html_url:" + design_topic_item['html_url']) 35 | log.info("title:" + design_topic_item['title']) 36 | log.info("description:" + design_topic_item['description']) 37 | log.info("=========================================================================================") 38 | --------------------------------------------------------------------------------
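The service above is the dedupe pattern used throughout this repo: consult the Redis-backed Bloom filter before doing any work, persist the item, then record its URL. Reduced to a standalone sketch (assumes a Redis instance on localhost as in msic/config.py; the URL is just an example):

    from redis import StrictRedis
    from msic.core.service.bloom_filter_service import RedisBloomFilter

    bloom = RedisBloomFilter(StrictRedis(host='127.0.0.1', port=6379, db=0))
    url = 'http://xiaoguotu.to8to.com/topic/11.html'
    if not bloom.is_contains(url, 'tubatu_design_topic_filter'):
        # ... fetch, parse and save the page here ...
        bloom.insert(url, 'tubatu_design_topic_filter')

The usual Bloom-filter trade-off applies: a rare false positive makes the crawler skip a URL it has never actually seen, but no duplicate is ever stored.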
/guju/guju/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import threading 4 | import time 5 | from os.path import dirname 6 | 7 | from guju.spiders.design_strategy_spider import DesignStrategySpider 8 | from schedule import Scheduler 9 | from twisted.internet import reactor 10 | 11 | from guju import config 12 | 13 | path = dirname(os.path.abspath(os.path.dirname(__file__))) 14 | sys.path.append(path) 15 | 16 | from scrapy.crawler import CrawlerProcess 17 | from scrapy.utils.project import get_project_settings 18 | from scrapy import signals 19 | from pydispatch import dispatcher 20 | 21 | 22 | class Runner(object): 23 | def __init__(self): 24 | self.is_running = False 25 | dispatcher.connect(self.pause_crawler, signals.engine_stopped) 26 | self.setting = get_project_settings() 27 | self.process = None 28 | 29 | def start_scrapy(self): 30 | self.process = CrawlerProcess(self.setting) 31 | self.crawl() 32 | reactor.run() 33 | 34 | def pause_crawler(self): 35 | self.is_running = False 36 | print("============ 爬虫已停止 ===================") 37 | 38 | def crawl(self): 39 | self.is_running = True 40 | self.process.crawl(DesignStrategySpider)  # pass the spider class, not an instance; CrawlerProcess instantiates it (tubatu/run.py already does this) 41 | 42 | def start_proxy_pool(self): 43 | from msic.proxy.proxy_pool import proxy_pool 44 | if config.USE_PROXY: 45 | proxy_pool.start() 46 | else: 47 | proxy_pool.drop_proxy() 48 | 49 | def run(self): 50 | self.start_proxy_pool() 51 | self.start_scrapy() 52 | 53 | 54 | if __name__ == '__main__': 55 | runner = Runner() 56 | 57 | 58 | def thread_task(): 59 | def task(): 60 | if not runner.is_running: 61 | print("============ 开始重新爬取 ===================") 62 | runner.crawl() 63 | 64 | schedule = Scheduler() 65 | schedule.every(30).minutes.do(task) 66 | 67 | while True: 68 | schedule.run_pending() 69 | time.sleep(1) 70 | 71 | 72 | thread = threading.Thread(target=thread_task) 73 | thread.start() 74 | 75 | runner.run() 76 | --------------------------------------------------------------------------------
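One caveat shared by this runner and tubatu/run.py below: the schedule thread calls runner.crawl() directly, but Twisted's reactor is not thread-safe. A more defensive task (a sketch against the same Runner object) would marshal the call onto the reactor thread:

    from twisted.internet import reactor

    def task():
        if not runner.is_running:
            reactor.callFromThread(runner.crawl)  # hop onto the reactor thread before touching the crawler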
/tubatu/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import threading 4 | import time 5 | from os.path import dirname 6 | 7 | from schedule import Scheduler 8 | from twisted.internet import reactor 9 | 10 | from tubatu import config 11 | 12 | path = dirname(os.path.abspath(os.path.dirname(__file__))) 13 | sys.path.append(path) 14 | 15 | from scrapy.crawler import CrawlerProcess 16 | from scrapy.utils.project import get_project_settings 17 | from scrapy import signals 18 | from pydispatch import dispatcher 19 | from tubatu.spiders.design_picture_spider import DesignPictureSpider 20 | from tubatu.spiders.design_topic_spider import DesignTopicSpider 21 | 22 | 23 | class Runner(object): 24 | def __init__(self): 25 | self.is_running = False 26 | dispatcher.connect(self.pause_crawler, signals.engine_stopped) 27 | self.setting = get_project_settings() 28 | self.process = None 29 | 30 | def start_scrapy(self): 31 | self.process = CrawlerProcess(self.setting) 32 | self.crawl() 33 | reactor.run() 34 | 35 | def pause_crawler(self): 36 | self.is_running = False 37 | print("============ 爬虫已停止 ===================") 38 | 39 | def crawl(self): 40 | self.is_running = True 41 | self.process.crawl(DesignPictureSpider) 42 | self.process.crawl(DesignTopicSpider) 43 | 44 | def start_proxy_pool(self): 45 | from msic.proxy.proxy_pool import proxy_pool 46 | if config.USE_PROXY: 47 | proxy_pool.start() 48 | else: 49 | proxy_pool.drop_proxy() 50 | 51 | def run(self): 52 | self.start_proxy_pool() 53 | self.start_scrapy() 54 | 55 | 56 | if __name__ == '__main__': 57 | runner = Runner() 58 | 59 | 60 | def thread_task(): 61 | def task(): 62 | if not runner.is_running: 63 | print("============ 开始重新爬取 ===================") 64 | runner.crawl() 65 | 66 | schedule = Scheduler() 67 | schedule.every(30).minutes.do(task) 68 | 69 | while True: 70 | schedule.run_pending() 71 | time.sleep(1) 72 | 73 | 74 | thread = threading.Thread(target=thread_task) 75 | thread.start() 76 | 77 | runner.run() 78 | -------------------------------------------------------------------------------- /test/test_design_topic_spider.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | from os.path import dirname 5 | 6 | import requests 7 | from scrapy import Selector 8 | from scrapy.http import HtmlResponse 9 | 10 | from tubatu.tubatu.items import DesignTopicItem 11 | 12 | path = dirname(os.path.abspath(os.path.dirname(__file__))) 13 | sys.path.append(path) 14 | 15 | 16 | class TestDesignTopicSpider(unittest.TestCase): 17 | def test_parse_content(self): 18 | content = requests.get('http://xiaoguotu.to8to.com/topic/11.html') 19 | # Response.text is a read-only property in Scrapy, so wrap the body in an HtmlResponse rather than assigning to it 20 | response = HtmlResponse('http://xiaoguotu.to8to.com/topic/11.html', body=content.content, encoding='utf-8') 21 | selector = Selector(response) 22 | title = selector.xpath('//div[@class="xdb_title"]/h1/text()').extract()[0] 23 | description = selector.xpath('//div[@class="xdbc_description"]//div//p/text()').extract()[0] 24 | items_selector = selector.xpath('//div[@class="xdbc_main_content"]//p') 25 | article = [] 26 | text = '' 27 | for index, item_selector in enumerate(items_selector): 28 | try: 29 | text = item_selector.xpath('span/text()').extract()[0] 30 | except IndexError: 31 | try: 32 | img_url = item_selector.xpath('img/@src').extract()[0] 33 | img_width = 0 34 | try: 35 | img_width = item_selector.xpath('img/@width').extract()[0] 36 | except IndexError: 37 | pass 38 | img_height = 0 39 | try: 40 | img_height = item_selector.xpath('img/@height').extract()[0] 41 | except IndexError: 42 | pass 43 | article.append({'content': text, 'img_url': img_url, 'img_width': img_width, 'img_height': img_height}) 44 | except IndexError: 45 | continue 46 | design_topic_item = DesignTopicItem() 47 | design_topic_item['title'] = title 48 | design_topic_item['description'] = description 49 | design_topic_item['article'] = article 50 | design_topic_item['html_url'] = response.url 51 | self.assertTrue(design_topic_item['title'])  # a test should assert on its result rather than return it 52 | 53 | 54 | if __name__ == '__main__': 55 | unittest.main() 56 | -------------------------------------------------------------------------------- /tubatu/tubatu/service/image_service.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from PIL import Image 3 | from tubatu.constants import PROJECT_NAME 4 | 5 | from msic.common import utils 6 | from msic.proxy.proxy_pool import proxy_pool 7 | from tubatu import config 8 | 9 | IMAGE_SIZE = 500, 500 10 | 11 | 12 | class ImageService(object): 13 | @staticmethod 14 | def generate_name(key): 15 | create_time = utils.get_utc_time() 16 | img_name = "/" + PROJECT_NAME + "/" + create_time[0:10] + "/" + utils.get_md5(create_time + key) 17 | return img_name 18 | 19 | @staticmethod 20 | def get_file_name(image_name) -> str: 21 | name_data = image_name[1:].split("/") 22 | project_name = name_data[0] 23 | date = name_data[1] 24 | file_name = name_data[2] 25 | return "/" + 
project_name + "/" + date + "/" + file_name 26 | 27 | @staticmethod 28 | def file_path(image_name): 29 | file_path = ImageService.get_file_name(image_name) 30 | dir_name = file_path[0:file_path.rfind("/")] 31 | utils.make_dirs(config.IMAGES_STORE + dir_name) 32 | path = config.IMAGES_STORE + '%s_original.jpg' % file_path 33 | return path 34 | 35 | @staticmethod 36 | def thumb_path(image_name): 37 | file_path = ImageService.get_file_name(image_name) 38 | dir_name = file_path[0:file_path.rfind("/")] 39 | utils.make_dirs(config.IMAGES_STORE + dir_name) 40 | path = config.IMAGES_STORE + '%s_thumb.jpg' % file_path 41 | return path 42 | 43 | @staticmethod 44 | def download_img(img_url, file_path): 45 | proxies = None 46 | proxy = '' 47 | if config.USE_PROXY: 48 | proxy = proxy_pool.random_choice_proxy() 49 | proxies = { 50 | 'http': "http://%s" % proxy, 51 | } 52 | try: 53 | response = requests.get(img_url, stream=True, proxies=proxies) 54 | if response.status_code == 200: 55 | with open(file_path, 'wb') as f: 56 | for chunk in response.iter_content(1024): 57 | f.write(chunk) 58 | else: 59 | if config.USE_PROXY: 60 | proxy_pool.add_failed_time(proxy) 61 | except: 62 | if config.USE_PROXY: 63 | proxy_pool.add_failed_time(proxy) 64 | 65 | @staticmethod 66 | def save_thumbnail(file_path, thumb_path): 67 | image = Image.open(file_path) 68 | if thumb_path is not None: 69 | image.thumbnail(IMAGE_SIZE) 70 | image.save(thumb_path) 71 | del image 72 | -------------------------------------------------------------------------------- /msic/scrapy/middlewares.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from scrapy.http import HtmlResponse 4 | from selenium import webdriver 5 | from selenium.webdriver import DesiredCapabilities 6 | 7 | from msic.common import log, agents 8 | from msic.proxy.proxy_pool import proxy_pool 9 | 10 | JAVASCRIPT = 'JAVASCRIPT' 11 | 12 | 13 | class CatchExceptionMiddleware(object): 14 | def process_response(self, request, response, spider): 15 | if response.status < 200 or response.status >= 400: 16 | try: 17 | proxy_pool.add_failed_time(request.meta['proxy'].replace('http://', '')) 18 | except KeyError: 19 | pass 20 | return response 21 | 22 | def process_exception(self, request, exception, spider): 23 | try: 24 | proxy_pool.add_failed_time(request.meta['proxy'].replace('http://', '')) 25 | except Exception: 26 | pass 27 | 28 | 29 | class CustomHttpProxyMiddleware(object): 30 | def process_request(self, request, spider): 31 | try: 32 | request.meta['proxy'] = "http://%s" % proxy_pool.random_choice_proxy() 33 | except Exception as e: 34 | log.error(e) 35 | 36 | 37 | class CustomUserAgentMiddleware(object): 38 | def process_request(self, request, spider): 39 | agent = random.choice(agents.AGENTS_ALL) 40 | request.headers['User-Agent'] = agent 41 | 42 | 43 | class JavaScriptMiddleware(object): 44 | def process_request(self, request, spider): 45 | if JAVASCRIPT in request.meta and request.meta[JAVASCRIPT] is True: 46 | driver = self.phantomjs_opened() 47 | try: 48 | driver.get(request.url) 49 | body = driver.page_source 50 | return HtmlResponse(request.url, body=body, encoding='utf-8', request=request) 51 | finally: 52 | self.phantomjs_closed(driver) 53 | 54 | def phantomjs_opened(self): 55 | capabilities = DesiredCapabilities.PHANTOMJS.copy() 56 | proxy = proxy_pool.random_choice_proxy() 57 | capabilities['proxy'] = { 58 | 'proxyType': 'MANUAL', 59 | 'ftpProxy': proxy, 60 | 'sslProxy': proxy, 61 | 'httpProxy': proxy, 62 
| 'noProxy': None 63 | } 64 | # capabilities['phantomjs.cli.args'] = [ 65 | # '--proxy-auth=' + evar.get('WONDERPROXY_USER') + ':' + evar.get('WONDERPROXY_PASS') 66 | # ] 67 | driver = webdriver.PhantomJS(desired_capabilities=capabilities) 68 | driver.set_page_load_timeout(120) 69 | return driver 70 | 71 | def phantomjs_closed(self, driver): 72 | driver.quit() 73 | -------------------------------------------------------------------------------- /guju/guju/service/design_strategy_service.py: -------------------------------------------------------------------------------- 1 | from guju.items import DesignStrategyItem 2 | from guju.model.design_picture import DesignStrategyModel 3 | 4 | from guju import config 5 | from msic.common import log 6 | from msic.common import utils 7 | from msic.config import redis_client 8 | from msic.core.service import mongodb_service 9 | from msic.core.service.bloom_filter_service import RedisBloomFilter 10 | 11 | 12 | class DesignStrategyService(object): 13 | TABLE_NAME = "design_strategy" 14 | REDIS_KEY = "guju_design_strategy_filter" 15 | 16 | def __init__(self): 17 | self.collection = mongodb_service.get_collection(config.mongodb, self.TABLE_NAME) 18 | self.redis_bloom_filter = RedisBloomFilter(redis_client) 19 | 20 | def is_duplicate_url(self, value: str) -> bool: 21 | return self.redis_bloom_filter.is_contains(value, self.REDIS_KEY) 22 | 23 | def insert_to_redis(self, value: str): 24 | self.redis_bloom_filter.insert(value, self.REDIS_KEY) 25 | 26 | def save_to_database(self, collection, item): 27 | try: 28 | mongodb_service.insert(collection, item.__dict__) 29 | except Exception as e: 30 | log.error(e) 31 | 32 | def handle_item(self, design_strategy_item: DesignStrategyItem): 33 | if self.is_duplicate_url(design_strategy_item['html_url']): 34 | return 35 | design_strategy_model = self.get_design_strategy_model(design_strategy_item) 36 | self.save_to_database(self.collection, design_strategy_model) 37 | self.insert_to_redis(design_strategy_model.html_url) 38 | log.info("=========================================================================================") 39 | log.info("title:" + design_strategy_item['title']) 40 | log.info("description:" + design_strategy_item['description']) 41 | log.info("category:" + design_strategy_item['category']) 42 | log.info("html_url:" + design_strategy_item['html_url']) 43 | log.info("=========================================================================================") 44 | 45 | def get_design_strategy_model(self, design_strategy_item: DesignStrategyItem) -> DesignStrategyModel: 46 | design_strategy_model = DesignStrategyModel() 47 | design_strategy_model.id = utils.get_uuid() 48 | design_strategy_model.title = design_strategy_item['title'] 49 | design_strategy_model.html_url = design_strategy_item['html_url'] 50 | design_strategy_model.description = design_strategy_item['description'] 51 | design_strategy_model.content = design_strategy_item['content'] 52 | design_strategy_model.category = design_strategy_item['category'] 53 | design_strategy_model.create_time = utils.get_utc_time() 54 | return design_strategy_model 55 | -------------------------------------------------------------------------------- /guju/guju/spiders/design_strategy_spider.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import scrapy 4 | from guju.items import DesignStrategyItem 5 | from guju.service.design_strategy_service import DesignStrategyService 6 | from scrapy.linkextractors 
import LinkExtractor 7 | from scrapy.selector import Selector 8 | from scrapy.spiders import CrawlSpider 9 | from scrapy.spiders import Rule 10 | 11 | from guju import config 12 | from msic.common import constant 13 | from msic.common import log 14 | from msic.proxy.proxy_pool import proxy_pool 15 | 16 | 17 | class DesignStrategySpider(CrawlSpider): 18 | start_url_domain = 'guju.com.cn' 19 | name = 'design_strategy' 20 | allowed_domains = ['guju.com.cn'] 21 | start_urls = ['http://guju.com.cn/strategy/new'] 22 | rules = ( 23 | Rule(LinkExtractor(allow="/strategy/new/p-\d+"), follow=True, callback='parse_list'), 24 | ) 25 | custom_settings = { 26 | 'ITEM_PIPELINES': { 27 | 'guju.pipelines.DesignStrategyPipeline': 302, 28 | } 29 | } 30 | design_strategy_service = DesignStrategyService() 31 | 32 | def parse_list(self, response): 33 | selector = Selector(response) 34 | items_selector = selector.xpath('//div[@id="listITme"]//div[@class="gl-listItem"]') 35 | for item_selector in items_selector: 36 | id = item_selector.xpath('a/@href').extract()[0].replace('/strategy/', '') 37 | # http://guju.com.cn/strategy/strategy_getStrategyInfo_ajax?strategyModel.id=4498 38 | next_url = (constant.PROTOCOL_HTTP + self.start_url_domain + '/strategy/strategy_getStrategyInfo_ajax?strategyModel.id={id}').format( 39 | id=id) 40 | if self.design_strategy_service.is_duplicate_url(next_url): 41 | log.info("=================过滤了" + next_url + "===========") 42 | continue 43 | yield scrapy.Request(next_url, self.parse_content, meta={'id': id}) 44 | 45 | def parse_content(self, response): 46 | try: 47 | data = json.loads(response.text) 48 | except: 49 | print("-----------------------获取到json:" + response.text + "------------------------------") 50 | return 51 | try: 52 | model = data['strategyModel'] 53 | category = model['categoryName'] 54 | title = model['title'] 55 | description = model['description'] 56 | content = model['context'] 57 | 58 | design_strategy_item = DesignStrategyItem() # type: DesignStrategyItem 59 | design_strategy_item['category'] = category 60 | design_strategy_item['title'] = title 61 | design_strategy_item['description'] = description 62 | design_strategy_item['content'] = content 63 | design_strategy_item['html_url'] = response.url 64 | yield design_strategy_item 65 | except Exception as e: 66 | print("-----------------------获取到json:" + response.text + "------------------------------") 67 | log.warn("%s ( refer: %s )" % (e, response.url)) 68 | if config.USE_PROXY: 69 | proxy_pool.add_failed_time(response.meta['proxy'].replace('http://', '')) 70 | -------------------------------------------------------------------------------- /tubatu/tubatu/spiders/design_topic_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from scrapy.linkextractors import LinkExtractor 3 | from scrapy.selector import Selector 4 | from scrapy.spiders import CrawlSpider 5 | from scrapy.spiders import Rule 6 | from tubatu.items import DesignTopicItem 7 | from tubatu.service.design_topic_service import DesignTopicService 8 | 9 | from msic.common import constant 10 | 11 | 12 | class DesignTopicSpider(CrawlSpider): 13 | start_url_domain = 'xiaoguotu.to8to.com' 14 | name = 'design_topic' 15 | allowed_domains = ['to8to.com'] 16 | start_urls = ['http://xiaoguotu.to8to.com/topic/'] 17 | rules = ( 18 | Rule(LinkExtractor(allow='/topic/p_\d+.html'), follow=True, callback='parse_list', process_links='process_links'), 19 | ) 20 | custom_settings = { 21 | 'ITEM_PIPELINES': { 
22 | 'tubatu.pipelines.DesignTopicPipeline': 301, 23 | } 24 | } 25 | design_topic_service = DesignTopicService() 26 | 27 | def process_links(self, links): 28 | for link in links: 29 | link.url = link.url.replace('%20', '') 30 | return links 31 | 32 | def parse_list(self, response): 33 | selector = Selector(response) 34 | items_selector = selector.xpath('//div[@class="xgt_topic"]') 35 | for item_selector in items_selector: 36 | # /topic/7334.html 37 | href = item_selector.xpath('div//a/@href').extract()[0] 38 | href = href.strip() 39 | # http://xiaoguotu.to8to.com/topic/7334.html 40 | next_url = (constant.PROTOCOL_HTTP + self.start_url_domain + href) 41 | if self.design_topic_service.is_duplicate_url(next_url): 42 | continue 43 | yield scrapy.Request(next_url, self.parse_content) 44 | 45 | def parse_content(self, response): 46 | selector = Selector(response) 47 | title = selector.xpath('//div[@class="xdb_title"]/h1/text()').extract()[0] 48 | description = selector.xpath('//div[@class="xdbc_description"]//div//p/text()').extract()[0] 49 | items_selector = selector.xpath('//div[@class="xdbc_main_content"]//p') 50 | article = [] 51 | text = '' 52 | for index, item_selector in enumerate(items_selector): 53 | try: 54 | text = item_selector.xpath('span/text()').extract()[0] 55 | except IndexError: 56 | try: 57 | img_url = item_selector.xpath('img/@src').extract()[0] 58 | img_width = 0 59 | try: 60 | img_width = item_selector.xpath('img/@width').extract()[0] 61 | except IndexError: 62 | pass 63 | img_height = 0 64 | try: 65 | img_height = item_selector.xpath('img/@height').extract()[0] 66 | except IndexError: 67 | pass 68 | article.append({'content': text, 'img_url': img_url, 'img_width': img_width, 'img_height': img_height}) 69 | except IndexError: 70 | continue 71 | design_topic_item = DesignTopicItem() 72 | design_topic_item['title'] = title 73 | design_topic_item['description'] = description 74 | design_topic_item['article'] = article 75 | design_topic_item['html_url'] = response.url 76 | return design_topic_item 77 | -------------------------------------------------------------------------------- /guju/guju/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for guju project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | import os 12 | import sys 13 | from os.path import dirname 14 | 15 | from guju.config import USE_PROXY 16 | 17 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 18 | sys.path.append(path) 19 | 20 | BOT_NAME = 'guju' 21 | 22 | SPIDER_MODULES = ['guju.spiders'] 23 | NEWSPIDER_MODULE = 'guju.spiders' 24 | 25 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 26 | # USER_AGENT = 'guju (+http://www.yourdomain.com)' 27 | 28 | # Obey robots.txt rules 29 | ROBOTSTXT_OBEY = False 30 | 31 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 32 | CONCURRENT_REQUESTS = 12 33 | REACTOR_THREADPOOL_MAXSIZE = 8 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | DOWNLOAD_DELAY = 1 39 | # The download delay setting will honor only one of: 40 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | # CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | COOKIES_ENABLED = False 45 | 46 | # Disable Telnet Console (enabled by default) 47 | # TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | # DEFAULT_REQUEST_HEADERS = { 51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | # 'Accept-Language': 'en', 53 | # } 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | # SPIDER_MIDDLEWARES = { 58 | # 'guju.middlewares.MyCustomSpiderMiddleware': 543, 59 | # } 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | DOWNLOADER_MIDDLEWARES = { 64 | 'msic.scrapy.middlewares.CustomUserAgentMiddleware': 2, 65 | 'guju.middlewares.RedirectionMiddleware': 998, 66 | } 67 | if USE_PROXY: 68 | DOWNLOADER_MIDDLEWARES['msic.scrapy.middlewares.CustomHttpProxyMiddleware'] = 1 69 | DOWNLOADER_MIDDLEWARES['msic.scrapy.middlewares.CatchExceptionMiddleware'] = 999 70 | 71 | # Enable or disable extensions 72 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 73 | # EXTENSIONS = { 74 | # 'scrapy.extensions.telnet.TelnetConsole': None, 75 | # } 76 | 77 | # Configure item pipelines 78 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 79 | ITEM_PIPELINES = { 80 | } 81 | 82 | # Enable and configure the AutoThrottle extension (disabled by default) 83 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 84 | # AUTOTHROTTLE_ENABLED = True 85 | # The initial download delay 86 | # AUTOTHROTTLE_START_DELAY = 5 87 | # The maximum download delay to be set in case of high latencies 88 | # AUTOTHROTTLE_MAX_DELAY = 60 89 | # The average number of requests Scrapy should be sending in parallel to 90 | # each remote server 91 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 92 | # Enable showing throttling stats for every response received: 93 | # AUTOTHROTTLE_DEBUG = False 94 | 95 | # Enable and configure HTTP caching (disabled by default) 96 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 97 | # HTTPCACHE_ENABLED = True 98 | # HTTPCACHE_EXPIRATION_SECS = 0 99 | # HTTPCACHE_DIR = 'httpcache' 100 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 101 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 102 | 103 | AJAXCRAWL_ENABLED = False 104 | IMAGES_STORE = 'D:/scrapy' 105 | 106 | LOG_ENABLED = True 107 | LOG_FORMAT = '%(asctime)s,%(msecs)d [%(name)s] %(levelname)s: %(message)s' 108 | -------------------------------------------------------------------------------- /tubatu/tubatu/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tubatu project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | import os 12 | import sys 13 | from os.path import dirname 14 | 15 | from tubatu.config import USE_PROXY 16 | 17 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 18 | sys.path.append(path) 19 | 20 | BOT_NAME = 'tubatu' 21 | 22 | SPIDER_MODULES = ['tubatu.spiders'] 23 | NEWSPIDER_MODULE = 'tubatu.spiders' 24 | 25 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 26 | # USER_AGENT = 'tubatu (+http://www.yourdomain.com)' 27 | 28 | # Obey robots.txt rules 29 | ROBOTSTXT_OBEY = False 30 | 31 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 32 | CONCURRENT_REQUESTS = 12 33 | REACTOR_THREADPOOL_MAXSIZE = 8 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | DOWNLOAD_DELAY = 0 39 | # The download delay setting will honor only one of: 40 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | # CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | COOKIES_ENABLED = False 45 | 46 | # Disable Telnet Console (enabled by default) 47 | # TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | # DEFAULT_REQUEST_HEADERS = { 51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | # 'Accept-Language': 'en', 53 | # } 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | # SPIDER_MIDDLEWARES = { 58 | # 'tubatu.middlewares.MyCustomSpiderMiddleware': 543, 59 | # } 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | DOWNLOADER_MIDDLEWARES = { 64 | 'msic.scrapy.middlewares.CustomUserAgentMiddleware': 2, 65 | 'tubatu.middlewares.RedirectionMiddleware': 998, 66 | } 67 | 68 | if USE_PROXY: 69 | DOWNLOADER_MIDDLEWARES['msic.scrapy.middlewares.CustomHttpProxyMiddleware'] = 1 70 | DOWNLOADER_MIDDLEWARES['msic.scrapy.middlewares.CatchExceptionMiddleware'] = 999 71 | 72 | # Enable or disable extensions 73 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 74 | # EXTENSIONS = { 75 | # 'scrapy.extensions.telnet.TelnetConsole': None, 76 | # } 77 | 78 | # Configure 
79 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
80 | ITEM_PIPELINES = {
81 | }  # left empty here: each spider enables its own pipeline via custom_settings
82 |
83 | # Enable and configure the AutoThrottle extension (disabled by default)
84 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
85 | # AUTOTHROTTLE_ENABLED = True
86 | # The initial download delay
87 | # AUTOTHROTTLE_START_DELAY = 5
88 | # The maximum download delay to be set in case of high latencies
89 | # AUTOTHROTTLE_MAX_DELAY = 60
90 | # The average number of requests Scrapy should be sending in parallel to
91 | # each remote server
92 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
93 | # Enable showing throttling stats for every response received:
94 | # AUTOTHROTTLE_DEBUG = False
95 |
96 | # Enable and configure HTTP caching (disabled by default)
97 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
98 | # HTTPCACHE_ENABLED = True
99 | # HTTPCACHE_EXPIRATION_SECS = 0
100 | # HTTPCACHE_DIR = 'httpcache'
101 | # HTTPCACHE_IGNORE_HTTP_CODES = []
102 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
103 |
104 |
105 |
106 | AJAXCRAWL_ENABLED = False
107 | IMAGES_STORE = 'C:/scrapy'
108 |
109 | LOG_ENABLED = True
110 | LOG_FORMAT = '%(asctime)s,%(msecs)d [%(name)s] %(levelname)s: %(message)s'
111 |
112 | # MEMDEBUG_ENABLED = True
113 | # MEMDEBUG_NOTIFY = ['imflyn@163.com']
114 |
--------------------------------------------------------------------------------
/msic/proxy/proxy_pool.py:
--------------------------------------------------------------------------------
1 | import threading
2 | import time
3 | from datetime import datetime
4 |
5 | import pymongo
6 | from schedule import Scheduler
7 |
8 | from msic import config
9 | from msic.common import utils
10 | from msic.core.service import mongodb_service
11 | from msic.proxy import proxy_strategy
12 |
13 | TASK_INTERVAL = 60
14 | FAILED_COUNT_BORDER = 3
15 | MIN_PROXY_COUNT = 10
16 |
17 | REDIS_KEY_LAST_CHECK_IP_TIME = "last_check_ip_time"
18 |
19 |
20 | class ProxyPool(object):
21 |     TABLE_NAME = 'proxy_pool'
22 |
23 |     def __init__(self):
24 |         self.redis_client = config.redis_client
25 |         self.collection = mongodb_service.get_collection(config.mongodb, self.TABLE_NAME)
26 |         self.collection.create_index([('ip', pymongo.ASCENDING)], unique=True, sparse=True)
27 |
28 |     # Singleton
29 |     def __new__(cls, *args, **kwargs):
30 |         if not hasattr(cls, '_instance'):
31 |             org = super(ProxyPool, cls)
32 |             cls._instance = org.__new__(cls)
33 |         return cls._instance
34 |
35 |     def random_choice_proxy(self) -> str:
36 |         proxy = self.collection.find().sort(
37 |             [("failed_count", pymongo.ASCENDING), ("validity", pymongo.DESCENDING), ("response_speed", pymongo.ASCENDING),
38 |              ("update_time", pymongo.DESCENDING)])
39 |         return proxy[0]['ip']
40 |
41 |     def add_failed_time(self, ip):
42 |         proxy = self.collection.find_one({'ip': ip})
43 |         if proxy is not None:
44 |             failed_count = proxy['failed_count'] + 1
45 |             utils.log("ip: %s failed count +1, failed %s times so far" % (ip, failed_count))
46 |             if failed_count <= FAILED_COUNT_BORDER:
47 |                 try:
48 |                     self.collection.update_one({'ip': ip}, {"$set": {'update_time': utils.get_utc_time(), 'failed_count': failed_count}})
49 |                 except Exception:
50 |                     pass
51 |             else:
52 |                 try:
53 |                     self.collection.delete_one({'ip': ip})
54 |                 except Exception:
55 |                     pass
56 |                 self.crawl_proxy_task()
57 |
58 |     def crawl_proxy_task(self, check_num: bool = True):
59 |         if check_num:
60 |             count = self.collection.count_documents({})
61 |             if count > MIN_PROXY_COUNT:
62 |                 return
63 |         utils.log("Start crawling proxies")
64 |         proxy_list = proxy_strategy.crawl_proxy()
65 |         utils.log("Start saving")
66 |         for proxy in proxy_list:
67 |             if not self.collection.find_one({'ip': proxy.ip}):
68 |                 self.collection.insert_one(proxy.__dict__)
69 |                 utils.log('Saved: ' + proxy.ip)
70 |         utils.log("Finished saving")
71 |
72 |     def check_ip_availability_task(self):
73 |         last_check_time = self.redis_client.get(REDIS_KEY_LAST_CHECK_IP_TIME)
74 |         now_time = datetime.utcnow().timestamp()
75 |         if last_check_time is not None and (now_time - float(last_check_time)) < (TASK_INTERVAL * 60):
76 |             return
77 |         self.redis_client.set(REDIS_KEY_LAST_CHECK_IP_TIME, now_time)
78 |
79 |         proxy_list = self.collection.find()
80 |         for proxy in proxy_list:
81 |             ip = proxy['ip']
82 |             start_time = time.time()
83 |             response = utils.http_request('http://www.baidu.com', timeout=10)  # note: this request is not routed through the proxy being checked
84 |             is_success = response.status_code == 200
85 |             response.close()
86 |             if not is_success:
87 |                 try:
88 |                     self.collection.delete_one({'ip': ip})
89 |                 except Exception:
90 |                     pass
91 |                 utils.log('Check ip %s FAILED' % ip)
92 |             else:
93 |                 elapsed = round(time.time() - start_time, 4)
94 |                 try:
95 |                     self.collection.update_one({'ip': ip},
96 |                                                {"$set": {'update_time': utils.get_utc_time(), 'response_speed': elapsed, 'validity': True}})
97 |                 except Exception:
98 |                     pass
99 |                 utils.log('Check ip %s SUCCESS' % ip)
100 |
101 |     def start(self):
102 |         self.crawl_proxy_task(False)
103 |
104 |         def task():
105 |             self.check_ip_availability_task()
106 |             schedule = Scheduler()
107 |             schedule.every(10).minutes.do(self.check_ip_availability_task)
108 |
109 |             while True:
110 |                 schedule.run_pending()
111 |                 time.sleep(1)
112 |
113 |         thread = threading.Thread(target=task)
114 |         thread.start()
115 |
116 |     def drop_proxy(self):
117 |         self.collection.delete_many({})
118 |
119 |
120 | proxy_pool = ProxyPool()
121 |
--------------------------------------------------------------------------------
/tubatu/tubatu/service/design_picture_service.py:
--------------------------------------------------------------------------------
1 | from tubatu.items import DesignPictureItem
2 | from tubatu.model.design_picture import DesignPictureModel, DesignPictureSummaryModel
3 | from tubatu.service.design_service import DesignService
4 |
5 | from msic.common import log
6 | from msic.common import utils
7 | from msic.core.service import mongodb_service
8 | from tubatu import config
9 |
10 |
11 | class DesignPictureService(DesignService):
12 |     TABLE_NAME = "design_picture"
13 |     TABLE_NAME_SUMMARY = "design_picture_summary"
14 |     REDIS_KEY = "tubatu_design_picture_filter"
15 |
16 |     def __init__(self):
17 |         super(DesignPictureService, self).__init__()
18 |         self.summary_collection = mongodb_service.get_collection(config.mongodb, self.TABLE_NAME_SUMMARY)
19 |
20 |     def get_design_picture_model(self, design_picture_item: DesignPictureItem) -> DesignPictureModel:
21 |         design_picture_model = DesignPictureModel()
22 |         design_picture_model.id = utils.get_uuid()
23 |         design_picture_model.fid = design_picture_item['fid']
24 |         design_picture_model.title = design_picture_item['title']
25 |         design_picture_model.sub_title = design_picture_item['sub_title']
26 |         design_picture_model.html_url = design_picture_item['html_url']
27 |         design_picture_model.tags = design_picture_item['tags']
28 |         design_picture_model.description = design_picture_item['description']
29 |         design_picture_model.img_url = design_picture_item['img_url']
30 |         design_picture_model.img_width = design_picture_item['img_width']
31 |         design_picture_model.img_height = design_picture_item['img_height']
32 |         design_picture_model.img_name = design_picture_item['img_name']
33 |         design_picture_model.create_time = utils.get_utc_time()
34 |         return design_picture_model
35 |
36 |     def create_design_picture_summary_model(self, design_picture_model: DesignPictureModel) -> DesignPictureSummaryModel:
37 |         design_picture_summary_model = DesignPictureSummaryModel()
38 |         design_picture_summary_model.id = design_picture_model.fid
39 |         design_picture_summary_model.cid = [design_picture_model.id]
40 |         design_picture_summary_model.title = design_picture_model.title
41 |         design_picture_summary_model.description = design_picture_model.description
42 |         design_picture_summary_model.tags = design_picture_model.tags
43 |         design_picture_summary_model.html_url = design_picture_model.html_url
44 |         design_picture_summary_model.create_time = utils.get_utc_time()
45 |         design_picture_summary_model.update_time = design_picture_summary_model.create_time
46 |         design_picture_summary_model.cover_img_url = design_picture_model.img_url
47 |         design_picture_summary_model.cover_img_width = design_picture_model.img_width
48 |         design_picture_summary_model.cover_img_height = design_picture_model.img_height
49 |         design_picture_summary_model.cover_img_name = design_picture_model.img_name
50 |         return design_picture_summary_model
51 |
52 |     def handle_item(self, design_picture_item: DesignPictureItem):
53 |         if self.is_duplicate_url(design_picture_item['img_url']):
54 |             return
55 |         design_picture_model = self.get_design_picture_model(design_picture_item)
56 |         self.save_to_database(self.collection, design_picture_model)
57 |
58 |         summary_model = self.find_one(self.summary_collection, {'id': design_picture_model.fid})
59 |         if summary_model is None:
60 |             summary_model = self.create_design_picture_summary_model(design_picture_model)
61 |             self.save_to_database(self.summary_collection, summary_model)
62 |         else:
63 |             tags = list(set(summary_model['tags']).union(set(design_picture_model.tags)))
64 |             summary_model['cid'].append(design_picture_model.id)
65 |             self.update_one(self.summary_collection, {'id': summary_model['id']},
66 |                             {'update_time': utils.get_utc_time(), 'tags': tags, 'cid': summary_model['cid']})
67 |         self.insert_to_redis(design_picture_model.img_url)
68 |
69 |         log.info("=========================================================================================")
70 |         log.info("title:" + design_picture_item['title'])
71 |         log.info("sub_title:" + design_picture_item['sub_title'])
72 |         log.info("original_width:" + design_picture_item['img_width'])
73 |         log.info("original_height:" + design_picture_item['img_height'])
74 |         log.info("html_url:" + design_picture_item['html_url'])
75 |         log.info("img_url:" + design_picture_item['img_url'])
76 |         log.info("description:" + design_picture_item['description'])
77 |         log.info("tags:%s" % ','.join(map(str, design_picture_item['tags'])))
78 |         log.info("=========================================================================================")
79 |
--------------------------------------------------------------------------------
/msic/proxy/proxy_strategy.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 |
3 | from msic.common import utils
4 | from msic.proxy.proxy import Proxy
5 | import time
6 |
7 |
8 | class GetProxyStrategy(object):
9 |     URL = ''
10 |
11 |     def __init__(self):
12 |         self.content = ''
13 |
14 |     def execute(self):
15 |         self.content = utils.http_request(self.URL).text
16 |
17 |
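# The concrete strategies below each override URL (plus NAME and, for some, a
# SPEED threshold), generally let the base execute() download the listing page
# into self.content, then parse that HTML and return a list of 'ip:port'
# strings; crawl_proxy() at the end of this file wraps them into Proxy records.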
18 | class GetXiciProxyStrategy(GetProxyStrategy):
19 |     SPEED = 100
20 |     NAME = 'Xici'
21 |
22 |     def execute(self):
23 |         super(GetXiciProxyStrategy, self).execute()
24 |         ip = []
25 |         soup = BeautifulSoup(self.content, 'html.parser')
26 |         ip_list = soup.find('table', id='ip_list')
27 |         ip_tr_list = ip_list.find_all('tr', limit=101)
28 |         for index, ip_tr in enumerate(ip_tr_list):
29 |             if index == 0:
30 |                 continue
31 |             ip_td = ip_tr.find_all('td')
32 |             address = ''
33 |             port = ''
34 |             is_high_quality = True
35 |             for num, data in enumerate(ip_td):
36 |                 if num == 1:
37 |                     address = data.getText()
38 |                 elif num == 2:
39 |                     port = data.getText()
40 |                 elif num == 6 or num == 7:
41 |                     try:
42 |                         value = data.find('div', class_='bar').find('div').attrs['style']  # type: str
43 |                         is_high_quality = is_high_quality and int(value.replace('width:', '').replace('%', '')) > self.SPEED
44 |                     except Exception:
45 |                         break
46 |                 elif num > 7:
47 |                     break
48 |             if is_high_quality:
49 |                 ip.append(address + ':' + port)
50 |         return ip
51 |
52 |
53 | class GetXiciChinaProxyStrategy(GetXiciProxyStrategy):
54 |     URL = 'http://www.xicidaili.com/nn/'
55 |     SPEED = 85
56 |
57 |
58 | class GetXiciForeignProxyStrategy(GetXiciProxyStrategy):
59 |     URL = 'http://www.xicidaili.com/wn/'
60 |     SPEED = 80
61 |
62 |
63 | class Get66ipProxyStrategy(GetProxyStrategy):
64 |     NAME = '66ip'
65 |     URL = 'http://www.66ip.cn/nmtq.php?getnum=800&isp=0&anonymoustype=4&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip'
66 |
67 |     def execute(self):
68 |         super(Get66ipProxyStrategy, self).execute()
69 |         soup = BeautifulSoup(self.content, 'html.parser')
70 |         ip = []
71 |         for br in soup.findAll('br'):
72 |             ip.append(br.next.strip())
73 |         return ip
74 |
75 |
76 | class GetKuaidailiProxyStrategy(GetProxyStrategy):
77 |     NAME = 'Kuaidaili'
78 |     URL = 'http://www.kuaidaili.com/free/inha/%s/'
79 |     SPEED = 5
80 |
81 |     def execute(self):
82 |         ip = []
83 |         for num in range(1, 10):
84 |             url = self.URL % num
85 |             context = utils.http_request(url).text
86 |             ip = ip + self.parse(context)
87 |             time.sleep(3)
88 |         return ip
89 |
90 |     def parse(self, content) -> list:
91 |         ip = []
92 |         soup = BeautifulSoup(content, 'html.parser')
93 |         ip_table = soup.find('tbody')
94 |         ip_tr_list = ip_table.find_all('tr')
95 |         for ip_tr in ip_tr_list:
96 |             ip_td = ip_tr.find_all('td')
97 |             address = ''
98 |             port = ''
99 |             is_high_quality = True
100 |             for num, data in enumerate(ip_td):
101 |                 if num == 0:
102 |                     address = data.getText()
103 |                 elif num == 1:
104 |                     port = data.getText()
105 |                 elif num == 2:
106 |                     is_high_quality = data.getText() == '高匿名'  # '高匿名' is the site's "high anonymity" label
107 |                     if not is_high_quality:
108 |                         break
109 |                 elif num == 6:
110 |                     try:
111 |                         is_high_quality = is_high_quality and float(data.getText()[:-1]) < self.SPEED
112 |                         break
113 |                     except Exception:
114 |                         break
115 |             if is_high_quality:
116 |                 ip.append(address + ':' + port)
117 |         return ip
118 |
119 |
120 | def crawl_proxy() -> list:
121 |     proxy_list = []
122 |
123 |     def get_proxy_list(_strategy):
124 |         _proxy_list = []
125 |         _ip_list = _strategy.execute()
126 |         for ip in _ip_list:
127 |             if ip.strip() == '':
128 |                 continue
129 |             _proxy = Proxy.create(ip, _strategy.NAME)
130 |             _proxy_list.append(_proxy)
131 |         return _proxy_list
132 |
133 |     proxy_list += get_proxy_list(GetKuaidailiProxyStrategy())
134 |     # proxy_list += get_proxy_list(Get66ipProxyStrategy())
135 |     # proxy_list += get_proxy_list(GetXiciChinaProxyStrategy())
136 |     # proxy_list += get_proxy_list(GetXiciForeignProxyStrategy())
137 |     return proxy_list
138 |
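A quick sketch of how another source would plug into this strategy pattern (illustrative only: the class name, the 'Example' label, the URL, and the assumed two-column table are all hypothetical; only GetProxyStrategy, get_proxy_list and crawl_proxy above are real):

# Hypothetical example: 'Example' and proxylist.example.com are placeholders.
class GetExampleProxyStrategy(GetProxyStrategy):
    NAME = 'Example'                            # hypothetical source label
    URL = 'http://proxylist.example.com/free/'  # hypothetical listing page

    def execute(self) -> list:
        super(GetExampleProxyStrategy, self).execute()  # base fills self.content
        soup = BeautifulSoup(self.content, 'html.parser')
        ip = []
        for ip_tr in soup.find_all('tr'):
            ip_td = ip_tr.find_all('td')
            if len(ip_td) >= 2:  # assumed layout: column 0 = address, column 1 = port
                ip.append(ip_td[0].getText().strip() + ':' + ip_td[1].getText().strip())
        return ip

# registered by adding one line inside crawl_proxy():
#     proxy_list += get_proxy_list(GetExampleProxyStrategy())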
--------------------------------------------------------------------------------
/tubatu/tubatu/spiders/design_picture_spider.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import scrapy
4 | from scrapy.linkextractors import LinkExtractor
5 | from scrapy.selector import Selector
6 | from scrapy.spiders import CrawlSpider
7 | from scrapy.spiders import Rule
8 | from tubatu.constants import ZONE_TYPE, STYLE_ID, AREA, COLOR_ID, HX_ID, PART_ID
9 | from tubatu.items import DesignPictureItem
10 | from tubatu.service.design_picture_service import DesignPictureService
11 |
12 | from msic.common import log, constant
13 | from msic.common import utils
14 | from msic.proxy.proxy_pool import proxy_pool
15 | from tubatu import config
16 |
17 |
18 | class DesignPictureSpider(CrawlSpider):
19 |     start_url_domain = 'xiaoguotu.to8to.com'
20 |     name = 'design_picture'
21 |     allowed_domains = ['to8to.com']
22 |     start_urls = ['http://xiaoguotu.to8to.com/tuce/']
23 |     rules = (
24 |         Rule(LinkExtractor(allow=r"/tuce/p_\d+.html"), follow=True, callback='parse_list'),
25 |     )
26 |     custom_settings = {
27 |         'ITEM_PIPELINES': {
28 |             'tubatu.pipelines.DesignPicturePipeline': 302,
29 |         }
30 |     }
31 |     design_picture_service = DesignPictureService()
32 |
33 |     def parse_list(self, response):
34 |         selector = Selector(response)
35 |         items_selector = selector.xpath('//div[@class="xmp_container"]//div[@class="item"]')
36 |         for item_selector in items_selector:
37 |             # http://xiaoguotu.to8to.com/c10037052.html
38 |             cid = item_selector.xpath('div//a/@href').extract()[0][23:-6]
39 |             title = item_selector.xpath('div//a/@title').extract()[0]
40 |             # http://xiaoguotu.to8to.com/getxgtjson.php?a2=0&a12=&a11=10037052&a1=0
41 |             next_url = (constant.PROTOCOL_HTTP + self.start_url_domain + '/case/list?a2=0&a12=&a11={cid}&a1=0').format(cid=cid)
42 |             yield scrapy.Request(next_url, self.parse_content, meta={'cid': cid, 'title': title})
43 |
44 |     def parse_content(self, response):
45 |         uuid = utils.get_uuid()
46 |         cid = response.meta['cid']
47 |         title = response.meta['title']
48 |         try:
49 |             data = json.loads(response.text)
50 |         except ValueError:
51 |             log.warn("-----------------------got json:" + response.text + "------------------------------")
52 |             return
53 |         data_img_list = data['dataImg']
54 |         for _data_img in data_img_list:
55 |             data_album_list = _data_img['album']
56 |             for data_album in data_album_list:
57 |                 data_img = data_album['l']
58 |                 # http://pic.to8to.com/case/1605/05/20160505_f0af86a239d0b02e9635a47ih5l1riuq_sp.jpg
59 |                 img_url = 'http://pic.to8to.com/case/{short_name}'.format(short_name=data_img['s'])
60 |                 if self.design_picture_service.is_duplicate_url(img_url):
61 |                     break
62 |                 sub_title = data_img['t']
63 |                 original_width = data_img['w']
64 |                 original_height = data_img['h']
65 |                 tags = []
66 |                 try:
67 |                     zoom_type = ZONE_TYPE[data_img['zid']]
68 |                     if zoom_type is not None and zoom_type.strip() != '':
69 |                         tags.append(zoom_type)
70 |                 except KeyError:
71 |                     pass
72 |                 try:
73 |                     style_id = STYLE_ID[data_img['sid']]
74 |                     if style_id is not None and style_id.strip() != '':
75 |                         tags.append(style_id)
76 |                 except KeyError:
77 |                     pass
78 |                 try:
79 |                     area = AREA[data_img['a']]
80 |                     if area is not None and area.strip() != '':
81 |                         tags.append(area)
82 |                 except KeyError:
83 |                     pass
84 |                 try:
85 |                     color_id = COLOR_ID[data_img['coid']]
86 |                     if color_id is not None and color_id.strip() != '':
87 |                         tags.append(color_id)
88 |                 except KeyError:
89 |                     pass
90 |                 try:
91 |                     house_type = HX_ID[data_img['hxid']]
92 |                     if house_type is not None and house_type.strip() != '':
93 |                         tags.append(house_type)
94 |                 except KeyError:
95 |                     pass
96 |                 try:
97 |                     part = PART_ID[data_img['pid']]
98 |                     if part is not None and part.strip() != '':
99 |                         tags.append(part)
100 |                 except KeyError:
101 |                     pass
102 |                 try:
103 |                     design_picture_item = DesignPictureItem()  # type: DesignPictureItem
104 |                     design_picture_item['fid'] = uuid
105 |                     design_picture_item['html_url'] = response.url
106 |                     design_picture_item['img_url'] = img_url
107 |                     design_picture_item['tags'] = tags
108 |                     design_picture_item['title'] = title
109 |                     design_picture_item['sub_title'] = sub_title
110 |                     design_picture_item['img_width'] = str(original_width)
111 |                     design_picture_item['img_height'] = str(original_height)
112 |                     design_picture_item['description'] = design_picture_item['title']
113 |                     yield design_picture_item
114 |                 except Exception as e:
115 |                     log.warn("-----------------------got json:" + response.text + "------------------------------")
116 |                     log.warn("%s ( refer: %s )" % (e, response.url))
117 |                     if config.USE_PROXY:
118 |                         proxy_pool.add_failed_time(response.meta['proxy'].replace('http://', ''))
119 |
--------------------------------------------------------------------------------
/msic/common/agents.py:
--------------------------------------------------------------------------------
1 | AGENTS_ALL = [ 2 | "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)", 3 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5", 4 | "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9", 5 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7", 6 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14", 7 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14", 8 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20", 9 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27", 10 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1", 11 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2", 12 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7", 13 | "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre", 14 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10", 15 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)", 16 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5", 17 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)", 18 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 19 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 20 | "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0", 21 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2", 22 | "Mozilla/5.0 
(Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1", 23 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre", 24 | "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )", 25 | "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)", 26 | "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a", 27 | "Mozilla/2.02E (Win95; U)", 28 | "Mozilla/3.01Gold (Win95; I)", 29 | "Mozilla/4.8 [en] (Windows NT 5.1; U)", 30 | "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)", 31 | "Opera/7.50 (Windows XP; U)", 32 | "Opera/7.50 (Windows ME; U) [en]", 33 | "Opera/7.51 (Windows NT 5.1; U) [en]", 34 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0", 35 | "Mozilla/5.0 (Windows; U; WinNT4.0; en-US; rv:1.2b) Gecko/20021001 Phoenix/0.2", 36 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.23) Gecko/20090825 SeaMonkey/1.1.18", 37 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/527 (KHTML, like Gecko, Safari/419.3) Arora/0.6 (Change: )", 38 | "Mozilla/5.0 (Windows; U; ; en-NZ) AppleWebKit/527 (KHTML, like Gecko, Safari/419.3) Arora/0.8.0", 39 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser; Avant Browser; .NET CLR 1.0.3705; .NET CLR 1.1.4322; Media Center PC 4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)", 40 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.8 (KHTML, like Gecko) Beamrise/17.2.0.9 Chrome/17.0.939.0 Safari/535.8", 41 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/18.6.872.0 Safari/535.2 UNTRUSTED/1.0 3gpp-gba UNTRUSTED/1.0", 42 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 43 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 44 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 45 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 46 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.1) Gecko/20100101 Firefox/10.0.1", 47 | "Mozilla/5.0 (Windows NT 6.1; rv:12.0) Gecko/20120403211507 Firefox/12.0", 48 | "Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1", 49 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1", 50 | "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0", 51 | "iTunes/9.0.2 (Windows; N)", 52 | "Mozilla/5.0 (compatible; Konqueror/4.5; Windows) KHTML/4.5.4 (like Gecko)", 53 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.1 (KHTML, like Gecko) Maxthon/3.0.8.2 Safari/533.1", 54 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", 55 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)", 56 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)", 57 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/4.0)", 58 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)", 59 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)", 60 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)", 61 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.2; Trident/5.0)", 62 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.2; WOW64; Trident/5.0)", 63 | "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)", 64 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)", 65 | "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; 
Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0", 66 | "Opera/9.25 (Windows NT 6.0; U; en)", 67 | "Opera/9.80 (Windows NT 5.2; U; en) Presto/2.2.15 Version/10.10", 68 | "Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.7.39 Version/11.00", 69 | "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.7.62 Version/11.01", 70 | "Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10", 71 | "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00", 72 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10", 73 | "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8", 74 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5", 75 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12", 76 | "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1", 77 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8", 78 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3", 79 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13", 80 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1", 81 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2", 82 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7", 83 | "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ", 84 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3", 85 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5", 86 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14", 87 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15", 88 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 89 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0", 90 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0", 91 | "Mozilla/4.0 (compatible; MSIE 5.15; Mac_PowerPC)", 92 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-US) AppleWebKit/125.4 (KHTML, like Gecko, Safari) OmniWeb/v563.15", 93 | "Opera/9.0 (Macintosh; PPC Mac OS X; U; en)", 94 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/125.2 (KHTML, like Gecko) Safari/85.8", 95 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/125.2 (KHTML, like Gecko) Safari/125.8", 96 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X; fr-fr) AppleWebKit/312.5 (KHTML, like Gecko) Safari/312.3", 97 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418.8 (KHTML, like Gecko) Safari/419.3", 98 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1 Camino/2.2.1", 99 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0b6pre) Gecko/20100907 Firefox/4.0b6pre 
Camino/2.2a1pre", 100 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 101 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.4 (KHTML like Gecko) Chrome/22.0.1229.79 Safari/537.4", 102 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1", 103 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0", 104 | "iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)", 105 | "iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)", 106 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US) AppleWebKit/528.16 (KHTML, like Gecko, Safari/528.16) OmniWeb/v622.8.0.112941", 107 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/528.16 (KHTML, like Gecko, Safari/528.16) OmniWeb/v622.8.0", 108 | "Opera/9.20 (Macintosh; Intel Mac OS X; U; en)", 109 | "Opera/9.64 (Macintosh; PPC Mac OS X; U; en) Presto/2.1.1", 110 | "Opera/9.80 (Macintosh; Intel Mac OS X; U; en) Presto/2.6.30 Version/10.61", 111 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.4.11; U; en) Presto/2.7.62 Version/11.00", 112 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 113 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; en-us) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10", 114 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", 115 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-us) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27", 116 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_7; en-us) AppleWebKit/534.20.8 (KHTML, like Gecko) Version/5.1 Safari/534.20.8", 117 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10", 118 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.5; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1", 119 | "ELinks (0.4pre5; Linux 2.6.10-ac7 i686; 80x33)", 120 | "ELinks/0.9.3 (textmode; Linux 2.6.9-kanotix-8 i686; 127x41)", 121 | "ELinks/0.12~pre5-4", 122 | "Links/0.9.1 (Linux 2.4.24; i386;)", 123 | "Links (2.1pre15; Linux 2.4.26 i686; 158x61)", 124 | "Links (2.3pre1; Linux 2.6.38-8-generic x86_64; 170x48)", 125 | "Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/0.8.12", 126 | "w3m/0.5.1", 127 | "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.4 (KHTML, like Gecko) Chrome/4.0.237.0 Safari/532.4 Debian", 128 | "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.277.0 Safari/532.8", 129 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.309.0 Safari/532.9", 130 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7", 131 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/540.0 (KHTML, like Gecko) Ubuntu/10.10 Chrome/9.1.0.0 Safari/540.0", 132 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.15 (KHTML, like Gecko) Chrome/10.0.613.0 Safari/534.15", 133 | "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.15 (KHTML, like Gecko) Ubuntu/10.10 Chromium/10.0.613.0 Chrome/10.0.613.0 Safari/534.15", 134 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.24 (KHTML, like Gecko) Ubuntu/10.10 Chromium/12.0.703.0 Chrome/12.0.703.0 Safari/534.24", 135 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.20 
Safari/535.1", 136 | "Mozilla/5.0 Slackware/13.37 (X11; U; Linux x86_64; en-US) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41", 137 | "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/11.04 Chromium/14.0.825.0 Chrome/14.0.825.0 Safari/535.1", 138 | "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Ubuntu/11.10 Chromium/15.0.874.120 Chrome/15.0.874.120 Safari/535.2", 139 | "Mozilla/5.0 (X11; U; Linux; i686; en-US; rv:1.6) Gecko Epiphany/1.2.5", 140 | "Mozilla/5.0 (X11; U; Linux i586; en-US; rv:1.7.3) Gecko/20040924 Epiphany/1.4.4 (Ubuntu)", 141 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.6) Gecko/20040614 Firefox/0.8", 142 | "Mozilla/5.0 (X11; U; Linux x86_64; sv-SE; rv:1.8.1.12) Gecko/20080207 Ubuntu/7.10 (gutsy) Firefox/2.0.0.12", 143 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.11) Gecko/2009060309 Ubuntu/9.10 (karmic) Firefox/3.0.11", 144 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.2) Gecko/20090803 Ubuntu/9.04 (jaunty) Shiretoko/3.5.2", 145 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.5) Gecko/20091107 Firefox/3.5.5", 146 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.3) Gecko/20091020 Linux Mint/8 (Helena) Firefox/3.5.3", 147 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.9) Gecko/20100915 Gentoo Firefox/3.6.9", 148 | "Mozilla/5.0 (X11; U; Linux i686; pl-PL; rv:1.9.0.2) Gecko/20121223 Ubuntu/9.25 (jaunty) Firefox/3.8", 149 | "Mozilla/5.0 (X11; Linux i686; rv:2.0b6pre) Gecko/20100907 Firefox/4.0b6pre", 150 | "Mozilla/5.0 (X11; Linux i686 on x86_64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 151 | "Mozilla/5.0 (X11; Linux i686; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 152 | "Mozilla/5.0 (X11; Linux x86_64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 153 | "Mozilla/5.0 (X11; Linux x86_64; rv:2.2a1pre) Gecko/20100101 Firefox/4.2a1pre", 154 | "Mozilla/5.0 (X11; Linux i686; rv:5.0) Gecko/20100101 Firefox/5.0", 155 | "Mozilla/5.0 (X11; Linux i686; rv:6.0) Gecko/20100101 Firefox/6.0", 156 | "Mozilla/5.0 (X11; Linux x86_64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1", 157 | "Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0", 158 | "Mozilla/5.0 (X11; Linux x86_64; rv:10.0.1) Gecko/20100101 Firefox/10.0.1", 159 | "Mozilla/5.0 (X11; U; Linux; i686; en-US; rv:1.6) Gecko Galeon/1.3.14", 160 | "Mozilla/5.0 (X11; U; Linux ppc; en-US; rv:1.8.1.13) Gecko/20080313 Iceape/1.1.9 (Debian-1.1.9-5)", 161 | "Mozilla/5.0 (X11; U; Linux i686; pt-PT; rv:1.9.2.3) Gecko/20100402 Iceweasel/3.6.3 (like Firefox/3.6.3) GTB7.0", 162 | "Mozilla/5.0 (X11; Linux x86_64; rv:5.0) Gecko/20100101 Firefox/5.0 Iceweasel/5.0", 163 | "Mozilla/5.0 (X11; Linux i686; rv:6.0a2) Gecko/20110615 Firefox/6.0a2 Iceweasel/6.0a2", 164 | "Konqueror/3.0-rc4; (Konqueror/3.0-rc4; i686 Linux;;datecode)", 165 | "Mozilla/5.0 (compatible; Konqueror/3.3; Linux 2.6.8-gentoo-r3; X11;", 166 | "Mozilla/5.0 (compatible; Konqueror/3.5; Linux 2.6.30-7.dmz.1-liquorix-686; X11) KHTML/3.5.10 (like Gecko) (Debian package 4:3.5.10.dfsg.1-1 b1)", 167 | "Mozilla/5.0 (compatible; Konqueror/3.5; Linux; en_US) KHTML/3.5.6 (like Gecko) (Kubuntu)", 168 | "Mozilla/5.0 (X11; Linux x86_64; en-US; rv:2.0b2pre) Gecko/20100712 Minefield/4.0b2pre", 169 | "Mozilla/5.0 (X11; U; Linux; i686; en-US; rv:1.6) Gecko Debian/1.6-7", 170 | "MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23", 171 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1) Gecko/20061024 Firefox/2.0 (Swiftfox)", 172 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527 (KHTML, like Gecko, Safari/419.3) 
Arora/0.10.1", 173 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 174 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 175 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.4 (KHTML like Gecko) Chrome/22.0.1229.56 Safari/537.4", 176 | "Mozilla/4.0 (compatible; Dillo 3.0)", 177 | "Mozilla/5.0 (X11; U; Linux i686; en-us) AppleWebKit/528.5 (KHTML, like Gecko, Safari/528.5 ) lt-GtkLauncher", 178 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.16) Gecko/20120421 Gecko Firefox/11.0", 179 | "Mozilla/5.0 (X11; Linux i686; rv:12.0) Gecko/20100101 Firefox/12.0 ", 180 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:14.0) Gecko/20100101 Firefox/14.0.1", 181 | "Mozilla/5.0 (X11; Linux i686; rv:16.0) Gecko/20100101 Firefox/16.0", 182 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Galeon/2.0.6 (Ubuntu 2.0.6-2)", 183 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.16) Gecko/20080716 (Gentoo) Galeon/2.0.6", 184 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.13) Gecko/20100916 Iceape/2.0.8", 185 | "Mozilla/5.0 (X11; Linux i686; rv:14.0) Gecko/20100101 Firefox/14.0.1 Iceweasel/14.0.1", 186 | "Mozilla/5.0 (X11; Linux x86_64; rv:15.0) Gecko/20120724 Debian Iceweasel/15.02", 187 | "Mozilla/5.0 (compatible; Konqueror/4.2; Linux) KHTML/4.2.4 (like Gecko) Slackware/13.0", 188 | "Mozilla/5.0 (compatible; Konqueror/4.3; Linux) KHTML/4.3.1 (like Gecko) Fedora/4.3.1-3.fc11", 189 | "Mozilla/5.0 (compatible; Konqueror/4.4; Linux) KHTML/4.4.1 (like Gecko) Fedora/4.4.1-1.fc12", 190 | "Mozilla/5.0 (compatible; Konqueror/4.4; Linux 2.6.32-22-generic; X11; en_US) KHTML/4.4.3 (like Gecko) Kubuntu", 191 | "Midori/0.1.10 (X11; Linux i686; U; en-us) WebKit/(531).(2) ", 192 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.0.3) Gecko/2008092814 (Debian-3.0.1-1)", 193 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9a3pre) Gecko/20070330", 194 | "Opera/9.64 (X11; Linux i686; U; Linux Mint; nb) Presto/2.1.1", 195 | "Opera/9.80 (X11; Linux i686; U; en) Presto/2.2.15 Version/10.10", 196 | "Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00", 197 | "Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.34 (KHTML, like Gecko) QupZilla/1.2.0 Safari/534.34", 198 | "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.17) Gecko/20110123 SeaMonkey/2.0.12", 199 | "Mozilla/5.0 (X11; Linux i686; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1", 200 | "Mozilla/5.0 (X11; U; Linux x86_64; us; rv:1.9.1.19) Gecko/20110430 shadowfox/7.0 (like Firefox/7.0", 201 | "Mozilla/5.0 (X11; U; Linux i686; it; rv:1.9.2.3) Gecko/20100406 Firefox/3.6.3 (Swiftfox)", 202 | "Uzbl (Webkit 1.3) (Linux i686 [i686])", 203 | "ELinks (0.4.3; NetBSD 3.0.2PATCH sparc64; 141x19)", 204 | "Links (2.1pre15; FreeBSD 5.3-RELEASE i386; 196x84)", 205 | "Lynx/2.8.7dev.4 libwww-FM/2.14 SSL-MM/1.4.1 OpenSSL/0.9.8d", 206 | "w3m/0.5.1", 207 | "Mozilla/5.0 (X11; U; FreeBSD i386; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.207.0 Safari/532.0", 208 | "Mozilla/5.0 (X11; U; OpenBSD i386; en-US) AppleWebKit/533.3 (KHTML, like Gecko) Chrome/5.0.359.0 Safari/533.3", 209 | "Mozilla/5.0 (X11; U; FreeBSD x86_64; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.204 Safari/534.16", 210 | "Mozilla/5.0 (X11; U; SunOS sun4m; en-US; rv:1.4b) Gecko/20030517 Mozilla Firebird/0.6", 211 | "Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.6) Gecko/20040406 Galeon/1.3.15", 212 | "Mozilla/5.0 (compatible; 
Konqueror/3.5; NetBSD 4.0_RC3; X11) KHTML/3.5.7 (like Gecko)", 213 | "Mozilla/5.0 (compatible; Konqueror/3.5; SunOS) KHTML/3.5.1 (like Gecko)", 214 | "Mozilla/5.0 (X11; U; FreeBSD; i386; en-US; rv:1.7) Gecko", 215 | "Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)", 216 | "Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)", 217 | "Mozilla/5.0 (Unknown; U; UNIX BSD/SYSV system; C -) AppleWebKit/527 (KHTML, like Gecko, Safari/419.3) Arora/0.10.2", 218 | "Mozilla/5.0 (X11; FreeBSD amd64) AppleWebKit/536.5 (KHTML like Gecko) Chrome/19.0.1084.56 Safari/536.5", 219 | "Mozilla/5.0 (X11; FreeBSD amd64) AppleWebKit/537.4 (KHTML like Gecko) Chrome/22.0.1229.79 Safari/537.4", 220 | "Mozilla/5.0 (X11; U; OpenBSD arm; en-us) AppleWebKit/531.2 (KHTML, like Gecko) Safari/531.2 Epiphany/2.30.0", 221 | "Mozilla/5.0 (X11; U; FreeBSD amd64; en-us) AppleWebKit/531.2 (KHTML, like Gecko) Safari/531.2 Epiphany/2.30.0", 222 | "Mozilla/5.0 (X11; U; SunOS i86pc; en-US; rv:1.9.1b3) Gecko/20090429 Firefox/3.1b3", 223 | "Mozilla/5.0 (X11; U; OpenBSD i386; en-US; rv:1.9.1) Gecko/20090702 Firefox/3.5", 224 | "Mozilla/5.0 (X11; U; FreeBSD i386; de-CH; rv:1.9.2.8) Gecko/20100729 Firefox/3.6.8", 225 | "Mozilla/5.0 (X11; FreeBSD amd64; rv:5.0) Gecko/20100101 Firefox/5.0", 226 | "Mozilla/5.0 (compatible; Konqueror/4.1; DragonFly) KHTML/4.1.4 (like Gecko)", 227 | "Mozilla/5.0 (compatible; Konqueror/4.1; OpenBSD) KHTML/4.1.4 (like Gecko)", 228 | "Mozilla/5.0 (compatible; Konqueror/4.5; NetBSD 5.0.2; X11; amd64; en_US) KHTML/4.5.4 (like Gecko)", 229 | "Mozilla/5.0 (compatible; Konqueror/4.5; FreeBSD) KHTML/4.5.4 (like Gecko)", 230 | "Mozilla/5.0 (X11; U; NetBSD amd64; en-US; rv:1.9.2.15) Gecko/20110308 Namoroka/3.6.15", 231 | "NetSurf/1.2 (NetBSD; amd64)", 232 | "Opera/9.80 (X11; FreeBSD 8.1-RELEASE i386; Edition Next) Presto/2.12.388 Version/12.10", 233 | "Mozilla/5.0 (X11; U; SunOS i86pc; en-US; rv:1.8.1.12) Gecko/20080303 SeaMonkey/1.1.8", 234 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; BOLT/2.800) AppleWebKit/534.6 (KHTML, like Gecko) Version/5.0 Safari/534.6.3", 235 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows CE; IEMobile 6.12; Microsoft ZuneHD 4.3)", 236 | "Mozilla/1.22 (compatible; MSIE 5.01; PalmOS 3.0) EudoraWeb 2.1", 237 | "Mozilla/5.0 (WindowsCE 6.0; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 238 | "Mozilla/5.0 (X11; U; Linux armv61; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1", 239 | "Mozilla/5.0 (Maemo; Linux armv7l; rv:2.0.1) Gecko/20100101 Firefox/4.0.1 Fennec/2.0.1", 240 | "Mozilla/5.0 (Maemo; Linux armv7l; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 Fennec/10.0.1", 241 | "Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016", 242 | "Mozilla/5.0 (X11; U; Linux armv6l; rv 1.8.1.5pre) Gecko/20070619 Minimo/0.020", 243 | "Mozilla/5.0 (X11; U; Linux arm7tdmi; rv:1.8.1.11) Gecko/20071130 Minimo/0.025", 244 | "Mozilla/4.0 (PDA; PalmOS/sony/model prmr/Revision:1.1.54 (en)) NetFront/3.0", 245 | "Opera/9.51 Beta (Microsoft Windows; PPC; Opera Mobi/1718; U; en)", 246 | "Opera/9.60 (J2ME/MIDP; Opera Mini/4.1.11320/608; U; en) Presto/2.2.0", 247 | "Opera/9.60 (J2ME/MIDP; Opera Mini/4.2.14320/554; U; cs) Presto/2.2.0", 248 | "Opera/9.80 (S60; SymbOS; Opera Mobi/499; U; ru) Presto/2.4.18 Version/10.00", 249 | "Opera/10.61 (J2ME/MIDP; Opera Mini/5.1.21219/19.999; en-US; rv:1.9.3a5) WebKit/534.5 Presto/2.6.30", 250 | "POLARIS/6.01 (BREW 3.1.5; U; en-us; LG; LX265; POLARIS/6.01/WAP) MMP/2.0 profile/MIDP-2.1 Configuration/CLDC-1.1", 251 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) 
AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 252 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; ja-jp) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", 253 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_2_1 like Mac OS X; da-dk) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", 254 | "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25", 255 | "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", 256 | "Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10", 257 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; ja-jp) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", 258 | "Mozilla/5.0 (iPad; U; CPU OS 4_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8F190 Safari/6533.18.5", 259 | "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25", 260 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_7;en-us) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Safari/530.17", 261 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0", 262 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 263 | "Mozilla/5.0 (Linux; U; Android 1.5; de-de; Galaxy Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 264 | "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 265 | "Mozilla/5.0 (Linux; U; Android 3.0.1; en-us; GT-P7100 Build/HRI83) AppleWebkit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", 266 | "Mozilla/4.0 (compatible; Linux 2.6.22) NetFront/3.4 Kindle/2.0 (screen 600x800)", 267 | "Mozilla/5.0 (Linux U; en-US) AppleWebKit/528.5 (KHTML, like Gecko, Safari/528.5 ) Version/4.0 Kindle/3.0 (screen 600x800; rotate)", 268 | "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", 269 | "Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10", 270 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; ja-jp) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", 271 | "Mozilla/5.0 (iPad; U; CPU OS 4_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8F190 Safari/6533.18.5", 272 | "Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420 (KHTML, like Gecko) Version/3.0 Mobile/1A543a Safari/419.3", 273 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 2_0 like Mac OS X; en-us) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5A347 Safari/525.200", 274 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16", 275 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_0 like Mac OS X; en-us) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A293 Safari/531.22.7", 276 | "Mozilla/5.0 (iPhone; U; CPU iPhone 
OS 4_2_1 like Mac OS X; da-dk) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", 277 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 5_1_1 like Mac OS X; da-dk) AppleWebKit/534.46.0 (KHTML, like Gecko) CriOS/19.0.1084.60 Mobile/9B206 Safari/7534.48.3", 278 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 2_2_1 like Mac OS X; en-us) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5H11a Safari/525.20", 279 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 3_1_1 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Mobile/7C145", 280 | "nook browser/1.0", 281 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_7;en-us) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Safari/530.17", 282 | "Mozilla/5.0 (Linux; U; Android 2.3.4; en-us; BNTV250 Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Safari/533.1", 283 | "BlackBerry7100i/4.1.0 Profile/MIDP-2.0 Configuration/CLDC-1.1 VendorID/103", 284 | "BlackBerry8300/4.2.2 Profile/MIDP-2.0 Configuration/CLDC-1.1 VendorID/107 UP.Link/6.2.3.15.0", 285 | "BlackBerry8320/4.2.2 Profile/MIDP-2.0 Configuration/CLDC-1.1 VendorID/100", 286 | "BlackBerry8330/4.3.0 Profile/MIDP-2.0 Configuration/CLDC-1.1 VendorID/105", 287 | "BlackBerry9000/4.6.0.167 Profile/MIDP-2.0 Configuration/CLDC-1.1 VendorID/102", 288 | "BlackBerry9530/4.7.0.167 Profile/MIDP-2.0 Configuration/CLDC-1.1 VendorID/102 UP.Link/6.3.1.20.0", 289 | "BlackBerry9700/5.0.0.351 Profile/MIDP-2.1 Configuration/CLDC-1.1 VendorID/123", 290 | "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1 (KHTML, Like Gecko) Version/6.0.0.141 Mobile Safari/534.1", 291 | "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 292 | "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 293 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 294 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0", 295 | "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 296 | "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 297 | "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 298 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 299 | "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 300 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 301 | "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 302 | "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3", 303 | "Mozilla/5.0 (Linux; U; Android 4.0.3; de-ch; HTC Sensation Build/IML74K) AppleWebKit/534.30 (KHTML, 
like Gecko) Version/4.0 Mobile Safari/534.30", 304 | "HTC-ST7377/1.59.502.3 (67150) Opera/9.50 (Windows NT 5.1; U; en) UP.Link/6.3.1.17.0", 305 | "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 306 | "LG-LX550 AU-MIC-LX550/2.0 MMP/2.0 Profile/MIDP-2.0 Configuration/CLDC-1.1", 307 | "POLARIS/6.01(BREW 3.1.5;U;en-us;LG;LX265;POLARIS/6.01/WAP;)MMP/2.0 profile/MIDP-201 Configuration /CLDC-1.1", 308 | "LG-GC900/V10a Obigo/WAP2.0 Profile/MIDP-2.1 Configuration/CLDC-1.1", 309 | "Mozilla/4.0 (compatible; MSIE 4.01; Windows CE; PPC; MDA Pro/1.0 Profile/MIDP-2.0 Configuration/CLDC-1.1)", 310 | "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 311 | "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1", 312 | "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 313 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 314 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 315 | "MOT-L7v/08.B7.5DR MIB/2.2.1 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Link/6.3.0.0.0", 316 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 317 | "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 318 | "MOT-V9mm/00.62 UP.Browser/6.2.3.4.c.1.123 (GUI) MMP/2.0", 319 | "MOTORIZR-Z8/46.00.00 Mozilla/4.0 (compatible; MSIE 6.0; Symbian OS; 356) Opera 8.65 [it] UP.Link/6.3.0.0.0", 320 | "MOT-V177/0.1.75 UP.Browser/6.2.3.9.c.12 (GUI) MMP/2.0 UP.Link/6.3.1.13.0", 321 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 322 | "portalmmm/2.0 N410i(c20;TB) ", 323 | "Nokia3230/2.0 (5.0614.0) SymbianOS/7.0s Series60/2.1 Profile/MIDP-2.0 Configuration/CLDC-1.0", 324 | "Mozilla/5.0 (SymbianOS/9.2; U; Series60/3.1 Nokia5700/3.27; Profile/MIDP-2.0 Configuration/CLDC-1.1) AppleWebKit/413 (KHTML, like Gecko) Safari/413", 325 | "Mozilla/5.0 (SymbianOS/9.2; U; Series60/3.1 Nokia6120c/3.70; Profile/MIDP-2.0 Configuration/CLDC-1.1) AppleWebKit/413 (KHTML, like Gecko) Safari/413", 326 | "Nokia6230/2.0 (04.44) Profile/MIDP-2.0 Configuration/CLDC-1.1", 327 | "Nokia6230i/2.0 (03.80) Profile/MIDP-2.0 Configuration/CLDC-1.1", 328 | "Mozilla/4.1 (compatible; MSIE 5.0; Symbian OS; Nokia 6600;452) Opera 6.20 [en-US]", 329 | "Nokia6630/1.0 (2.39.15) SymbianOS/8.0 Series60/2.6 Profile/MIDP-2.0 Configuration/CLDC-1.1", 330 | "Nokia7250/1.0 (3.14) Profile/MIDP-1.0 Configuration/CLDC-1.0", 331 | "Mozilla/4.0 (compatible; MSIE 5.0; Series80/2.0 Nokia9500/4.51 Profile/MIDP-2.0 Configuration/CLDC-1.1)", 332 | "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaC6-01/011.010; Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/525 (KHTML, like Gecko) Version/3.0 BrowserNG/7.2.7.2 3gpp-gba", 333 | "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaC7-00/012.003; Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/525 (KHTML, like Gecko) Version/3.0 
BrowserNG/7.2.7.3 3gpp-gba", 334 | "Mozilla/5.0 (SymbianOS/9.1; U; en-us) AppleWebKit/413 (KHTML, like Gecko) Safari/413 es50", 335 | "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaE6-00/021.002; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/533.4 (KHTML, like Gecko) NokiaBrowser/7.3.1.16 Mobile Safari/533.4 3gpp-gba", 336 | "Mozilla/5.0 (SymbianOS/9.1; U; en-us) AppleWebKit/413 (KHTML, like Gecko) Safari/413 es65", 337 | "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaE7-00/010.016; Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/525 (KHTML, like Gecko) Version/3.0 BrowserNG/7.2.7.3 3gpp-gba", 338 | "Mozilla/5.0 (SymbianOS/9.1; U; en-us) AppleWebKit/413 (KHTML, like Gecko) Safari/413 es70", 339 | "Mozilla/5.0 (SymbianOS/9.2; U; Series60/3.1 NokiaE90-1/07.24.0.3; Profile/MIDP-2.0 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413 UP.Link/6.2.3.18.0", 340 | "NokiaN70-1/5.0609.2.0.1 Series60/2.8 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Link/6.3.1.13.0", 341 | "Mozilla/5.0 (SymbianOS/9.1; U; en-us) AppleWebKit/413 (KHTML, like Gecko) Safari/413", 342 | "NokiaN73-1/3.0649.0.0.1 Series60/3.0 Profile/MIDP2.0 Configuration/CLDC-1.1", 343 | "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaN8-00/014.002; Profile/MIDP-2.1 Configuration/CLDC-1.1; en-us) AppleWebKit/525 (KHTML, like Gecko) Version/3.0 BrowserNG/7.2.6.4 3gpp-gba", 344 | "Mozilla/5.0 (SymbianOS/9.1; U; en-us) AppleWebKit/413 (KHTML, like Gecko) Safari/413", 345 | "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13", 346 | "Mozilla/5.0 (SymbianOS/9.1; U; de) AppleWebKit/413 (KHTML, like Gecko) Safari/413", 347 | "Mozilla/5.0 (SymbianOS/9.2; U; Series60/3.1 NokiaN95/10.0.018; Profile/MIDP-2.0 Configuration/CLDC-1.1) AppleWebKit/413 (KHTML, like Gecko) Safari/413 UP.Link/6.3.0.0.0", 348 | "Mozilla/5.0 (MeeGo; NokiaN950-00/00) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13", 349 | "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/10.0.012; Profile/MIDP-2.1 Configuration/CLDC-1.1; en-us) AppleWebKit/525 (KHTML, like Gecko) WicKed/7.1.12344", 350 | "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaX7-00/021.004; Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/533.4 (KHTML, like Gecko) NokiaBrowser/7.3.1.21 Mobile Safari/533.4 3gpp-gba", 351 | "Mozilla/5.0 (webOS/1.3; U; en-US) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/1.0 Safari/525.27.1 Desktop/1.0", 352 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; PalmSource/hspr-H102; Blazer/4.0) 16;320x320", 353 | "SEC-SGHE900/1.0 NetFront/3.2 Profile/MIDP-2.0 Configuration/CLDC-1.1 Opera/8.01 (J2ME/MIDP; Opera Mini/2.0.4509/1378; nl; U; ssr)", 354 | "Mozilla/5.0 (Linux; U; Android 1.5; de-de; Galaxy Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 355 | "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 356 | "Mozilla/5.0 (Linux; U; Android 4.0.3; de-de; Galaxy S II Build/GRJ22) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30", 357 | "Mozilla/5.0 (Linux; U; Android 3.0.1; en-us; GT-P7100 Build/HRI83) AppleWebkit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", 358 | "SAMSUNG-S8000/S8000XXIF3 SHP/VPP/R5 Jasmine/1.0 Nextreaming SMM-MMS/1.2.0 profile/MIDP-2.1 configuration/CLDC-1.1 FirePHP/0.3", 359 | "Mozilla/5.0 (Linux; U; Android 1.5; en-us; SPH-M900 Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile 
Safari/525.20.1", 360 | "SAMSUNG-SGH-A867/A867UCHJ3 SHP/VPP/R5 NetFront/35 SMM-MMS/1.2.0 profile/MIDP-2.0 configuration/CLDC-1.1 UP.Link/6.3.0.0.0", 361 | "SEC-SGHX210/1.0 UP.Link/6.3.1.13.0", 362 | "Mozilla/5.0 (Linux; U; Android 1.5; fr-fr; GT-I5700 Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 363 | "SEC-SGHX820/1.0 NetFront/3.2 Profile/MIDP-2.0 Configuration/CLDC-1.1", 364 | "SonyEricssonK310iv/R4DA Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Link/6.3.1.13.0", 365 | "SonyEricssonK550i/R1JD Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1", 366 | "SonyEricssonK610i/R1CB Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1", 367 | "SonyEricssonK750i/R1CA Browser/SEMC-Browser/4.2 Profile/MIDP-2.0 Configuration/CLDC-1.1", 368 | "Opera/9.80 (J2ME/MIDP; Opera Mini/5.0.16823/1428; U; en) Presto/2.2.0", 369 | "SonyEricssonK800i/R1CB Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Link/6.3.0.0.0", 370 | "SonyEricssonK810i/R1KG Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1", 371 | "Opera/8.01 (J2ME/MIDP; Opera Mini/1.0.1479/HiFi; SonyEricsson P900; no; U; ssr)", 372 | "SonyEricssonS500i/R6BC Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1", 373 | "Mozilla/5.0 (SymbianOS/9.4; U; Series60/5.0 SonyEricssonP100/01; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) Version/3.0 Safari/525", 374 | "SonyEricssonT68/R201A", 375 | "SonyEricssonT100/R101", 376 | "SonyEricssonT610/R201 Profile/MIDP-1.0 Configuration/CLDC-1.0", 377 | "SonyEricssonT650i/R7AA Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1", 378 | "SonyEricssonW580i/R6BC Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1", 379 | "SonyEricssonW660i/R6AD Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1", 380 | "SonyEricssonW810i/R4EA Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Link/6.3.0.0.0", 381 | "SonyEricssonW850i/R1ED Browser/NetFront/3.3 Profile/MIDP-2.0 Configuration/CLDC-1.1", 382 | "SonyEricssonW950i/R100 Mozilla/4.0 (compatible; MSIE 6.0; Symbian OS; 323) Opera 8.60 [en-US]", 383 | "SonyEricssonW995/R1EA Profile/MIDP-2.1 Configuration/CLDC-1.1 UNTRUSTED/1.0", 384 | "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 385 | "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 386 | "Opera/9.5 (Microsoft Windows; PPC; Opera Mobi; U) SonyEricssonX1i/R2AA Profile/MIDP-2.0 Configuration/CLDC-1.1", 387 | "SonyEricssonZ800/R1Y Browser/SEMC-Browser/4.1 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Link/6.3.0.0.0", 388 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; winfx; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Zune 2.0) ", 389 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows CE; IEMobile 6.12; Microsoft ZuneHD 4.3)", 390 | "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3", 391 | "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 392 | "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 393 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 
394 | "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 395 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 396 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 397 | "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 398 | "Mozilla/5.0 (Android; Linux armv7l; rv:2.0.1) Gecko/20100101 Firefox/4.0.1 Fennec/2.0.1", 399 | "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", 400 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 401 | "Mozilla/5.0 (Linux; U; Android 4.0.3; de-ch; HTC Sensation Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30", 402 | "Mozilla/5.0 (Linux; U; Android 4.0.3; de-de; Galaxy S II Build/GRJ22) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30", 403 | "Opera/9.80 (Android 4.0.4; Linux; Opera Mobi/ADR-1205181138; U; pl) Presto/2.10.254 Version/12.00", 404 | "Mozilla/5.0 (Android; Linux armv7l; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 Fennec/10.0.1", 405 | "Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420 (KHTML, like Gecko) Version/3.0 Mobile/1A543a Safari/419.3", 406 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 2_0 like Mac OS X; en-us) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5A347 Safari/525.200", 407 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 2_2_1 like Mac OS X; en-us) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5H11a Safari/525.20", 408 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16", 409 | "Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10", 410 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; ja-jp) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", 411 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_2_1 like Mac OS X; da-dk) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", 412 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3 like Mac OS X; de-de) AppleWebKit/533.17.9 (KHTML, like Gecko) Mobile/8F190", 413 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 5_1_1 like Mac OS X; da-dk) AppleWebKit/534.46.0 (KHTML, like Gecko) CriOS/19.0.1084.60 Mobile/9B206 Safari/7534.48.3", 414 | "Mozilla/5.0 (X11; Linux i686 on x86_64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1 Fennec/2.0.1", 415 | "Mozilla/5.0 (Maemo; Linux armv7l; rv:2.0.1) Gecko/20100101 Firefox/4.0.1 Fennec/2.0.1", 416 | "Mozilla/5.0 (webOS/1.3; U; en-US) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/1.0 Safari/525.27.1 Desktop/1.0", 417 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; PalmSource/hspr-H102; Blazer/4.0) 16;320x320", 418 | "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaN8-00/014.002; Profile/MIDP-2.1 Configuration/CLDC-1.1; en-us) AppleWebKit/525 (KHTML, like Gecko) Version/3.0 BrowserNG/7.2.6.4 3gpp-gba", 419 | "Mozilla/5.0 (Symbian/3; Series60/5.2 NokiaX7-00/021.004; Profile/MIDP-2.1 Configuration/CLDC-1.1 ) 
AppleWebKit/533.4 (KHTML, like Gecko) NokiaBrowser/7.3.1.21 Mobile Safari/533.4 3gpp-gba", 420 | "Mozilla/5.0 (SymbianOS/9.2; U; Series60/3.1 NokiaE90-1/07.24.0.3; Profile/MIDP-2.0 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413 UP.Link/6.2.3.18.0", 421 | "Mozilla/5.0 (SymbianOS 9.4; Series60/5.0 NokiaN97-1/10.0.012; Profile/MIDP-2.1 Configuration/CLDC-1.1; en-us) AppleWebKit/525 (KHTML, like Gecko) WicKed/7.1.12344", 422 | "Opera/9.80 (S60; SymbOS; Opera Mobi/499; U; ru) Presto/2.4.18 Version/10.00", 423 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows CE; IEMobile 6.12; Microsoft ZuneHD 4.3)", 424 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows Phone OS 7.0; Trident/3.1; IEMobile/7.0) Asus;Galaxy6", 425 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0)", 426 | "DoCoMo/2.0 SH901iC(c100;TB;W24H12)", 427 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.7) Gecko/20060909 Firefox/1.5.0.7 MG(Novarra-Vision/6.9)", 428 | "Mozilla/4.0 (compatible; MSIE 6.0; j2me) ReqwirelessWeb/3.5", 429 | "Vodafone/1.0/V802SE/SEJ001 Browser/SEMC-Browser/4.1", 430 | "BlackBerry7520/4.0.0 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Browser/5.0.3.3 UP.Link/5.1.2.12 (Google WAP Proxy/1.0)", 431 | "Nokia6100/1.0 (04.01) Profile/MIDP-1.0 Configuration/CLDC-1.0", 432 | "Nokia6630/1.0 (2.3.129) SymbianOS/8.0 Series60/2.6 Profile/MIDP-2.0 Configuration/CLDC-1.1", 433 | "Mozilla/2.0 (compatible; Ask Jeeves/Teoma)", 434 | "Baiduspider ( http://www.baidu.com/search/spider.htm)", 435 | "Mozilla/5.0 (compatible; bingbot/2.0 http://www.bing.com/bingbot.htm)", 436 | "Mozilla/5.0 (compatible; Exabot/3.0; http://www.exabot.com/go/robot) ", 437 | "FAST-WebCrawler/3.8 (crawler at trd dot overture dot com; http://www.alltheweb.com/help/webmaster/crawler)", 438 | "AdsBot-Google ( http://www.google.com/adsbot.html)", 439 | "Mozilla/5.0 (compatible; Googlebot/2.1; http://www.google.com/bot.html)", 440 | "Googlebot/2.1 ( http://www.googlebot.com/bot.html)", 441 | "Googlebot-Image/1.0", 442 | "Mediapartners-Google", 443 | "DoCoMo/2.0 N905i(c100;TB;W24H16) (compatible; Googlebot-Mobile/2.1; http://www.google.com/bot.html)", 444 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS) (compatible; Googlebot-Mobile/2.1; http://www.google.com/bot.html)", 445 | "SAMSUNG-SGH-E250/1.0 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Browser/6.2.3.3.c.1.101 (GUI) MMP/2.0 (compatible; Googlebot-Mobile/2.1; http://www.google.com/bot.html)", 446 | "Googlebot-News", 447 | "Googlebot-Video/1.0", 448 | "Mozilla/4.0 (compatible; GoogleToolbar 4.0.1019.5266-big; Windows XP 5.1; MSIE 6.0.2900.2180)", 449 | "Mozilla/5.0 (en-us) AppleWebKit/525.13 (KHTML, like Gecko; Google Web Preview) Version/3.1 Safari/525.13", 450 | "msnbot/1.0 ( http://search.msn.com/msnbot.htm)", 451 | "msnbot/1.1 ( http://search.msn.com/msnbot.htm)", 452 | "msnbot/0.11 ( http://search.msn.com/msnbot.htm)", 453 | "msnbot-media/1.1 ( http://search.msn.com/msnbot.htm)", 454 | "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)", 455 | "Mozilla/5.0 (compatible; Yahoo! 
Slurp China; http://misc.yahoo.com.cn/help.html)", 456 | "EmailWolf 1.00", 457 | "Gaisbot/3.0 (robot@gais.cs.ccu.edu.tw; http://gais.cs.ccu.edu.tw/robot.php)", 458 | "grub-client-1.5.3; (grub-client-1.5.3; Crawl your own stuff with http://grub.org)", 459 | "Gulper Web Bot 0.2.4 (www.ecsl.cs.sunysb.edu/~maxim/cgi-bin/Link/GulperBot)", 460 | "Mozilla/3.0 (compatible; NetPositive/2.1.1; BeOS)", 461 | "Mozilla/5.0 (BeOS; U; BeOS BePC; en-US; rv:1.9a1) Gecko/20060702 SeaMonkey/1.5a", 462 | "Download Demon/3.5.0.11", 463 | "Offline Explorer/2.5", 464 | "SuperBot/4.4.0.60 (Windows XP)", 465 | "WebCopier v4.6", 466 | "Web Downloader/6.9", 467 | "WebZIP/3.5 (http://www.spidersoft.com)", 468 | "Wget/1.9 cvs-stable (Red Hat modified)", 469 | "Wget/1.9.1", 470 | "Bloglines/3.1 (http://www.bloglines.com)", 471 | "everyfeed-spider/2.0 (http://www.everyfeed.com)", 472 | "FeedFetcher-Google; ( http://www.google.com/feedfetcher.html)", 473 | "Gregarius/0.5.2 ( http://devlog.gregarius.net/docs/ua)", 474 | "Mozilla/5.0 (PLAYSTATION 3; 2.00)", 475 | "Mozilla/5.0 (PLAYSTATION 3; 1.10)", 476 | "Mozilla/4.0 (PSP (PlayStation Portable); 2.00)", 477 | "Opera/9.30 (Nintendo Wii; U; ; 2047-7; en)", 478 | "wii libnup/1.0", 479 | "Java/1.6.0_13", 480 | "libwww-perl/5.820", 481 | "Peach/1.01 (Ubuntu 8.04 LTS; U; en)", 482 | "Python-urllib/2.5", 483 | "HTMLParser/1.6", 484 | "Jigsaw/2.2.5 W3C_CSS_Validator_JFouffa/2.0", 485 | "W3C_Validator/1.654", 486 | "W3C_Validator/1.305.2.12 libwww-perl/5.64", 487 | "P3P Validator", 488 | "CSSCheck/1.2.2", 489 | "WDG_Validator/1.6.2", 490 | "facebookscraper/1.0( http://www.facebook.com/sharescraper_help.php)", 491 | "grub-client-1.5.3; (grub-client-1.5.3; Crawl your own stuff with http://grub.org)", 492 | "iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)", 493 | "Microsoft URL Control - 6.00.8862", 494 | "SearchExpress", 495 | ] 496 | AGENT_GOOGLE_IMAGE = ["Googlebot-Image/1.0"] 497 | # AGENTS=["Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"] 498 | AGENTS = ["Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0"] 499 | --------------------------------------------------------------------------------
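Note: the constants above (AGENT_GOOGLE_IMAGE and AGENTS) are plain Python lists, so rotating them in Scrapy only takes a tiny downloader middleware. The sketch below is illustrative rather than the project's verified implementation; the import path msic.common.agents, the middleware name, and the settings path are assumptions inferred from the constants in this file.

import random

from msic.common.agents import AGENTS  # assumed import path; adjust to the real module


class RandomUserAgentMiddleware(object):
    # Scrapy downloader middleware: attach a randomly chosen User-Agent
    # to every outgoing request. Returning None (implicitly) lets Scrapy
    # continue processing the request normally.
    def process_request(self, request, spider):
        # With AGENTS pinned to a single entry, as it is above, this is
        # effectively a fixed header; point it at the full list to get
        # real rotation.
        request.headers.setdefault('User-Agent', random.choice(AGENTS))

To enable such a middleware, register it under DOWNLOADER_MIDDLEWARES in the project's settings.py, e.g. {'msic.scrapy.middlewares.RandomUserAgentMiddleware': 400} (module path assumed).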