├── baidu ├── format.json ├── format.txt ├── sys │ ├── timeout_urls │ ├── xici_proxy │ └── proxy_ip_port ├── baidu_spider.py ├── baidu_tieba.py ├── out │ ├── 30 │ ├── 50 │ ├── 52 │ └── 115 ├── _gevent.py ├── xici_proxy.py ├── tst_bd.py └── filter_proxy.py ├── mop ├── hot_topic_urls ├── mop.py └── sys │ ├── hot_topic_urls │ ├── msg_urls │ └── comment_urls ├── jd ├── auto │ ├── log │ │ ├── crawled_page_url │ │ └── failed_page_url │ └── sys │ │ └── auto_item_id ├── electronic_jd │ ├── nohup.out │ ├── log │ │ ├── crawled_page_url │ │ ├── timeout_page_url │ │ └── elec_failed_url │ ├── jiadian_jd.py │ └── electronic_name.py ├── sports │ ├── sys │ │ └── sports_item_id │ └── log │ │ ├── crawled_page_url │ │ └── failed_page_url ├── computer │ ├── log │ │ ├── crawled_page_url │ │ ├── failed_id_url │ │ ├── timeout_page_url │ │ └── crawled_id_url │ ├── sys │ │ └── computer_item_id │ └── out │ │ └── 2014_06_03_16_comnputer ├── baby_toys │ ├── baby │ │ ├── log │ │ │ ├── crawled_page_url │ │ │ └── failed_page_url │ │ └── sys │ │ │ └── baby_item_id │ └── toys │ │ ├── log │ │ ├── crawled_page_url │ │ └── failed_page_url │ │ └── sys │ │ └── toys_item_id ├── shoes_bag_watch │ ├── bag │ │ ├── sys │ │ │ └── bag_item_id │ │ └── log │ │ │ ├── crawled_page_url │ │ │ └── failed_page_url │ ├── shoes │ │ ├── sys │ │ │ └── shoes_item_id │ │ └── log │ │ │ ├── crawled_page_url │ │ │ └── failed_page_url │ └── watch │ │ ├── sys │ │ └── watch_item_id │ │ └── log │ │ ├── crawled_page_url │ │ └── failed_page_url ├── __init__.py ├── book_jd │ ├── __init__.py │ ├── xici_proxy.pyc │ ├── book_name.py │ ├── xici_proxy.py │ └── book_jd.py ├── clothing_jewellery │ ├── clothing │ │ ├── log │ │ │ ├── crawled_page_url │ │ │ └── failed_page_url │ │ └── sys │ │ │ └── clothing_item_id │ └── jewellery │ │ ├── log │ │ ├── crawled_page_url │ │ └── failed_page_url │ │ └── sys │ │ └── jewellery_item_id ├── food_wine_freshfood │ └── freshfood │ │ ├── log │ │ ├── crawled_page_url │ │ └── failed_page_url │ │ └── sys │ │ └── freshfood_item_id ├── phone_digital │ └── digital │ │ └── __init__.py ├── home_furniture_decoration_kitchenware │ ├── decoration │ │ ├── log │ │ │ ├── crawled_page_url │ │ │ └── failed_page_url │ │ └── sys │ │ │ └── decoration_item_id │ ├── furniture │ │ ├── log │ │ │ ├── crawled_page_url │ │ │ └── failed_page_url │ │ └── sys │ │ │ └── furniture_item_id │ └── kitchenware │ │ └── log │ │ ├── failed_page_url │ │ └── crawled_page_url ├── config.ini ├── xici_proxy ├── read_failed_page_url.py ├── varify_proxy.py ├── read_config.py └── read_failed_item_id.py ├── __init__.py ├── baidu_word_freq ├── baidu_word_freq ├── sys │ ├── HZOut.txt │ └── word_without_freq_0528.txt ├── log │ └── timeout_url_0528 ├── .idea │ ├── scopes │ │ └── scope_settings.xml │ ├── encodings.xml │ ├── vcs.xml │ ├── misc.xml │ ├── modules.xml │ ├── baidu_word_freq.iml │ └── workspace.xml ├── multi_thread_get_freq.py ├── clear_data.py ├── get_word_freq.py └── tst_freq.py ├── dangdang ├── food │ ├── log │ │ ├── failed_page_url │ │ └── crawled_page_url │ └── sys │ │ └── food_item_id ├── life │ ├── log │ │ ├── failed_page_url │ │ └── crawled_page_url │ └── sys │ │ └── life_item_id ├── appliance │ ├── log │ │ ├── failed_page_url │ │ └── crawled_page_url │ └── sys │ │ ├── appliance_item_id │ │ └── whole_page_url ├── clothing │ └── log │ │ ├── failed_page_url │ │ └── crawled_page_url ├── computer │ ├── log │ │ ├── failed_page_url │ │ └── crawled_page_url │ └── sys │ │ └── computer_item_id ├── sports │ ├── log │ │ ├── failed_page_url │ │ └── crawled_page_url │ └── sys │ │ └── clothing_item_id ├── baby_kid │ ├── baby │ │ ├── log │ │ │ ├── failed_page_url │ │ │ └── crawled_page_url │ │ └── sys │ │ │ └── baby_item_id │ └── kid │ │ ├── log │ │ ├── failed_page_url │ │ └── crawled_page_url │ │ └── sys │ │ └── kid_item_id ├── book │ ├── log │ │ ├── crawled_failed_page_url │ │ ├── crawled_page_url │ │ └── failed_page_url │ ├── __init__.py │ └── sys │ │ ├── topic_url │ │ └── item_id_url ├── beauty_adult │ ├── adult │ │ ├── log │ │ │ ├── failed_page_url │ │ │ └── crawled_page_url │ │ └── sys │ │ │ └── adult_item_id │ └── beauty │ │ ├── log │ │ ├── failed_page_url │ │ └── crawled_page_url │ │ └── sys │ │ └── beauty_item_id ├── living_auto │ ├── auto │ │ ├── log │ │ │ ├── failed_page_url │ │ │ └── crawled_page_url │ │ └── sys │ │ │ └── auto_item_id │ └── living │ │ ├── log │ │ ├── failed_page_url │ │ └── crawled_page_url │ │ └── sys │ │ └── living_item_id ├── shoes_bag │ └── shoes │ │ └── log │ │ ├── failed_page_url │ │ └── crawled_page_url ├── decoration_health │ └── health │ │ ├── log │ │ ├── failed_page_url │ │ └── crawled_page_url │ │ └── sys │ │ └── health_item_id ├── mobile_digital │ ├── digital │ │ ├── log │ │ │ ├── failed_page_url │ │ │ └── crawled_page_url │ │ └── sys │ │ │ └── digital_item_id │ └── mobile │ │ ├── log │ │ ├── failed_page_url │ │ └── crawled_page_url │ │ └── sys │ │ └── mobile_item_id └── jewellery_glass_watch │ ├── glass_watch │ ├── log │ │ ├── failed_page_url │ │ └── crawled_page_url │ └── sys │ │ └── glass_watch_item_id │ └── jewellery │ ├── log │ ├── failed_page_url │ └── crawled_page_url │ └── sys │ └── jewellery_item_id ├── yhd ├── food_milk │ ├── log │ │ └── failed_page_url │ ├── __init__.py │ ├── tst_rewrite.py │ └── remove_repeat_ids.py └── __init__.py ├── douban ├── __init__.py ├── douban_book │ ├── __init__.py │ ├── readme.md │ ├── conf │ │ └── mysql.json │ ├── result.py │ ├── config.json │ ├── db │ │ ├── douban_book.sql │ │ └── load_config.py │ ├── result_db.py │ └── douban_books.py ├── douban.py ├── sys │ ├── xici_proxy │ └── douban_crawled_urls ├── movie_html.py ├── xici_proxy.py ├── douban_single_thread.py ├── movie_actors.py └── movie_actors_single_thread.py ├── sohu ├── __init__.py ├── sohu_spider.pyc ├── log │ └── sohu_url_crawled.txt ├── sohu.py └── sys │ ├── hot_topic_url │ └── realtime_page_url ├── proxy ├── __init__.py ├── sys │ └── proxy_ip_port ├── you.py ├── proxy.py ├── tst_proxy.py └── xici.py ├── README.md ├── requirements.txt ├── music ├── music.py ├── tst_music.py └── baidu_music.py └── sina ├── zip_file.py └── sina_news.py /baidu/format.json: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baidu/format.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mop/hot_topic_urls: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baidu/sys/timeout_urls: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/auto/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/auto/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/auto/sys/auto_item_id: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/electronic_jd/nohup.out: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/sports/sys/sports_item_id: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | #!-*- coding:utf-8 -*- -------------------------------------------------------------------------------- /baidu_word_freq/baidu_word_freq: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/food/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/life/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/computer/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/computer/log/failed_id_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/computer/log/timeout_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/sports/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/sports/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /yhd/food_milk/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/appliance/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/clothing/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/computer/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/sports/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/baby_toys/baby/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/baby_toys/baby/sys/baby_item_id: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/baby_toys/toys/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/baby_toys/toys/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/baby_toys/toys/sys/toys_item_id: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/electronic_jd/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/electronic_jd/log/timeout_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/shoes_bag_watch/bag/sys/bag_item_id: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/baby_kid/baby/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/baby_kid/kid/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/book/log/crawled_failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /douban/__init__.py: -------------------------------------------------------------------------------- 1 | #!-*- coding:utf-8 -*- -------------------------------------------------------------------------------- /jd/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | -------------------------------------------------------------------------------- /jd/computer/sys/computer_item_id: -------------------------------------------------------------------------------- 1 | 1079594 -------------------------------------------------------------------------------- /jd/shoes_bag_watch/bag/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/shoes_bag_watch/bag/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/shoes_bag_watch/shoes/sys/shoes_item_id: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/shoes_bag_watch/watch/sys/watch_item_id: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sohu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | -------------------------------------------------------------------------------- /yhd/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | -------------------------------------------------------------------------------- /baidu/baidu_spider.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | -------------------------------------------------------------------------------- /dangdang/beauty_adult/adult/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/beauty_adult/beauty/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/living_auto/auto/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/living_auto/living/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/shoes_bag/shoes/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/shoes_bag_watch/shoes/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/shoes_bag_watch/shoes/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/shoes_bag_watch/watch/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/shoes_bag_watch/watch/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /proxy/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | -------------------------------------------------------------------------------- /baidu/baidu_tieba.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | 3 | -------------------------------------------------------------------------------- /dangdang/book/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | -------------------------------------------------------------------------------- /dangdang/decoration_health/health/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/mobile_digital/digital/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/mobile_digital/mobile/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /douban/douban_book/__init__.py: -------------------------------------------------------------------------------- 1 | #!-*- coding:utf-8 -*- -------------------------------------------------------------------------------- /jd/book_jd/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | -------------------------------------------------------------------------------- /jd/clothing_jewellery/clothing/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/clothing_jewellery/clothing/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/clothing_jewellery/clothing/sys/clothing_item_id: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/clothing_jewellery/jewellery/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/clothing_jewellery/jewellery/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/clothing_jewellery/jewellery/sys/jewellery_item_id: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/food_wine_freshfood/freshfood/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/food_wine_freshfood/freshfood/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /yhd/food_milk/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | -------------------------------------------------------------------------------- /jd/electronic_jd/jiadian_jd.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | -------------------------------------------------------------------------------- /jd/food_wine_freshfood/freshfood/sys/freshfood_item_id: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/jewellery_glass_watch/glass_watch/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/jewellery_glass_watch/jewellery/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/phone_digital/digital/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | -------------------------------------------------------------------------------- /jd/computer/log/crawled_id_url: -------------------------------------------------------------------------------- 1 | http://item.jd.com/1079594.html 2 | -------------------------------------------------------------------------------- /jd/home_furniture_decoration_kitchenware/decoration/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/home_furniture_decoration_kitchenware/decoration/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/home_furniture_decoration_kitchenware/furniture/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/home_furniture_decoration_kitchenware/furniture/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/home_furniture_decoration_kitchenware/furniture/sys/furniture_item_id: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/home_furniture_decoration_kitchenware/kitchenware/log/failed_page_url: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /jd/home_furniture_decoration_kitchenware/decoration/sys/decoration_item_id: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dangdang/food/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4005726-pg1.html 2 | -------------------------------------------------------------------------------- /dangdang/computer/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4002590-pg1.html 2 | -------------------------------------------------------------------------------- /dangdang/sports/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4002385-pg1.html 2 | -------------------------------------------------------------------------------- /douban/douban_book/readme.md: -------------------------------------------------------------------------------- 1 | #豆瓣图书爬虫 2 | 3 | ###抓取数据内容:: 4 | * 书名,图书id,作者,初版年,评分,评论人数等 -------------------------------------------------------------------------------- /dangdang/baby_kid/baby/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4002055-pg1.html 2 | -------------------------------------------------------------------------------- /dangdang/baby_kid/kid/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4009359-pg1.html 2 | -------------------------------------------------------------------------------- /dangdang/book/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cp01.05.16.00.00.00.html 2 | 3 | -------------------------------------------------------------------------------- /sohu/sohu_spider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wanghuafeng/e-business/HEAD/sohu/sohu_spider.pyc -------------------------------------------------------------------------------- /dangdang/beauty_adult/adult/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4009498-pg1.html 2 | -------------------------------------------------------------------------------- /dangdang/beauty_adult/beauty/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4003626-pg1.html 2 | -------------------------------------------------------------------------------- /dangdang/living_auto/auto/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4006001-pg1.html 2 | -------------------------------------------------------------------------------- /dangdang/living_auto/living/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4009489-pg1.html 2 | -------------------------------------------------------------------------------- /dangdang/mobile_digital/digital/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4004279-pg1.html 2 | -------------------------------------------------------------------------------- /jd/book_jd/xici_proxy.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wanghuafeng/e-business/HEAD/jd/book_jd/xici_proxy.pyc -------------------------------------------------------------------------------- /baidu_word_freq/sys/HZOut.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wanghuafeng/e-business/HEAD/baidu_word_freq/sys/HZOut.txt -------------------------------------------------------------------------------- /dangdang/jewellery_glass_watch/glass_watch/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4004292-pg1.html 2 | -------------------------------------------------------------------------------- /dangdang/jewellery_glass_watch/jewellery/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4009624-pg1.html 2 | -------------------------------------------------------------------------------- /jd/baby_toys/baby/log/failed_page_url: -------------------------------------------------------------------------------- 1 | not match id_str in url;http://list.jd.com/list.html?cat=1319,1523,7052&page=1 2 | -------------------------------------------------------------------------------- /baidu_word_freq/log/timeout_url_0528: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wanghuafeng/e-business/HEAD/baidu_word_freq/log/timeout_url_0528 -------------------------------------------------------------------------------- /dangdang/life/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4003357-pg1.html 2 | http://category.dangdang.com/cid4003359-pg1.html 3 | -------------------------------------------------------------------------------- /dangdang/appliance/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4009643-pg1.html 2 | http://category.dangdang.com/cid4009643-pg2.html 3 | -------------------------------------------------------------------------------- /baidu_word_freq/sys/word_without_freq_0528.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wanghuafeng/e-business/HEAD/baidu_word_freq/sys/word_without_freq_0528.txt -------------------------------------------------------------------------------- /dangdang/shoes_bag/shoes/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4001870-pg1.html 2 | http://category.dangdang.com/cid4001870-pg2.html 3 | -------------------------------------------------------------------------------- /dangdang/mobile_digital/mobile/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4001123-pg1.html 2 | http://category.dangdang.com/cid4001123-pg2.html 3 | -------------------------------------------------------------------------------- /douban/douban_book/conf/mysql.json: -------------------------------------------------------------------------------- 1 | { 2 | "host":"127.0.0.1", 3 | "port":"3306", 4 | "database":"resultdb", 5 | "user":"root", 6 | "passwd":"root" 7 | } -------------------------------------------------------------------------------- /dangdang/decoration_health/health/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4009498-pg1.html 2 | http://category.dangdang.com/cid4009498-pg2.html 3 | -------------------------------------------------------------------------------- /proxy/sys/proxy_ip_port: -------------------------------------------------------------------------------- 1 | 1.179.147.2:8080 2 | 1.230.127.54:8080 3 | 5.101.130.95:80 4 | 5.223.112.253:8080 5 | 14.1.43.82:80 6 | 14.18.17.166:80 7 | 14.18.242.147:8080 8 | -------------------------------------------------------------------------------- /dangdang/clothing/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://category.dangdang.com/cid4008149-pg1.html 2 | http://category.dangdang.com/cid4008149-pg2.html 3 | http://category.dangdang.com/cid4008149-pg3.html 4 | -------------------------------------------------------------------------------- /baidu_word_freq/.idea/scopes/scope_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /baidu_word_freq/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /baidu_word_freq/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /baidu_word_freq/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /sohu/log/sohu_url_crawled.txt: -------------------------------------------------------------------------------- 1 | 2014-04-24 12:32:26,713 - sohu - DEBUG - url crawled success in http://club.news.sohu.com/minjian/threads/p1?type=all&order=rtime 2 | 2014-04-25 09:02:50,783 - sohu - DEBUG - url crawled success in http://club.news.sohu.com/minjian/threads/p1?type=all&order=rtime 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 公网爬虫系统 2 | 3 | * 电商商品信息抓取: 4 | * 京东 5 | * 国美 6 | * 一号店 7 | * 当当 8 | * 论坛内容: 9 | * 猫扑 10 | * 搜狐 11 | * IP代理(公网免费代理ip) 12 | * 新浪爬虫 13 | * 每日实时新闻 14 | * 全微博数据抓取 15 | * 全网微博评论数据抓取 16 | * 百度音乐爬虫 17 | * 豆瓣电影 18 | * 电影(最新以及历史电影) 19 | * 图书 20 | -------------------------------------------------------------------------------- /jd/computer/out/2014_06_03_16_comnputer: -------------------------------------------------------------------------------- 1 | 华硕(ASUS) R409VC 14英寸笔记本 (i5-3230M 4G 7200转500G GT720M 2G独显 D刻 深灰色) 2 | 商品名称:华硕R409VC 3 | 商品编号:1079594 4 | 品牌:华硕(ASUS) 5 | 上架时间:2014-03-14 09:52:55 6 | 商品毛重:3.3kg 7 | 商品产地:中国大陆 8 | 显卡:性能级独显触控:非触控厚度:正常厚度(>25mm)处理器:Intel i5尺寸:14英寸 9 | 10 | 11 | 如果您发现商品信息不准确,欢迎纠错 12 | -------------------------------------------------------------------------------- /mop/mop.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | 3 | from apscheduler.scheduler import Scheduler 4 | 5 | from mop_crawler import MopCrawl 6 | 7 | scheduler = Scheduler(daemonic = False) 8 | 9 | @scheduler.cron_schedule(hour=23,minute='55') 10 | def mop_spider(): 11 | mop = MopCrawl() 12 | mop.main() 13 | 14 | scheduler.start() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | Flask 3 | gevent 4 | gunicorn 5 | flask-redis 6 | pymongo 7 | MySQL-python==1.2.5 8 | redis 9 | SQLAlchemy 10 | Crypto 11 | pycrypto 12 | flask_sqlalchemy 13 | happybase==1.1.0 14 | pyexecjs 15 | beautifulsoup4 16 | lxml 17 | chardet 18 | pyDes 19 | Pillow 20 | pyspider==0.3.9 21 | mysql-connector==2.1.4 -------------------------------------------------------------------------------- /music/music.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | 3 | from apscheduler.scheduler import Scheduler 4 | 5 | from baidu_music import BaiduMusic 6 | 7 | scheduler = Scheduler(daemonic = False) 8 | 9 | @scheduler.cron_schedule(hour=23,minute='20') 10 | def music_spider(): 11 | music = BaiduMusic() 12 | music.main() 13 | 14 | scheduler.start() 15 | -------------------------------------------------------------------------------- /sohu/sohu.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | 3 | from apscheduler.scheduler import Scheduler 4 | 5 | from sohu_spider import SohuSpider 6 | 7 | scheduler = Scheduler(daemonic = False) 8 | 9 | @scheduler.cron_schedule(hour=23, minute='55') 10 | def sohu_crawler(): 11 | sohu = SohuSpider() 12 | sohu.main() 13 | scheduler.start() 14 | 15 | -------------------------------------------------------------------------------- /baidu_word_freq/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /douban/douban.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | 3 | from apscheduler.scheduler import Scheduler 4 | 5 | from douban_spider import DoubanCrawler 6 | 7 | scheduler = Scheduler(daemonic = False) 8 | 9 | @scheduler.cron_schedule(hour=22, minute='55') 10 | def douban_crawler(): 11 | douban = DoubanCrawler() 12 | douban.main() 13 | scheduler.start() 14 | 15 | -------------------------------------------------------------------------------- /douban/douban_book/result.py: -------------------------------------------------------------------------------- 1 | #!-*- coding:utf-8 -*- 2 | import logging 3 | from pyspider.result.result_worker import OneResultWorker 4 | # from result_db import db 5 | 6 | logger = logging.getLogger("ResultWorker") 7 | 8 | class DoubanResultWorker(OneResultWorker): 9 | 10 | def on_result(self, task, result): 11 | if not result: 12 | return 13 | # db._replace(**result) 14 | -------------------------------------------------------------------------------- /baidu_word_freq/.idea/baidu_word_freq.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /baidu/out/115: -------------------------------------------------------------------------------- 1 | {"url": "http://baike.baidu.com/view/115.htm", "header": "\u91c7\u4fdd\u8d39", "content": "\u91c7\u8d2d\u53ca\u4fdd\u7ba1\u8d39\u7684\u7b80\u79f0\u4e00\u822c\u7528\u4e8e\u5efa\u7b51\u884c\u4e1a\u4e0d\u540c\u7684\u5730\u65b9\u6709\u4e0d\u540c\u7684\u91c7\u4fdd\u8d39\u7387\u7684\u89c4\u5b9a\u4e00\u822c\u8ba1\u7b97\u516c\u5f0f\u662f\u91c7\u4fdd\u8d39=(\u539f\u4ef7+\u5305\u88c5\u8d39+\u4f9b\u9500\u90e8\u95e8\u624b\u7eed\u8d39+\u8fd0\u6742\u8d39)X\u91c7\u8d2d\u53ca\u4fdd\u7ba1\u8d39\u7387"} -------------------------------------------------------------------------------- /dangdang/book/log/failed_page_url: -------------------------------------------------------------------------------- 1 | div do not match pattern in url;http://category.dangdang.com/pg79-cp01.43.50.00.00.00.html 2 | div do not match pattern in url;http://category.dangdang.com/pg33-cp01.43.64.00.00.00.html 3 | div do not match pattern in url;http://category.dangdang.com/pg62-cp01.43.67.00.00.00.html 4 | div do not match pattern in url;http://category.dangdang.com/pg68-cp01.21.08.00.00.00.html 5 | div do not match pattern in url;http://category.dangdang.com/pg68-cp01.54.02.00.00.00.html 6 | -------------------------------------------------------------------------------- /douban/douban_book/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resultdb": "mysql+resultdb://root:root@127.0.0.1:3306/resultdb", 3 | "taskdb": "mysql+taskdb://root:root@127.0.0.1:3306/taskdb", 4 | "projectdb": "mysql+projectdb://root:root@127.0.0.1:3306/projectdb", 5 | "message_queue": "redis://127.0.0.1:6379/0", 6 | "queue-maxsize": 600, 7 | "debug": true, 8 | "webui": { 9 | "port": 51001 10 | }, 11 | "result_worker": { 12 | "result_cls": "result.DoubanResultWorker" 13 | } 14 | } -------------------------------------------------------------------------------- /sohu/sys/hot_topic_url: -------------------------------------------------------------------------------- 1 | http://club.news.sohu.com/minjian/threads 2 | http://yule.club.sohu.com/bagua/threads 3 | http://club.women.sohu.com/shopping/threads 4 | http://club.cul.sohu.com/chuyushe/threads 5 | http://club.baobao.sohu.com/mom_daugh/threads 6 | http://club.sports.sohu.com/allgame/threads 7 | http://club.chihe.sohu.com/food/threads 8 | http://club.news.sohu.com/zz0580/threads 9 | http://club.travel.sohu.com/togher/threads 10 | http://club.learning.sohu.com/szhengzhi/threads 11 | http://club.stock.sohu.com/stock/threads 12 | http://club.money.sohu.com/licai/threads 13 | http://club.women.sohu.com/zz0894/threads 14 | http://club.health.sohu.com/nutrition/threads 15 | -------------------------------------------------------------------------------- /yhd/food_milk/tst_rewrite.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | import codecs 3 | import os 4 | PATH = os.path.dirname(os.path.abspath(__file__)) 5 | filename = os.path.join(PATH, 'item_id_bak/fruit_item_id') 6 | with codecs.open(filename, encoding='utf-8') as f: 7 | item_id_list= f.readlines() 8 | item_id_set = set(item_id_list) 9 | list_length, set_length = len(item_id_list), len(item_id_set) 10 | print list_length, set_length 11 | # item_id_set = set(f.readlines()) 12 | # print len(item_id_list), len(item_id_set) 13 | # item_id_set = set(f.readlines()) 14 | # with codecs.open(filename, mode='wb', encoding='utf-8') as wf: 15 | # wf.writelines(item_id_set) 16 | -------------------------------------------------------------------------------- /mop/sys/hot_topic_urls: -------------------------------------------------------------------------------- 1 | http://tt.mop.com/topic/list_209_210_0_0.html 2 | http://tt.mop.com/topic/list_1_8_0_0.html 3 | http://tt.mop.com/topic/list_94_48_0_0.html 4 | http://tt.mop.com/topic/list_70_19_0_0.html 5 | http://tt.mop.com/topic/list_209_43_0_0.html 6 | http://tt.mop.com/topic/list_213_17_0_0.html 7 | http://tt.mop.com/topic/list_412_51_0_0.html 8 | http://tt.mop.com/topic/list_394_13_0_0.html 9 | http://tt.mop.com/topic/list_213_214_0_0.html 10 | http://tt.mop.com/topic/list_213_217_0_0.html 11 | http://tt.mop.com/topic/list_462_438_0_0.html 12 | http://tt.mop.com/topic/list_291_294_0_0.html 13 | http://tt.mop.com/topic/list_1_14_0_0.html 14 | http://tt.mop.com/topic/list_237_238_0_0.html 15 | -------------------------------------------------------------------------------- /jd/config.ini: -------------------------------------------------------------------------------- 1 | [root] 2 | root_url_pattern = http://item.jd.com/%s.html 3 | [book] 4 | crawled_id_file = crawled_id_url 5 | item_id_filename = book_item_ids 6 | [computer] 7 | crawled_id_filename = crawled_id_url 8 | item_id_filename = computer_item_id 9 | item_content_filename = comnputer 10 | failed_id_filename = failed_id_url 11 | [electronic] 12 | item_content_filename = electronic 13 | item_id_filename = electronic_item_id 14 | crawled_id_filename = crawled_id_url 15 | failed_id_filename = failed_id_url 16 | [digital] 17 | item_id_filename = digital_item_id 18 | crawled_id_filename = crawled_id_url 19 | failed_id_filename = failed_id_url 20 | [phone] 21 | item_id_filename = phone_item_id 22 | crawled_id_filename = crawled_id_url 23 | failed_id_filename = failed_id_url 24 | -------------------------------------------------------------------------------- /yhd/food_milk/remove_repeat_ids.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | import os 3 | import sys 4 | import codecs 5 | 6 | PATH = os.path.dirname(os.path.abspath(__file__)) 7 | walk_path = os.path.join(PATH, 'item_id_bak') 8 | def check_length(): 9 | for root, subdirs, files in os.walk(walk_path): 10 | for id_file in files: 11 | id_filename = os.path.join(walk_path,id_file) 12 | with codecs.open(id_filename) as f: 13 | item_id_list = f.readlines() 14 | item_id_set = set(item_id_list) 15 | list_length, set_length = len(item_id_list), len(item_id_set) 16 | print '%s:%s ==> %s'%(id_file, list_length, set_length) 17 | # with codecs.open(id_filename, mode='wb', encoding='utf-8') as wf: 18 | # wf.writelines(item_id_set) 19 | check_length() 20 | -------------------------------------------------------------------------------- /douban/douban_book/db/douban_book.sql: -------------------------------------------------------------------------------- 1 | 2 | DROP TABLE if EXISTS douban_books; 3 | CREATE TABLE `douban_books` ( 4 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT, 5 | `book` varchar(32) NOT NULL DEFAULT '' COMMENT '图书名称', 6 | `book_id` varchar(16) NOT NULL DEFAULT '' COMMENT '图书唯一id', 7 | `author_name` varchar(32) NOT NULL DEFAULT '' COMMENT '作者名称', 8 | `publish_time` varchar(32) NOT NULL DEFAULT '' COMMENT '出版时间', 9 | `score` tinyint(4) NOT NULL DEFAULT 0 COMMENT '豆瓣评分', 10 | `comment_count` int(10) NOT NULL DEFAULT 0 COMMENT '评论人数', 11 | `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', 12 | `update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间', 13 | PRIMARY KEY (`id`), 14 | KEY `idx_s` (`score`), 15 | KEY `idx_ccnt` (`comment_count`), 16 | KEY `idx_uptime` (`update_time`) 17 | )ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='豆瓣图书爬虫'; -------------------------------------------------------------------------------- /baidu/sys/xici_proxy: -------------------------------------------------------------------------------- 1 | 221.7.11.72:80 2 | 221.7.11.75:80 3 | 221.7.11.10:82 4 | 61.135.153.22:80 5 | 221.7.11.68:83 6 | 221.7.11.11:81 7 | 221.7.11.74:80 8 | 221.7.11.18:80 9 | 221.7.11.8:80 10 | 122.96.59.106:80 11 | 122.96.59.102:83 12 | 182.98.163.166:3128 13 | 121.10.120.135:8001 14 | 218.207.195.206:80 15 | 221.7.11.110:80 16 | 122.96.59.99:80 17 | 112.95.241.76:80 18 | 222.87.129.30:80 19 | 221.7.11.71:80 20 | 221.7.11.17:80 21 | 221.7.11.22:80 22 | 221.7.11.19:80 23 | 121.14.228.16:21320 24 | 202.108.50.75:80 25 | 221.7.11.108:80 26 | 180.96.61.63:80 27 | 119.188.46.42:8080 28 | 61.174.9.96:8080 29 | 211.151.50.179:81 30 | 58.68.246.12:18080 31 | 202.171.253.103:80 32 | 115.236.59.194:3128 33 | 183.141.64.53:80 34 | 221.7.11.70:80 35 | 106.3.40.249:8081 36 | 222.66.115.233:80 37 | 114.80.136.112:7780 38 | 221.7.11.9:80 39 | 114.112.69.21:81 40 | 221.7.11.25:80 41 | 117.25.129.238:8888 42 | 210.14.138.102:8080 43 | 118.144.147.240:18186 44 | 221.176.14.72:80 45 | 202.98.123.126:8080 46 | 180.153.32.93:8088 47 | -------------------------------------------------------------------------------- /sina/zip_file.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import zipfile 3 | import time 4 | import os 5 | 6 | PATH = os.path.dirname(os.path.abspath(__file__)) 7 | date_dir = time.strftime('%Y_%m_%d') 8 | def zip_file(): 9 | '''压缩data_dir目录下的shtml页面''' 10 | src_file = os.path.join(PATH,'html', date_dir) 11 | zipfilename = os.path.join(PATH, 'html', '%s.zip'%date_dir) 12 | filelist = [] 13 | for root, dirs, files in os.walk(src_file): 14 | for filename in files: 15 | filelist.append(os.path.join(root, filename)) 16 | zf = zipfile.ZipFile(zipfilename, mode='w', compression=zipfile.ZIP_DEFLATED) 17 | for fullpath_name in filelist: 18 | arcname = fullpath_name[len(src_file):] 19 | zf.write(fullpath_name, arcname) 20 | zf.close() 21 | # zip_file() 22 | def remove_orginal_dir(): 23 | '''删除date_dir目录''' 24 | date_dir_fullpath = os.path.join(PATH, 'html', date_dir) 25 | print date_dir_fullpath 26 | os.system('rm -r %s'%date_dir_fullpath) 27 | remove_orginal_dir() -------------------------------------------------------------------------------- /baidu/_gevent.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | 3 | import time 4 | import urllib2 5 | import gevent 6 | from bs4 import BeautifulSoup 7 | import gevent.monkey 8 | 9 | gevent.monkey.patch_socket() 10 | 11 | hosts = ["http://www.baidu.com", "http://www.amazon.com","http://www.ibm.com", 12 | "http://www.python.org","http://www.microsoft.com"] 13 | 14 | def read(host): 15 | try: 16 | context = urllib2.urlopen(host) 17 | except urllib2.URLError: 18 | print "load %s failure." %host 19 | return 20 | try: 21 | title = BeautifulSoup(context).title.string 22 | except: 23 | print "paser %s tile failure" %host 24 | return 25 | print "%s : %s" %(host,title) 26 | 27 | def concuyRead(): 28 | start = time.time() 29 | threads = [] 30 | for i in range(30): 31 | for host in hosts: 32 | threads.append(gevent.spawn(read,host)) 33 | gevent.joinall(threads) 34 | end = time.time() 35 | print "Elapsed Time : %d" %(end-start) 36 | 37 | if __name__ == '__main__': 38 | concuyRead() 39 | -------------------------------------------------------------------------------- /jd/home_furniture_decoration_kitchenware/kitchenware/log/crawled_page_url: -------------------------------------------------------------------------------- 1 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-1-1-1-72-4137-33.html 2 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-2-1-1-72-4137-33.html 3 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-3-1-1-72-4137-33.html 4 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-4-1-1-72-4137-33.html 5 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-5-1-1-72-4137-33.html 6 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-6-1-1-72-4137-33.html 7 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-7-1-1-72-4137-33.html 8 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-8-1-1-72-4137-33.html 9 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-9-1-1-72-4137-33.html 10 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-10-1-1-72-4137-33.html 11 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-11-1-1-72-4137-33.html 12 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-12-1-1-72-4137-33.html 13 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-13-1-1-72-4137-33.html 14 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-14-1-1-72-4137-33.html 15 | -------------------------------------------------------------------------------- /douban/douban_book/db/load_config.py: -------------------------------------------------------------------------------- 1 | #!-*- coding:utf-8 -*- 2 | import os 3 | import re 4 | import codecs 5 | LINE_PATTERN = re.compile(r'\A\s*(?P.+?)\s*=\s*(?P.*?)\s*\Z') 6 | 7 | 8 | def load_config(config_file_path): 9 | if not os.path.isfile(config_file_path): 10 | raise ValueError('Error: file not found %s' % config_file_path) 11 | entries = {} 12 | line_num = 0 13 | for line in codecs.open(config_file_path, mode='r', encoding='utf_8', errors='ignore'): 14 | line_num += 1 15 | line = line.strip() 16 | if not line: 17 | continue # skip empty line 18 | if line[0] == ';' or line[0] == '#': 19 | continue # skip comment line 20 | match = LINE_PATTERN.match(line) 21 | if not match: 22 | print('Warning(file %s, line %d): invalid entry' % (config_file_path, line_num)) 23 | continue 24 | key = match.group('key') 25 | if key in entries: 26 | print('Warning(file %s, line %d): duplicated entry ignored for "%s"' % (config_file_path, line_num, key)) 27 | continue 28 | entries[key] = match.group('val') 29 | return entries 30 | -------------------------------------------------------------------------------- /baidu/sys/proxy_ip_port: -------------------------------------------------------------------------------- 1 | 42.3.224.11:8080 2 | 42.120.22.25:3128 3 | 42.121.59.142:82 4 | 58.22.0.55:80 5 | 58.53.128.201:8888 6 | 58.205.228.66:8000 7 | 58.205.228.66:8080 8 | 58.205.228.67:8000 9 | 58.205.228.67:8080 10 | 58.215.52.159:8080 11 | 59.38.32.35:1111 12 | 59.46.72.245:8080 13 | 59.78.194.89:80 14 | 59.172.208.186:8080 15 | 60.18.147.109:8085 16 | 60.21.136.22:8080 17 | 61.53.64.37:8080 18 | 61.134.38.42:7280 19 | 61.153.236.30:8080 20 | 61.156.235.170:9999 21 | 61.156.235.172:9999 22 | 61.158.168.237:9999 23 | 61.158.219.226:8118 24 | 61.164.73.19:82 25 | 63.221.140.143:80 26 | 103.27.108.120:80 27 | 106.37.177.251:3128 28 | 106.120.108.163:3128 29 | 111.1.60.210:80 30 | 111.161.126.83:8080 31 | 111.161.126.87:8080 32 | 111.161.126.88:8080 33 | 111.161.126.90:8080 34 | 111.161.126.92:8080 35 | 111.161.126.93:8080 36 | 112.5.183.235:80 37 | 112.5.183.235:81 38 | 112.5.183.235:82 39 | 112.90.146.76:3128 40 | 112.124.3.76:80 41 | 112.124.8.200:8080 42 | 112.124.27.112:82 43 | 112.124.28.212:80 44 | 112.124.59.13:9090 45 | 112.124.103.195:80 46 | 112.125.17.5:80 47 | 112.253.6.182:8080 48 | 113.57.252.103:80 49 | 113.57.252.104:80 50 | 113.57.252.105:80 51 | 113.57.252.107:80 52 | 113.107.43.75:818 53 | 113.108.181.171:1127 54 | -------------------------------------------------------------------------------- /mop/sys/msg_urls: -------------------------------------------------------------------------------- 1 | http://tt.mop.com/topic/list_209_210_0_1_2.html 2 | http://tt.mop.com/topic/list_209_210_0_2_2.html 3 | http://tt.mop.com/topic/list_1_8_0_1_2.html 4 | http://tt.mop.com/topic/list_1_8_0_2_2.html 5 | http://tt.mop.com/topic/list_94_48_0_1_2.html 6 | http://tt.mop.com/topic/list_94_48_0_2_2.html 7 | http://tt.mop.com/topic/list_70_19_0_1_2.html 8 | http://tt.mop.com/topic/list_70_19_0_2_2.html 9 | http://tt.mop.com/topic/list_209_43_0_1_2.html 10 | http://tt.mop.com/topic/list_209_43_0_2_2.html 11 | http://tt.mop.com/topic/list_213_17_0_1_2.html 12 | http://tt.mop.com/topic/list_213_17_0_2_2.html 13 | http://tt.mop.com/topic/list_412_51_0_1_2.html 14 | http://tt.mop.com/topic/list_412_51_0_2_2.html 15 | http://tt.mop.com/topic/list_394_13_0_1_2.html 16 | http://tt.mop.com/topic/list_394_13_0_2_2.html 17 | http://tt.mop.com/topic/list_213_214_0_1_2.html 18 | http://tt.mop.com/topic/list_213_214_0_2_2.html 19 | http://tt.mop.com/topic/list_213_217_0_1_2.html 20 | http://tt.mop.com/topic/list_213_217_0_2_2.html 21 | http://tt.mop.com/topic/list_462_438_0_1_2.html 22 | http://tt.mop.com/topic/list_462_438_0_2_2.html 23 | http://tt.mop.com/topic/list_291_294_0_1_2.html 24 | http://tt.mop.com/topic/list_291_294_0_2_2.html 25 | http://tt.mop.com/topic/list_1_14_0_1_2.html 26 | http://tt.mop.com/topic/list_1_14_0_2_2.html 27 | http://tt.mop.com/topic/list_237_238_0_1_2.html 28 | http://tt.mop.com/topic/list_237_238_0_2_2.html 29 | -------------------------------------------------------------------------------- /douban/sys/xici_proxy: -------------------------------------------------------------------------------- 1 | 210.73.220.18:8088 2 | 115.227.193.238:80 3 | 60.5.252.76:80 4 | 58.20.223.230:3128 5 | 221.7.11.11:81 6 | 58.20.127.90:3128 7 | 211.151.50.179:81 8 | 120.198.230.11:80 9 | 122.96.59.102:83 10 | 122.96.59.106:80 11 | 115.238.243.202:80 12 | 210.14.138.102:8080 13 | 58.20.127.178:3128 14 | 122.96.59.99:80 15 | 221.10.40.234:843 16 | 183.141.68.197:80 17 | 112.4.17.152:8080 18 | 218.92.169.170:18186 19 | 121.14.145.96:8090 20 | 218.28.96.39:3128 21 | 61.164.73.19:82 22 | 58.53.128.201:8888 23 | 116.213.211.139:9090 24 | 210.39.18.236:8080 25 | 106.3.40.249:8081 26 | 222.66.115.233:80 27 | 218.64.255.253:3128 28 | 119.188.46.42:8080 29 | 114.80.136.112:7780 30 | 221.130.29.184:8888 31 | 117.25.129.238:8888 32 | 115.29.178.235:8088 33 | 220.161.209.150:80 34 | 180.166.67.100:8080 35 | 114.112.91.135:3128 36 | 222.87.129.29:80 37 | 202.98.123.126:8080 38 | 59.127.173.42:8888 39 | 202.202.0.163:3128 40 | 202.108.50.75:80 41 | 42.121.105.155:8888 42 | 118.26.57.14:80 43 | 218.108.170.166:82 44 | 125.39.66.67:80 45 | 122.226.120.4:80 46 | 115.29.164.195:8081 47 | 211.152.50.70:80 48 | 58.20.127.26:3128 49 | 115.236.59.194:3128 50 | 58.20.127.100:3128 51 | 125.39.66.66:80 52 | 58.20.127.106:3128 53 | 61.174.9.96:8080 54 | 140.206.86.68:8080 55 | 121.10.120.135:8001 56 | 58.22.0.54:80 57 | 115.29.3.163:80 58 | 112.95.241.76:80 59 | 202.96.155.251:8888 60 | 218.213.168.131:80 61 | 218.204.89.117:8123 62 | 58.221.250.246:18186 63 | -------------------------------------------------------------------------------- /dangdang/life/sys/life_item_id: -------------------------------------------------------------------------------- 1 | http://product.dangdang.com/60542570.html 2 | http://product.dangdang.com/400926082.html 3 | http://product.dangdang.com/400926055.html 4 | http://product.dangdang.com/60542568.html 5 | http://product.dangdang.com/60542573.html 6 | http://product.dangdang.com/60542572.html 7 | http://product.dangdang.com/60542569.html 8 | http://product.dangdang.com/400926086.html 9 | http://product.dangdang.com/400926070.html 10 | http://product.dangdang.com/400926080.html 11 | http://product.dangdang.com/60542571.html 12 | http://product.dangdang.com/60543050.html 13 | http://product.dangdang.com/60543048.html 14 | http://product.dangdang.com/60543057.html 15 | http://product.dangdang.com/60543055.html 16 | http://product.dangdang.com/60543059.html 17 | http://product.dangdang.com/60543052.html 18 | http://product.dangdang.com/60543049.html 19 | http://product.dangdang.com/60543051.html 20 | http://product.dangdang.com/60543053.html 21 | http://product.dangdang.com/60543060.html 22 | http://product.dangdang.com/60543054.html 23 | http://product.dangdang.com/60543056.html 24 | http://product.dangdang.com/60543058.html 25 | http://product.dangdang.com/1800382664.html 26 | http://product.dangdang.com/60542570.html 27 | http://product.dangdang.com/60542568.html 28 | http://product.dangdang.com/60542573.html 29 | http://product.dangdang.com/60542572.html 30 | http://product.dangdang.com/60542569.html 31 | http://product.dangdang.com/60542571.html 32 | -------------------------------------------------------------------------------- /jd/xici_proxy: -------------------------------------------------------------------------------- 1 | 202.108.50.75:80 2 | 61.135.153.22:80 3 | 221.7.11.19:80 4 | 42.121.105.155:8888 5 | 117.59.217.214:80 6 | 114.80.136.112:7780 7 | 218.207.195.206:80 8 | 218.207.195.221:80 9 | 59.125.43.139:8080 10 | 221.7.11.22:80 11 | 125.39.70.182:8080 12 | 112.95.241.76:80 13 | 58.241.243.218:8083 14 | 58.22.0.55:83 15 | 114.112.91.135:3128 16 | 221.130.29.184:8888 17 | 121.14.228.16:21320 18 | 183.230.127.59:8088 19 | 210.14.138.102:8080 20 | 61.177.65.251:18186 21 | 58.20.127.100:3128 22 | 58.53.128.201:8888 23 | 115.29.164.195:8081 24 | 58.20.127.106:3128 25 | 220.161.209.150:80 26 | 58.221.250.246:18186 27 | 202.108.50.75:80 28 | 222.66.115.233:80 29 | 218.207.195.206:80 30 | 58.20.223.230:3128 31 | 221.7.11.11:81 32 | 58.53.128.201:8888 33 | 58.20.127.90:3128 34 | 58.22.106.230:8888 35 | 117.25.129.238:8888 36 | 58.68.246.12:18080 37 | 122.96.59.106:80 38 | 122.96.59.102:83 39 | 210.14.138.102:8080 40 | 221.7.11.71:80 41 | 122.96.59.99:80 42 | 222.87.129.30:80 43 | 61.135.153.22:80 44 | 114.112.69.21:81 45 | 115.29.168.245:18080 46 | 220.165.8.174:80 47 | 115.29.3.163:80 48 | 58.20.127.106:3128 49 | 221.130.29.184:8888 50 | 58.20.127.26:3128 51 | 61.174.9.96:8080 52 | 202.98.123.126:8080 53 | 211.139.93.87:18186 54 | 180.153.32.93:8088 55 | 182.98.163.166:3128 56 | 61.164.73.19:82 57 | 211.151.50.179:81 58 | 210.73.220.18:8088 59 | 106.3.40.249:8081 60 | 121.10.120.135:8001 61 | 58.215.139.136:8088 62 | 221.7.11.19:80 63 | 59.125.43.139:8080 64 | 221.7.11.22:80 65 | 112.95.241.76:80 66 | 58.241.243.218:8083 67 | 183.230.127.59:8088 68 | 61.177.65.251:18186 69 | 58.20.127.100:3128 70 | 220.161.209.150:80 71 | 58.221.250.246:18186 72 | 180.96.61.63:80 73 | -------------------------------------------------------------------------------- /douban/movie_html.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | import os 3 | import re 4 | import time 5 | import urllib2 6 | import codecs 7 | 8 | PATH = os.path.dirname(os.path.abspath(__file__)) 9 | 10 | def main(): 11 | item_url_filename = os.path.join(PATH, 'sys', 'total_item_url') 12 | html_failed_filename = os.path.join(PATH, 'log', 'html_failed_url') 13 | with codecs.open(item_url_filename, encoding='utf-8') as total_item_url_f,\ 14 | codecs.open(html_failed_filename, mode='wb', encoding='utf-8')as failed_wf: 15 | url_list = [item.strip() for item in set(total_item_url_f.readlines())] 16 | for url in url_list: 17 | item_id = re.search('\d+', url).group() 18 | try: 19 | html = urllib2.urlopen(url).read() 20 | except BaseException: 21 | try: 22 | html = urllib2.urlopen(url).read() 23 | except BaseException: 24 | try: 25 | html = urllib2.urlopen(url).read() 26 | except BaseException: 27 | failed_wf.write(url+'\n') 28 | continue 29 | html_filename = os.path.join(PATH, 'html', '%s.html'%str(item_id)) 30 | with open(html_filename, mode='wb') as wf: 31 | wf.write(html) 32 | time.sleep(3) 33 | main() 34 | 35 | def read_one_item_url(): 36 | url = 'http://movie.douban.com/subject/22265121/' 37 | item_id = re.search('\d+', url).group() 38 | filename = os.path.join(PATH, '%s.html'%item_id) 39 | html = urllib2.urlopen(url).read() 40 | with codecs.open(filename, mode='wb') as wf: 41 | wf.write(html) 42 | # read_one_item_url() -------------------------------------------------------------------------------- /proxy/you.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | #coding:utf-8 3 | import urllib2 4 | import os 5 | import re 6 | import time 7 | import codecs 8 | from bs4 import BeautifulSoup 9 | 10 | PATH = os.path.dirname(os.path.abspath(__file__)) 11 | VARIFY_URL = "http://42.96.192.46/echo_ip" 12 | 13 | def Varify_proxy(ip_port): 14 | http_proxy = 'http://%s'%ip_port 15 | proxy_hanlder = urllib2.ProxyHandler({'http':http_proxy}) 16 | opener = urllib2.build_opener(proxy_hanlder) 17 | urllib2.install_opener(opener) 18 | try: 19 | ip = urllib2.urlopen(VARIFY_URL, timeout=10).read().strip() 20 | if ip == ip_port.split(':')[0]: 21 | print 'successed ip : %s'%ip 22 | return True 23 | else: 24 | print 'failed ip : %s'%ip 25 | return 26 | except: 27 | print 'timeout request...ip:%s'%ip_port 28 | return 29 | 30 | def baidu_varify(ip_port): 31 | '''对百度进行过滤''' 32 | url = 'http://baike.baidu.com/view/3570272.htm' 33 | http_proxy = 'http://%s'%ip_port 34 | proxy_hanlder = urllib2.ProxyHandler({'http':http_proxy}) 35 | opener = urllib2.build_opener(proxy_hanlder) 36 | urllib2.install_opener(opener) 37 | try: 38 | html = urllib2.urlopen(url, timeout=10) 39 | soup = BeautifulSoup(html) 40 | para_level_list = soup.find_all('div', class_='para') 41 | if not para_level_list: 42 | return 43 | else: 44 | return True 45 | except: 46 | print 'timed item_id...in ip_port:%s'%ip_port 47 | return 48 | def read_one_page(): 49 | url = 'http://www.youdaili.cn/Daili/guonei/list_1.html' 50 | html = urllib2.urlopen(url).read() 51 | soup = BeautifulSoup(html) 52 | ul_level_str = soup.find('ul', class_='newslist_line') 53 | li_level_srt = ul_level_str.find('li') 54 | print li_level_srt.a['href'] 55 | 56 | read_one_page() -------------------------------------------------------------------------------- /douban/douban_book/result_db.py: -------------------------------------------------------------------------------- 1 | #!-*- coding:utf-8 -*- 2 | import os 3 | import json 4 | import mysql.connector 5 | from pyspider.database.mysql.resultdb import BaseDB 6 | 7 | cur_path = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | class DoubanResultDB(BaseDB): 10 | __tablename__ = 'douban_books' 11 | 12 | def __init__(self, host='localhost', port=3306, database='resultdb', 13 | user='root', passwd=None): 14 | self.config_jdata = self._load_config() 15 | host = self.config_jdata.get('host') or host 16 | port = self.config_jdata.get('port') or port 17 | database = self.config_jdata.get('database') or database 18 | user = self.config_jdata.get('user') or user 19 | passwd = self.config_jdata.get('passwd') or passwd 20 | self.database_name = database 21 | self.conn = mysql.connector.connect(user=user, password=passwd, 22 | host=host, port=port, autocommit=True, 23 | database=database) 24 | 25 | def _load_config(self): 26 | config_path = os.path.join(cur_path, 'conf/mysql.json') 27 | con = open(config_path).read() 28 | return json.loads(con) 29 | 30 | @property 31 | def dbcur(self): 32 | try: 33 | if self.conn.unread_result: 34 | self.conn.get_rows() 35 | return self.conn.cursor() 36 | except (mysql.connector.OperationalError, mysql.connector.InterfaceError): 37 | self.conn.ping(reconnect=True) 38 | self.conn.database = self.database_name 39 | return self.conn.cursor() 40 | 41 | 42 | db = DoubanResultDB() 43 | 44 | if __name__ == "__main__": 45 | data = { 46 | 'book': 'book', 47 | 'book_id': '12312', 48 | 'author_name': 'huafeng', 49 | 'score': '9.1', 50 | 'comment_count': '13312312', 51 | 'publish_time': '2' 52 | } 53 | db._replace(**data) 54 | # sql = 'REPLACE INTO `douban_books` (`book_id`) VALUES (13312312)' 55 | # douban_db.dbcur.execute(sql) -------------------------------------------------------------------------------- /music/tst_music.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | #coding:utf-8 3 | import os 4 | import re 5 | import urllib2 6 | import codecs 7 | import time 8 | import random 9 | from bs4 import BeautifulSoup 10 | 11 | PATH = os.path.dirname(os.path.abspath(__file__)) 12 | 13 | def gen_item_url(): 14 | day_hot_url = "http://music.baidu.com/top/dayhot" 15 | new_url = "http://music.baidu.com/top/new" 16 | url_root = 'http://music.baidu.com/' 17 | url_compile_filter = re.compile(r'/song/[\d]+') 18 | html = urllib2.urlopen(new_url).read() 19 | soup = BeautifulSoup(html) 20 | span_level_list = soup.find_all('span', class_='song-title') 21 | # span_level_str = ''' 22 | # 时间都去哪儿了 23 | # ''' 25 | whole_url_list = [item.a['href'] for item in span_level_list] 26 | item_url_list = ["".join((url_root, url)) for url in whole_url_list if url_compile_filter.match(url)] 27 | print item_url_list, len(item_url_list) 28 | # gen_item_url() 29 | def parse_one_page(): 30 | url = 'http://music.baidu.com/song/118422882' 31 | html = urllib2.urlopen(url).read() 32 | soup = BeautifulSoup(html) 33 | div_level_str = soup.find('div', id='lyricCont') 34 | music_con = div_level_str.text 35 | splited_con = music_con.split('\n') 36 | print len(splited_con),splited_con[20] 37 | for i in splited_con: 38 | print i 39 | # parse_one_page() 40 | def filter_unformal_url(): 41 | unformal_url = "http://y.baidu.com/gotoartist/song/23648?play_song=23648&pst=songList" 42 | formal_url = '/song/118170108' 43 | url_compile_filter = re.compile(r'/song/[\d]+') 44 | match = url_compile_filter.match(formal_url) 45 | if match: 46 | print match.group() 47 | # filter_unformal_url() 48 | def write_music_content_into_file(): 49 | timestamp = time.strftime('%Y_%m_%d_%H%M%S_baidu_music.txt') 50 | filename = os.path.join(PATH, 'out', timestamp) 51 | print filename 52 | # write_music_content_into_file() 53 | -------------------------------------------------------------------------------- /mop/sys/comment_urls: -------------------------------------------------------------------------------- 1 | http://tt.mop.com/topic/list_209_210_0_1_1.html 2 | http://tt.mop.com/topic/list_209_210_0_2_1.html 3 | http://tt.mop.com/topic/list_209_210_0_3_1.html 4 | http://tt.mop.com/topic/list_1_8_0_1_1.html 5 | http://tt.mop.com/topic/list_1_8_0_2_1.html 6 | http://tt.mop.com/topic/list_1_8_0_3_1.html 7 | http://tt.mop.com/topic/list_94_48_0_1_1.html 8 | http://tt.mop.com/topic/list_94_48_0_2_1.html 9 | http://tt.mop.com/topic/list_94_48_0_3_1.html 10 | http://tt.mop.com/topic/list_70_19_0_1_1.html 11 | http://tt.mop.com/topic/list_70_19_0_2_1.html 12 | http://tt.mop.com/topic/list_70_19_0_3_1.html 13 | http://tt.mop.com/topic/list_209_43_0_1_1.html 14 | http://tt.mop.com/topic/list_209_43_0_2_1.html 15 | http://tt.mop.com/topic/list_209_43_0_3_1.html 16 | http://tt.mop.com/topic/list_213_17_0_1_1.html 17 | http://tt.mop.com/topic/list_213_17_0_2_1.html 18 | http://tt.mop.com/topic/list_213_17_0_3_1.html 19 | http://tt.mop.com/topic/list_412_51_0_1_1.html 20 | http://tt.mop.com/topic/list_412_51_0_2_1.html 21 | http://tt.mop.com/topic/list_412_51_0_3_1.html 22 | http://tt.mop.com/topic/list_394_13_0_1_1.html 23 | http://tt.mop.com/topic/list_394_13_0_2_1.html 24 | http://tt.mop.com/topic/list_394_13_0_3_1.html 25 | http://tt.mop.com/topic/list_213_214_0_1_1.html 26 | http://tt.mop.com/topic/list_213_214_0_2_1.html 27 | http://tt.mop.com/topic/list_213_214_0_3_1.html 28 | http://tt.mop.com/topic/list_213_217_0_1_1.html 29 | http://tt.mop.com/topic/list_213_217_0_2_1.html 30 | http://tt.mop.com/topic/list_213_217_0_3_1.html 31 | http://tt.mop.com/topic/list_462_438_0_1_1.html 32 | http://tt.mop.com/topic/list_462_438_0_2_1.html 33 | http://tt.mop.com/topic/list_462_438_0_3_1.html 34 | http://tt.mop.com/topic/list_291_294_0_1_1.html 35 | http://tt.mop.com/topic/list_291_294_0_2_1.html 36 | http://tt.mop.com/topic/list_291_294_0_3_1.html 37 | http://tt.mop.com/topic/list_1_14_0_1_1.html 38 | http://tt.mop.com/topic/list_1_14_0_2_1.html 39 | http://tt.mop.com/topic/list_1_14_0_3_1.html 40 | http://tt.mop.com/topic/list_237_238_0_1_1.html 41 | http://tt.mop.com/topic/list_237_238_0_2_1.html 42 | http://tt.mop.com/topic/list_237_238_0_3_1.html 43 | -------------------------------------------------------------------------------- /baidu_word_freq/.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 10 | 11 | 12 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 1397473323555 37 | 1397473323555 38 | 39 | 40 | 41 | 42 | 44 | 45 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /dangdang/baby_kid/baby/sys/baby_item_id: -------------------------------------------------------------------------------- 1 | http://product.dangdang.com/60313937.html 2 | http://product.dangdang.com/60313936.html 3 | http://product.dangdang.com/20647147.html 4 | http://product.dangdang.com/20647146.html 5 | http://product.dangdang.com/60203040.html 6 | http://product.dangdang.com/60203039.html 7 | http://product.dangdang.com/60313938.html 8 | http://product.dangdang.com/60304433.html 9 | http://product.dangdang.com/20647154.html 10 | http://product.dangdang.com/20226095.html 11 | http://product.dangdang.com/60067062.html 12 | http://product.dangdang.com/60566394.html 13 | http://product.dangdang.com/60304434.html 14 | http://product.dangdang.com/60142495.html 15 | http://product.dangdang.com/60313940.html 16 | http://product.dangdang.com/60552181.html 17 | http://product.dangdang.com/60067061.html 18 | http://product.dangdang.com/60313945.html 19 | http://product.dangdang.com/60555576.html 20 | http://product.dangdang.com/60203038.html 21 | http://product.dangdang.com/60313935.html 22 | http://product.dangdang.com/60067063.html 23 | http://product.dangdang.com/60200958.html 24 | http://product.dangdang.com/60272250.html 25 | http://product.dangdang.com/60272249.html 26 | http://product.dangdang.com/60567884.html 27 | http://product.dangdang.com/60272248.html 28 | http://product.dangdang.com/20226094.html 29 | http://product.dangdang.com/60332633.html 30 | http://product.dangdang.com/60555584.html 31 | http://product.dangdang.com/20161237.html 32 | http://product.dangdang.com/20226097.html 33 | http://product.dangdang.com/60555577.html 34 | http://product.dangdang.com/60272247.html 35 | http://product.dangdang.com/20226096.html 36 | http://product.dangdang.com/60034385.html 37 | http://product.dangdang.com/1291165605.html 38 | http://product.dangdang.com/60555575.html 39 | http://product.dangdang.com/1291108305.html 40 | http://product.dangdang.com/60555578.html 41 | http://product.dangdang.com/60032991.html 42 | http://product.dangdang.com/60083212.html 43 | http://product.dangdang.com/60566393.html 44 | http://product.dangdang.com/60561100.html 45 | http://product.dangdang.com/60555574.html 46 | http://product.dangdang.com/60332625.html 47 | http://product.dangdang.com/1291140005.html 48 | http://product.dangdang.com/60555585.html 49 | -------------------------------------------------------------------------------- /proxy/proxy.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | #coding:utf-8 3 | import urllib2 4 | import urllib 5 | import os 6 | import re 7 | import socket 8 | import time 9 | 10 | 11 | def VerifyIp(): 12 | proxy = {'domain':"115.29.3.163", 'port':"80"} # invalid 13 | url = 'http://42.96.192.46/echo_ip' 14 | loc_ip = urllib.urlopen(url).read().strip() 15 | print "loc_ip is :%s"%loc_ip 16 | 17 | start_time = time.time() 18 | http_proxy = 'http://%s:%s/' % (proxy['domain'], proxy['port']) 19 | proxy_support = urllib2.ProxyHandler({'http': http_proxy}) 20 | opener = urllib2.build_opener(proxy_support) 21 | urllib2.install_opener(opener) 22 | request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31'}) 23 | ip = urllib2.urlopen(url).read().strip() 24 | if re.match(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ip) and ip != loc_ip: 25 | print "yep, valid proxy" 26 | else: 27 | print "sorry, invalid proxy" 28 | end_time = time.time() 29 | print end_time-start_time 30 | VerifyIp() 31 | 32 | # url = 'http://42.96.192.46/echo_ip' 33 | # loc_ip = urllib.urlopen(url).read().strip() 34 | # print loc_ip 35 | 36 | def _19lou_proxy_request(): 37 | path = "/home/huafeng/PycharmProjects/spider/19lou/page_url" 38 | filename = os.path.join(path, 'whole_page_urls') 39 | 40 | proxy = {'domain':"5.45.67.109", 'port':"3128"} 41 | http_proxy = 'http://%s:%s/' % (proxy['domain'], proxy['port']) 42 | proxy_support = urllib2.ProxyHandler({'http':http_proxy}) 43 | opener = urllib2.build_opener(proxy_support) 44 | urllib2.install_opener(opener) 45 | 46 | with open(filename) as f: 47 | # for url in f.readlines(): 48 | for url in [item.strip() for item in f.readlines()]: 49 | req = urllib2.Request(url)#, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31'}) 50 | try: 51 | html = urllib2.urlopen(req, timeout=5).read().decode('gbk') 52 | print html 53 | # time.sleep(5) 54 | except Exception, e: 55 | continue 56 | print url 57 | 58 | # _19lou_proxy_request() -------------------------------------------------------------------------------- /dangdang/computer/sys/computer_item_id: -------------------------------------------------------------------------------- 1 | http://product.dangdang.com/1043432305.html 2 | http://product.dangdang.com/1057908922.html 3 | http://product.dangdang.com/1135115605.html 4 | http://product.dangdang.com/1058054122.html 5 | http://product.dangdang.com/1184350622.html 6 | http://product.dangdang.com/1232175806.html 7 | http://product.dangdang.com/1446446708.html 8 | http://product.dangdang.com/60571738.html 9 | http://product.dangdang.com/1168951005.html 10 | http://product.dangdang.com/60561695.html 11 | http://product.dangdang.com/1077089205.html 12 | http://product.dangdang.com/60563336.html 13 | http://product.dangdang.com/1154947821.html 14 | http://product.dangdang.com/1231079505.html 15 | http://product.dangdang.com/1088160507.html 16 | http://product.dangdang.com/1201605606.html 17 | http://product.dangdang.com/1203941206.html 18 | http://product.dangdang.com/1156170005.html 19 | http://product.dangdang.com/1068087406.html 20 | http://product.dangdang.com/1379089906.html 21 | http://product.dangdang.com/1011270012.html 22 | http://product.dangdang.com/60569129.html 23 | http://product.dangdang.com/1052354821.html 24 | http://product.dangdang.com/1115249321.html 25 | http://product.dangdang.com/1332081006.html 26 | http://product.dangdang.com/60559033.html 27 | http://product.dangdang.com/1151675207.html 28 | http://product.dangdang.com/1081070005.html 29 | http://product.dangdang.com/1305108905.html 30 | http://product.dangdang.com/1249138106.html 31 | http://product.dangdang.com/1249139706.html 32 | http://product.dangdang.com/1384650106.html 33 | http://product.dangdang.com/1092327506.html 34 | http://product.dangdang.com/1192917502.html 35 | http://product.dangdang.com/1266268505.html 36 | http://product.dangdang.com/1155295921.html 37 | http://product.dangdang.com/1069361805.html 38 | http://product.dangdang.com/1178970822.html 39 | http://product.dangdang.com/60552193.html 40 | http://product.dangdang.com/60556490.html 41 | http://product.dangdang.com/1313963107.html 42 | http://product.dangdang.com/1271342208.html 43 | http://product.dangdang.com/1228122905.html 44 | http://product.dangdang.com/1229043505.html 45 | http://product.dangdang.com/60547556.html 46 | http://product.dangdang.com/1270651308.html 47 | http://product.dangdang.com/1143044111.html 48 | http://product.dangdang.com/1210099205.html 49 | -------------------------------------------------------------------------------- /dangdang/food/sys/food_item_id: -------------------------------------------------------------------------------- 1 | http://product.dangdang.com/1292836808.html 2 | http://product.dangdang.com/1247826808.html 3 | http://product.dangdang.com/1033414421.html 4 | http://product.dangdang.com/1457668908.html 5 | http://product.dangdang.com/1027733106.html 6 | http://product.dangdang.com/1392218408.html 7 | http://product.dangdang.com/1247803708.html 8 | http://product.dangdang.com/1051491122.html 9 | http://product.dangdang.com/1045264621.html 10 | http://product.dangdang.com/1462039808.html 11 | http://product.dangdang.com/1247812508.html 12 | http://product.dangdang.com/1247825908.html 13 | http://product.dangdang.com/1462330308.html 14 | http://product.dangdang.com/1277452208.html 15 | http://product.dangdang.com/1277491008.html 16 | http://product.dangdang.com/1473498908.html 17 | http://product.dangdang.com/1262367108.html 18 | http://product.dangdang.com/1292817108.html 19 | http://product.dangdang.com/1027931006.html 20 | http://product.dangdang.com/1280534508.html 21 | http://product.dangdang.com/1262351508.html 22 | http://product.dangdang.com/1247816108.html 23 | http://product.dangdang.com/1380812808.html 24 | http://product.dangdang.com/1247828408.html 25 | http://product.dangdang.com/1413505808.html 26 | http://product.dangdang.com/1247804508.html 27 | http://product.dangdang.com/1380839408.html 28 | http://product.dangdang.com/1473475608.html 29 | http://product.dangdang.com/1247809008.html 30 | http://product.dangdang.com/1473532208.html 31 | http://product.dangdang.com/1473472908.html 32 | http://product.dangdang.com/1108626206.html 33 | http://product.dangdang.com/1473512308.html 34 | http://product.dangdang.com/1413746308.html 35 | http://product.dangdang.com/1462054008.html 36 | http://product.dangdang.com/1106580321.html 37 | http://product.dangdang.com/1296000008.html 38 | http://product.dangdang.com/1277475908.html 39 | http://product.dangdang.com/1381156408.html 40 | http://product.dangdang.com/1247812308.html 41 | http://product.dangdang.com/1277491408.html 42 | http://product.dangdang.com/1247807008.html 43 | http://product.dangdang.com/1247827808.html 44 | http://product.dangdang.com/1462112708.html 45 | http://product.dangdang.com/1463620608.html 46 | http://product.dangdang.com/1247819308.html 47 | http://product.dangdang.com/1461936808.html 48 | http://product.dangdang.com/1247818408.html 49 | -------------------------------------------------------------------------------- /jd/read_failed_page_url.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | #coding:utf-8 3 | import os 4 | import re 5 | import time 6 | import codecs 7 | import urllib2 8 | 9 | end_url_with_pagesize = True 10 | ip_port = '218.207.195.206:80' 11 | PATH = r'/mnt/data/spiders/jd/health' 12 | ITEM_FILENAME = 'freshfood_item_id' 13 | 14 | def get_whole_item_id(): 15 | http_handler = urllib2.ProxyHandler({'http':'http://%s'%ip_port}) 16 | opener = urllib2.build_opener(http_handler) 17 | urllib2.install_opener(opener) 18 | first_failed_url_filename = os.path.join(PATH, 'log', 'failed_item_id') 19 | crawled_page_url_filename = os.path.join(PATH, 'log', 're_crawled_page_url') 20 | failed_page_url_filename = os.path.join(PATH, 'log', 're_failed_page_url') 21 | item_id_filename = os.path.join(PATH, 'page_url', ITEM_FILENAME) 22 | with codecs.open(first_failed_url_filename, encoding='utf-8') as first_failed_url_f,\ 23 | codecs.open(crawled_page_url_filename, mode='wb', encoding='utf-8') as crawled_page_wf, \ 24 | codecs.open(failed_page_url_filename, mode='wb', encoding='utf-8') as failed_page_url_wf, \ 25 | codecs.open(item_id_filename, mode='a', encoding='utf-8') as item_id_wf: 26 | for page_url in [item.split(';')[1].strip() for item in first_failed_url_f.readlines()]: 27 | try: 28 | html = urllib2.urlopen(page_url.strip(), timeout=15).read() 29 | except: 30 | try: 31 | html = urllib2.urlopen(page_url.strip(), timeout=15).read() 32 | except: 33 | failed_page_url_wf.write('timeout in url;%s'%page_url) 34 | continue 35 | if end_url_with_pagesize: 36 | item_id_list = re.findall(r'''sku=\\"(\d+)\\" selfservice''', html) 37 | else: 38 | item_id_list = re.findall(r"sku='(\d+)'>
5] 50 | # print ip_port_type_list, len(ip_port_type_list) 51 | matched_ip_port_list = [item for item in ip_port_type_list if item[-1] != 'HTTPS'] 52 | # print matched_ip_port_list, len(matched_ip_port_list) 53 | com_str_list = [":".join((item[0], item[1])) for item in matched_ip_port_list] 54 | http_proxy_list.extend(com_str_list) 55 | 56 | #过滤已被百度查封的IP 57 | com_str_list = [item for item in http_proxy_list if request_in_baidu(item)] 58 | filename = os.path.join(PATH, 'sys', 'xici_proxy') 59 | with codecs.open(filename, mode='wb', encoding='utf-8') as wf: 60 | com_str_list = [item+'\n' for item in com_str_list] 61 | wf.writelines(com_str_list) 62 | gen_proxy() 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /jd/varify_proxy.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | #coding:utf-8 3 | import urllib2 4 | import os 5 | import re 6 | import codecs 7 | import time 8 | import random 9 | from bs4 import BeautifulSoup 10 | SLEEP_INTERVAL = random.randint(5,10) 11 | 12 | PATH = os.path.dirname(os.path.abspath(__file__)) 13 | 14 | def varify(ip_port): 15 | url = 'http://item.jd.com/1013330.html' 16 | proxy_hanlder = urllib2.ProxyHandler({'http':'http://%s'%ip_port}) 17 | opener = urllib2.build_opener(proxy_hanlder) 18 | urllib2.install_opener(opener) 19 | try: 20 | start_time = time.time() 21 | html = urllib2.urlopen(url, timeout=10).read() 22 | soup = BeautifulSoup(html) 23 | content = soup.find('div', id='product-detail-1') 24 | end_time = time.time() 25 | time_consume = end_time - start_time 26 | if not content: 27 | print 'invalid ip_port:%s'%ip_port 28 | return 29 | elif content and time_consume < 1: 30 | print 'success ip_port:%s'%ip_port 31 | print end_time - start_time 32 | return True 33 | except: 34 | print 'timed item_id...in ip_port:%s'%ip_port 35 | return 36 | # varify('') 37 | 38 | def write_proxy_into_file(http_proxy_list): 39 | com_str_list = [item+'\n' for item in http_proxy_list if varify(item)] 40 | filename = os.path.join(PATH, 'xici_proxy') 41 | with codecs.open(filename, mode='a', encoding='utf-8') as wf: 42 | wf.writelines(com_str_list) 43 | 44 | def gen_proxy(): 45 | url_pattern = "http://www.xici.net.co/nn/%s" 46 | url_list = [url_pattern%str(i) for i in range(1,6)] 47 | http_proxy_list = [] 48 | for url in url_list: 49 | try: 50 | html = urllib2.urlopen(url, timeout=15).read() 51 | except: 52 | time.sleep(60) 53 | try: 54 | html = urllib2.urlopen(url, timeout=15).read() 55 | except: 56 | continue 57 | soup = BeautifulSoup(html) 58 | tr_level_list = soup.find_all('tr') 59 | td_level_list = [item.find_all('td') for item in tr_level_list] 60 | ip_port_type_list = [(param[1].text, param[2].text, param[5].text) for param in td_level_list if len(param)>5] 61 | matched_ip_port_list = [item for item in ip_port_type_list if item[-1] != 'HTTPS'] 62 | com_str_list = [":".join((item[0], item[1])) for item in matched_ip_port_list] 63 | http_proxy_list.extend(com_str_list) 64 | write_proxy_into_file(http_proxy_list) 65 | if __name__ == "__main__": 66 | gen_proxy() 67 | 68 | -------------------------------------------------------------------------------- /proxy/xici.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | #coding:utf-8 3 | import urllib2 4 | import os 5 | import re 6 | import codecs 7 | import time 8 | import random 9 | from bs4 import BeautifulSoup 10 | SLEEP_INTERVAL = random.randint(5,10) 11 | 12 | PATH = os.path.dirname(os.path.abspath(__file__)) 13 | 14 | def request_in_douban(ip_port): 15 | '''过滤豆瓣被封的IP''' 16 | url = 'http://movie.douban.com/subject/6786002/' 17 | http_proxy = 'http://%s'%ip_port 18 | proxy_hanlder = urllib2.ProxyHandler({'http':http_proxy}) 19 | opener = urllib2.build_opener(proxy_hanlder) 20 | urllib2.install_opener(opener) 21 | try: 22 | html = urllib2.urlopen(url, timeout=15) 23 | soup = BeautifulSoup(html) 24 | 25 | title = soup.find('span', property='v:itemreviewed') 26 | 27 | if not title: 28 | print 'invalid ip_port:%s'%ip_port 29 | return 30 | else: 31 | print 'success ip_port:%s'%ip_port 32 | return True 33 | except: 34 | print 'timed item_id...in ip_port:%s'%ip_port 35 | return 36 | 37 | def write_proxy_into_file(http_proxy_list): 38 | #过滤已被豆瓣查封的IP 39 | com_str_list = [item+'\n' for item in http_proxy_list if request_in_douban(item)] 40 | filename = os.path.join(PATH, 'sys', 'xici_proxy') 41 | with codecs.open(filename, mode='wb', encoding='utf-8') as wf: 42 | wf.writelines(com_str_list) 43 | 44 | def gen_proxy(): 45 | url_pattern = "http://www.xici.net.co/nn/%s" 46 | url_list = [url_pattern%str(i) for i in range(1,4)] 47 | http_proxy_list = [] 48 | for url in url_list: 49 | try: 50 | html = urllib2.urlopen(url, timeout=15).read() 51 | except: 52 | time.sleep(60) 53 | try: 54 | html = urllib2.urlopen(url, timeout=15).read() 55 | except: 56 | continue 57 | soup = BeautifulSoup(html) 58 | tr_level_list = soup.find_all('tr') 59 | td_level_list = [item.find_all('td') for item in tr_level_list] 60 | ip_port_type_list = [(param[1].text, param[2].text, param[5].text) for param in td_level_list if len(param)>5] 61 | # print ip_port_type_list, len(ip_port_type_list) 62 | matched_ip_port_list = [item for item in ip_port_type_list if item[-1] != 'HTTPS'] 63 | # print matched_ip_port_list, len(matched_ip_port_list) 64 | com_str_list = [":".join((item[0], item[1])) for item in matched_ip_port_list] 65 | http_proxy_list.extend(com_str_list) 66 | 67 | write_proxy_into_file(http_proxy_list) 68 | # time.sleep(SLEEP_INTERVAL) 69 | 70 | gen_proxy() 71 | 72 | -------------------------------------------------------------------------------- /douban/xici_proxy.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | #coding:utf-8 3 | import urllib2 4 | import os 5 | import re 6 | import codecs 7 | import time 8 | import random 9 | from bs4 import BeautifulSoup 10 | SLEEP_INTERVAL = random.randint(5,10) 11 | 12 | PATH = os.path.dirname(os.path.abspath(__file__)) 13 | 14 | def request_in_douban(ip_port): 15 | '''过滤豆瓣被封的IP''' 16 | url = 'http://movie.douban.com/subject/6786002/' 17 | http_proxy = 'http://%s'%ip_port 18 | proxy_hanlder = urllib2.ProxyHandler({'http':http_proxy}) 19 | opener = urllib2.build_opener(proxy_hanlder) 20 | urllib2.install_opener(opener) 21 | try: 22 | html = urllib2.urlopen(url, timeout=10) 23 | soup = BeautifulSoup(html) 24 | 25 | title = soup.find('span', property='v:itemreviewed') 26 | 27 | if not title: 28 | print 'invalid ip_port:%s'%ip_port 29 | return 30 | else: 31 | print 'success ip_port:%s'%ip_port 32 | return True 33 | except: 34 | print 'timed item_id...in ip_port:%s'%ip_port 35 | return 36 | 37 | def write_proxy_into_file(http_proxy_list): 38 | #过滤已被豆瓣查封的IP 39 | com_str_list = [item+'\n' for item in http_proxy_list if request_in_douban(item)] 40 | filename = os.path.join(PATH, 'sys', 'xici_proxy') 41 | with codecs.open(filename, mode='wb', encoding='utf-8') as wf: 42 | wf.writelines(com_str_list) 43 | 44 | def gen_proxy(): 45 | url_pattern = "http://www.xici.net.co/nn/%s" 46 | url_list = [url_pattern%str(i) for i in range(1,6)] 47 | http_proxy_list = [] 48 | for url in url_list: 49 | try: 50 | html = urllib2.urlopen(url, timeout=15).read() 51 | except: 52 | time.sleep(60) 53 | try: 54 | html = urllib2.urlopen(url, timeout=15).read() 55 | except: 56 | continue 57 | soup = BeautifulSoup(html) 58 | tr_level_list = soup.find_all('tr') 59 | td_level_list = [item.find_all('td') for item in tr_level_list] 60 | ip_port_type_list = [(param[1].text, param[2].text, param[5].text) for param in td_level_list if len(param)>5] 61 | # print ip_port_type_list, len(ip_port_type_list) 62 | matched_ip_port_list = [item for item in ip_port_type_list if item[-1] != 'HTTPS'] 63 | # print matched_ip_port_list, len(matched_ip_port_list) 64 | com_str_list = [":".join((item[0], item[1])) for item in matched_ip_port_list] 65 | http_proxy_list.extend(com_str_list) 66 | 67 | write_proxy_into_file(http_proxy_list) 68 | # time.sleep(SLEEP_INTERVAL) 69 | 70 | gen_proxy() 71 | 72 | -------------------------------------------------------------------------------- /dangdang/jewellery_glass_watch/jewellery/sys/jewellery_item_id: -------------------------------------------------------------------------------- 1 | http://product.dangdang.com/60545118.html 2 | http://product.dangdang.com/60329284.html 3 | http://product.dangdang.com/60561956.html 4 | http://product.dangdang.com/1135936507.html 5 | http://product.dangdang.com/60326136.html 6 | http://product.dangdang.com/60217143.html 7 | http://product.dangdang.com/60275012.html 8 | http://product.dangdang.com/60095332.html 9 | http://product.dangdang.com/60296404.html 10 | http://product.dangdang.com/60312091.html 11 | http://product.dangdang.com/1102856607.html 12 | http://product.dangdang.com/1103019307.html 13 | http://product.dangdang.com/60543004.html 14 | http://product.dangdang.com/60561747.html 15 | http://product.dangdang.com/60307526.html 16 | http://product.dangdang.com/60556879.html 17 | http://product.dangdang.com/60079298.html 18 | http://product.dangdang.com/60324171.html 19 | http://product.dangdang.com/60328975.html 20 | http://product.dangdang.com/60563407.html 21 | http://product.dangdang.com/1226033608.html 22 | http://product.dangdang.com/1103073107.html 23 | http://product.dangdang.com/1205883106.html 24 | http://product.dangdang.com/60543166.html 25 | http://product.dangdang.com/60545362.html 26 | http://product.dangdang.com/1219193022.html 27 | http://product.dangdang.com/60329283.html 28 | http://product.dangdang.com/60293398.html 29 | http://product.dangdang.com/60311152.html 30 | http://product.dangdang.com/60542979.html 31 | http://product.dangdang.com/60559721.html 32 | http://product.dangdang.com/1103111307.html 33 | http://product.dangdang.com/60554268.html 34 | http://product.dangdang.com/1104669412.html 35 | http://product.dangdang.com/60313678.html 36 | http://product.dangdang.com/60564171.html 37 | http://product.dangdang.com/1175804808.html 38 | http://product.dangdang.com/60304263.html 39 | http://product.dangdang.com/60542966.html 40 | http://product.dangdang.com/1062967608.html 41 | http://product.dangdang.com/60556746.html 42 | http://product.dangdang.com/60542968.html 43 | http://product.dangdang.com/1137365805.html 44 | http://product.dangdang.com/1025107506.html 45 | http://product.dangdang.com/60571706.html 46 | http://product.dangdang.com/1062940608.html 47 | http://product.dangdang.com/1365805202.html 48 | http://product.dangdang.com/1159089022.html 49 | http://product.dangdang.com/1080116522.html 50 | http://product.dangdang.com/1108646412.html 51 | http://product.dangdang.com/1066589907.html 52 | http://product.dangdang.com/1173284507.html 53 | http://product.dangdang.com/60323221.html 54 | http://product.dangdang.com/1112093507.html 55 | http://product.dangdang.com/60020047.html 56 | http://product.dangdang.com/60079299.html 57 | http://product.dangdang.com/60567912.html 58 | http://product.dangdang.com/60547504.html 59 | -------------------------------------------------------------------------------- /jd/read_config.py: -------------------------------------------------------------------------------- 1 | __author__ = 'huafeng' 2 | import re 3 | import urllib2 4 | import urllib 5 | import time 6 | import cookielib 7 | import ConfigParser 8 | from bs4 import BeautifulSoup 9 | 10 | def read_config(): 11 | config = ConfigParser.ConfigParser() 12 | print config.read('config.ini') 13 | 14 | spiders_name = config.sections() 15 | 16 | crawled_id_filename = config.get('computer','crawled_id_filename') 17 | print crawled_id_filename 18 | # read_config() 19 | def read_one_item_id(): 20 | url = 'http://list.jd.com/6233-6236-6254-0-0-0-0-0-0-0-1-1-625-1-1-72-4137-33.html' 21 | ip_port = '218.207.195.206:80' 22 | enable_proxy = False 23 | start_time = time.time() 24 | http_hanlder = urllib2.ProxyHandler({'http':'http://%s'%ip_port}) 25 | null_http_hanlder = urllib2.ProxyHandler({}) 26 | if enable_proxy: 27 | opener = urllib2.build_opener(http_hanlder) 28 | else: 29 | opener = urllib2.build_opener(null_http_hanlder) 30 | urllib2.install_opener(opener) 31 | html = urllib2.urlopen(url).read() 32 | item_id_list = re.findall(r"sku='(\d+)'>