├── baidu
    ├── format.json
    ├── format.txt
    ├── sys
    │   ├── timeout_urls
    │   ├── xici_proxy
    │   └── proxy_ip_port
    ├── baidu_spider.py
    ├── baidu_tieba.py
    ├── out
    │   ├── 30
    │   ├── 50
    │   ├── 52
    │   └── 115
    ├── _gevent.py
    ├── xici_proxy.py
    ├── tst_bd.py
    └── filter_proxy.py
├── mop
    ├── hot_topic_urls
    ├── mop.py
    └── sys
    │   ├── hot_topic_urls
    │   ├── msg_urls
    │   └── comment_urls
├── jd
    ├── auto
    │   ├── log
    │   │   ├── crawled_page_url
    │   │   └── failed_page_url
    │   └── sys
    │   │   └── auto_item_id
    ├── electronic_jd
    │   ├── nohup.out
    │   ├── log
    │   │   ├── crawled_page_url
    │   │   ├── timeout_page_url
    │   │   └── elec_failed_url
    │   ├── jiadian_jd.py
    │   └── electronic_name.py
    ├── sports
    │   ├── sys
    │   │   └── sports_item_id
    │   └── log
    │   │   ├── crawled_page_url
    │   │   └── failed_page_url
    ├── computer
    │   ├── log
    │   │   ├── crawled_page_url
    │   │   ├── failed_id_url
    │   │   ├── timeout_page_url
    │   │   └── crawled_id_url
    │   ├── sys
    │   │   └── computer_item_id
    │   └── out
    │   │   └── 2014_06_03_16_comnputer
    ├── baby_toys
    │   ├── baby
    │   │   ├── log
    │   │   │   ├── crawled_page_url
    │   │   │   └── failed_page_url
    │   │   └── sys
    │   │   │   └── baby_item_id
    │   └── toys
    │   │   ├── log
    │   │       ├── crawled_page_url
    │   │       └── failed_page_url
    │   │   └── sys
    │   │       └── toys_item_id
    ├── shoes_bag_watch
    │   ├── bag
    │   │   ├── sys
    │   │   │   └── bag_item_id
    │   │   └── log
    │   │   │   ├── crawled_page_url
    │   │   │   └── failed_page_url
    │   ├── shoes
    │   │   ├── sys
    │   │   │   └── shoes_item_id
    │   │   └── log
    │   │   │   ├── crawled_page_url
    │   │   │   └── failed_page_url
    │   └── watch
    │   │   ├── sys
    │   │       └── watch_item_id
    │   │   └── log
    │   │       ├── crawled_page_url
    │   │       └── failed_page_url
    ├── __init__.py
    ├── book_jd
    │   ├── __init__.py
    │   ├── xici_proxy.pyc
    │   ├── book_name.py
    │   ├── xici_proxy.py
    │   └── book_jd.py
    ├── clothing_jewellery
    │   ├── clothing
    │   │   ├── log
    │   │   │   ├── crawled_page_url
    │   │   │   └── failed_page_url
    │   │   └── sys
    │   │   │   └── clothing_item_id
    │   └── jewellery
    │   │   ├── log
    │   │       ├── crawled_page_url
    │   │       └── failed_page_url
    │   │   └── sys
    │   │       └── jewellery_item_id
    ├── food_wine_freshfood
    │   └── freshfood
    │   │   ├── log
    │   │       ├── crawled_page_url
    │   │       └── failed_page_url
    │   │   └── sys
    │   │       └── freshfood_item_id
    ├── phone_digital
    │   └── digital
    │   │   └── __init__.py
    ├── home_furniture_decoration_kitchenware
    │   ├── decoration
    │   │   ├── log
    │   │   │   ├── crawled_page_url
    │   │   │   └── failed_page_url
    │   │   └── sys
    │   │   │   └── decoration_item_id
    │   ├── furniture
    │   │   ├── log
    │   │   │   ├── crawled_page_url
    │   │   │   └── failed_page_url
    │   │   └── sys
    │   │   │   └── furniture_item_id
    │   └── kitchenware
    │   │   └── log
    │   │       ├── failed_page_url
    │   │       └── crawled_page_url
    ├── config.ini
    ├── xici_proxy
    ├── read_failed_page_url.py
    ├── varify_proxy.py
    ├── read_config.py
    └── read_failed_item_id.py
├── __init__.py
├── baidu_word_freq
    ├── baidu_word_freq
    ├── sys
    │   ├── HZOut.txt
    │   └── word_without_freq_0528.txt
    ├── log
    │   └── timeout_url_0528
    ├── .idea
    │   ├── scopes
    │   │   └── scope_settings.xml
    │   ├── encodings.xml
    │   ├── vcs.xml
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── baidu_word_freq.iml
    │   └── workspace.xml
    ├── multi_thread_get_freq.py
    ├── clear_data.py
    ├── get_word_freq.py
    └── tst_freq.py
├── dangdang
    ├── food
    │   ├── log
    │   │   ├── failed_page_url
    │   │   └── crawled_page_url
    │   └── sys
    │   │   └── food_item_id
    ├── life
    │   ├── log
    │   │   ├── failed_page_url
    │   │   └── crawled_page_url
    │   └── sys
    │   │   └── life_item_id
    ├── appliance
    │   ├── log
    │   │   ├── failed_page_url
    │   │   └── crawled_page_url
    │   └── sys
    │   │   ├── appliance_item_id
    │   │   └── whole_page_url
    ├── clothing
    │   └── log
    │   │   ├── failed_page_url
    │   │   └── crawled_page_url
    ├── computer
    │   ├── log
    │   │   ├── failed_page_url
    │   │   └── crawled_page_url
    │   └── sys
    │   │   └── computer_item_id
    ├── sports
    │   ├── log
    │   │   ├── failed_page_url
    │   │   └── crawled_page_url
    │   └── sys
    │   │   └── clothing_item_id
    ├── baby_kid
    │   ├── baby
    │   │   ├── log
    │   │   │   ├── failed_page_url
    │   │   │   └── crawled_page_url
    │   │   └── sys
    │   │   │   └── baby_item_id
    │   └── kid
    │   │   ├── log
    │   │       ├── failed_page_url
    │   │       └── crawled_page_url
    │   │   └── sys
    │   │       └── kid_item_id
    ├── book
    │   ├── log
    │   │   ├── crawled_failed_page_url
    │   │   ├── crawled_page_url
    │   │   └── failed_page_url
    │   ├── __init__.py
    │   └── sys
    │   │   ├── topic_url
    │   │   └── item_id_url
    ├── beauty_adult
    │   ├── adult
    │   │   ├── log
    │   │   │   ├── failed_page_url
    │   │   │   └── crawled_page_url
    │   │   └── sys
    │   │   │   └── adult_item_id
    │   └── beauty
    │   │   ├── log
    │   │       ├── failed_page_url
    │   │       └── crawled_page_url
    │   │   └── sys
    │   │       └── beauty_item_id
    ├── living_auto
    │   ├── auto
    │   │   ├── log
    │   │   │   ├── failed_page_url
    │   │   │   └── crawled_page_url
    │   │   └── sys
    │   │   │   └── auto_item_id
    │   └── living
    │   │   ├── log
    │   │       ├── failed_page_url
    │   │       └── crawled_page_url
    │   │   └── sys
    │   │       └── living_item_id
    ├── shoes_bag
    │   └── shoes
    │   │   └── log
    │   │       ├── failed_page_url
    │   │       └── crawled_page_url
    ├── decoration_health
    │   └── health
    │   │   ├── log
    │   │       ├── failed_page_url
    │   │       └── crawled_page_url
    │   │   └── sys
    │   │       └── health_item_id
    ├── mobile_digital
    │   ├── digital
    │   │   ├── log
    │   │   │   ├── failed_page_url
    │   │   │   └── crawled_page_url
    │   │   └── sys
    │   │   │   └── digital_item_id
    │   └── mobile
    │   │   ├── log
    │   │       ├── failed_page_url
    │   │       └── crawled_page_url
    │   │   └── sys
    │   │       └── mobile_item_id
    └── jewellery_glass_watch
    │   ├── glass_watch
    │       ├── log
    │       │   ├── failed_page_url
    │       │   └── crawled_page_url
    │       └── sys
    │       │   └── glass_watch_item_id
    │   └── jewellery
    │       ├── log
    │           ├── failed_page_url
    │           └── crawled_page_url
    │       └── sys
    │           └── jewellery_item_id
├── yhd
    ├── food_milk
    │   ├── log
    │   │   └── failed_page_url
    │   ├── __init__.py
    │   ├── tst_rewrite.py
    │   └── remove_repeat_ids.py
    └── __init__.py
├── douban
    ├── __init__.py
    ├── douban_book
    │   ├── __init__.py
    │   ├── readme.md
    │   ├── conf
    │   │   └── mysql.json
    │   ├── result.py
    │   ├── config.json
    │   ├── db
    │   │   ├── douban_book.sql
    │   │   └── load_config.py
    │   ├── result_db.py
    │   └── douban_books.py
    ├── douban.py
    ├── sys
    │   ├── xici_proxy
    │   └── douban_crawled_urls
    ├── movie_html.py
    ├── xici_proxy.py
    ├── douban_single_thread.py
    ├── movie_actors.py
    └── movie_actors_single_thread.py
├── sohu
    ├── __init__.py
    ├── sohu_spider.pyc
    ├── log
    │   └── sohu_url_crawled.txt
    ├── sohu.py
    └── sys
    │   ├── hot_topic_url
    │   └── realtime_page_url
├── proxy
    ├── __init__.py
    ├── sys
    │   └── proxy_ip_port
    ├── you.py
    ├── proxy.py
    ├── tst_proxy.py
    └── xici.py
├── README.md
├── requirements.txt
├── music
    ├── music.py
    ├── tst_music.py
    └── baidu_music.py
└── sina
    ├── zip_file.py
    └── sina_news.py


/baidu/format.json:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/baidu/format.txt:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/mop/hot_topic_urls:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/baidu/sys/timeout_urls:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/auto/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/auto/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/auto/sys/auto_item_id:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/electronic_jd/nohup.out:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/sports/sys/sports_item_id:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | #!-*- coding:utf-8 -*-


--------------------------------------------------------------------------------
/baidu_word_freq/baidu_word_freq:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/food/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/life/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/computer/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/computer/log/failed_id_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/computer/log/timeout_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/sports/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/sports/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/yhd/food_milk/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/appliance/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/clothing/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/computer/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/sports/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/baby_toys/baby/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/baby_toys/baby/sys/baby_item_id:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/baby_toys/toys/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/baby_toys/toys/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/baby_toys/toys/sys/toys_item_id:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/electronic_jd/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/electronic_jd/log/timeout_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/shoes_bag_watch/bag/sys/bag_item_id:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/baby_kid/baby/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/baby_kid/kid/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/book/log/crawled_failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/douban/__init__.py:
--------------------------------------------------------------------------------
1 | #!-*- coding:utf-8 -*-


--------------------------------------------------------------------------------
/jd/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | 


--------------------------------------------------------------------------------
/jd/computer/sys/computer_item_id:
--------------------------------------------------------------------------------
1 | 1079594


--------------------------------------------------------------------------------
/jd/shoes_bag_watch/bag/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/shoes_bag_watch/bag/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/shoes_bag_watch/shoes/sys/shoes_item_id:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/shoes_bag_watch/watch/sys/watch_item_id:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/sohu/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | 


--------------------------------------------------------------------------------
/yhd/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | 


--------------------------------------------------------------------------------
/baidu/baidu_spider.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | 


--------------------------------------------------------------------------------
/dangdang/beauty_adult/adult/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/beauty_adult/beauty/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/living_auto/auto/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/living_auto/living/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/shoes_bag/shoes/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/shoes_bag_watch/shoes/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/shoes_bag_watch/shoes/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/shoes_bag_watch/watch/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/shoes_bag_watch/watch/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/proxy/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | 


--------------------------------------------------------------------------------
/baidu/baidu_tieba.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | 
3 | 


--------------------------------------------------------------------------------
/dangdang/book/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | 


--------------------------------------------------------------------------------
/dangdang/decoration_health/health/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/mobile_digital/digital/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/mobile_digital/mobile/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/douban/douban_book/__init__.py:
--------------------------------------------------------------------------------
1 | #!-*- coding:utf-8 -*-


--------------------------------------------------------------------------------
/jd/book_jd/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | 


--------------------------------------------------------------------------------
/jd/clothing_jewellery/clothing/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/clothing_jewellery/clothing/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/clothing_jewellery/clothing/sys/clothing_item_id:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/clothing_jewellery/jewellery/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/clothing_jewellery/jewellery/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/clothing_jewellery/jewellery/sys/jewellery_item_id:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/food_wine_freshfood/freshfood/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/food_wine_freshfood/freshfood/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/yhd/food_milk/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | 


--------------------------------------------------------------------------------
/jd/electronic_jd/jiadian_jd.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | 


--------------------------------------------------------------------------------
/jd/food_wine_freshfood/freshfood/sys/freshfood_item_id:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/jewellery_glass_watch/glass_watch/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/jewellery_glass_watch/jewellery/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/phone_digital/digital/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | 


--------------------------------------------------------------------------------
/jd/computer/log/crawled_id_url:
--------------------------------------------------------------------------------
1 | http://item.jd.com/1079594.html
2 | 


--------------------------------------------------------------------------------
/jd/home_furniture_decoration_kitchenware/decoration/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/home_furniture_decoration_kitchenware/decoration/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/home_furniture_decoration_kitchenware/furniture/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/home_furniture_decoration_kitchenware/furniture/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/home_furniture_decoration_kitchenware/furniture/sys/furniture_item_id:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/home_furniture_decoration_kitchenware/kitchenware/log/failed_page_url:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/home_furniture_decoration_kitchenware/decoration/sys/decoration_item_id:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dangdang/food/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4005726-pg1.html
2 | 


--------------------------------------------------------------------------------
/dangdang/computer/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4002590-pg1.html
2 | 


--------------------------------------------------------------------------------
/dangdang/sports/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4002385-pg1.html
2 | 


--------------------------------------------------------------------------------
/douban/douban_book/readme.md:
--------------------------------------------------------------------------------
1 | # 豆瓣图书爬虫
2 | 
3 | ### 抓取数据内容：
4 | * 书名，图书id，作者，初版年，评分，评论人数等


--------------------------------------------------------------------------------
/dangdang/baby_kid/baby/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4002055-pg1.html
2 | 


--------------------------------------------------------------------------------
/dangdang/baby_kid/kid/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4009359-pg1.html
2 | 


--------------------------------------------------------------------------------
/dangdang/book/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cp01.05.16.00.00.00.html
2 | 
3 | 


--------------------------------------------------------------------------------
/sohu/sohu_spider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wanghuafeng/e-business/HEAD/sohu/sohu_spider.pyc


--------------------------------------------------------------------------------
/dangdang/beauty_adult/adult/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4009498-pg1.html
2 | 


--------------------------------------------------------------------------------
/dangdang/beauty_adult/beauty/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4003626-pg1.html
2 | 


--------------------------------------------------------------------------------
/dangdang/living_auto/auto/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4006001-pg1.html
2 | 


--------------------------------------------------------------------------------
/dangdang/living_auto/living/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4009489-pg1.html
2 | 


--------------------------------------------------------------------------------
/dangdang/mobile_digital/digital/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4004279-pg1.html
2 | 


--------------------------------------------------------------------------------
/jd/book_jd/xici_proxy.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wanghuafeng/e-business/HEAD/jd/book_jd/xici_proxy.pyc


--------------------------------------------------------------------------------
/baidu_word_freq/sys/HZOut.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wanghuafeng/e-business/HEAD/baidu_word_freq/sys/HZOut.txt


--------------------------------------------------------------------------------
/dangdang/jewellery_glass_watch/glass_watch/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4004292-pg1.html
2 | 


--------------------------------------------------------------------------------
/dangdang/jewellery_glass_watch/jewellery/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4009624-pg1.html
2 | 


--------------------------------------------------------------------------------
/jd/baby_toys/baby/log/failed_page_url:
--------------------------------------------------------------------------------
1 | not match id_str in url;http://list.jd.com/list.html?cat=1319,1523,7052&page=1
2 | 


--------------------------------------------------------------------------------
/baidu_word_freq/log/timeout_url_0528:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wanghuafeng/e-business/HEAD/baidu_word_freq/log/timeout_url_0528


--------------------------------------------------------------------------------
/dangdang/life/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4003357-pg1.html
2 | http://category.dangdang.com/cid4003359-pg1.html
3 | 


--------------------------------------------------------------------------------
/dangdang/appliance/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4009643-pg1.html
2 | http://category.dangdang.com/cid4009643-pg2.html
3 | 


--------------------------------------------------------------------------------
/baidu_word_freq/sys/word_without_freq_0528.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wanghuafeng/e-business/HEAD/baidu_word_freq/sys/word_without_freq_0528.txt


--------------------------------------------------------------------------------
/dangdang/shoes_bag/shoes/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4001870-pg1.html
2 | http://category.dangdang.com/cid4001870-pg2.html
3 | 


--------------------------------------------------------------------------------
/dangdang/mobile_digital/mobile/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4001123-pg1.html
2 | http://category.dangdang.com/cid4001123-pg2.html
3 | 


--------------------------------------------------------------------------------
/douban/douban_book/conf/mysql.json:
--------------------------------------------------------------------------------
1 | {
2 | 	"host":"127.0.0.1",
3 | 	"port":"3306",
4 | 	"database":"resultdb",
5 | 	"user":"root",
6 | 	"passwd":"root"
7 | }


--------------------------------------------------------------------------------
/dangdang/decoration_health/health/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4009498-pg1.html
2 | http://category.dangdang.com/cid4009498-pg2.html
3 | 


--------------------------------------------------------------------------------
/proxy/sys/proxy_ip_port:
--------------------------------------------------------------------------------
1 | 1.179.147.2:8080
2 | 1.230.127.54:8080
3 | 5.101.130.95:80
4 | 5.223.112.253:8080
5 | 14.1.43.82:80
6 | 14.18.17.166:80
7 | 14.18.242.147:8080
8 | 


--------------------------------------------------------------------------------
/dangdang/clothing/log/crawled_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4008149-pg1.html
2 | http://category.dangdang.com/cid4008149-pg2.html
3 | http://category.dangdang.com/cid4008149-pg3.html
4 | 


--------------------------------------------------------------------------------
/baidu_word_freq/.idea/scopes/scope_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="DependencyValidationManager">
2 |   <state>
3 |     <option name="SKIP_IMPORT_STATEMENTS" value="false" />
4 |   </state>
5 | </component>


--------------------------------------------------------------------------------
/baidu_word_freq/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="Encoding" useUTFGuessing="true" native2AsciiForPropertiesFiles="false" />
4 | </project>
5 | 
6 | 


--------------------------------------------------------------------------------
/baidu_word_freq/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="" vcs="" />
5 |   </component>
6 | </project>
7 | 
8 | 


--------------------------------------------------------------------------------
/baidu_word_freq/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7.3 (/usr/bin/python2.7)" project-jdk-type="Python SDK" />
4 | </project>
5 | 
6 | 


--------------------------------------------------------------------------------
/sohu/log/sohu_url_crawled.txt:
--------------------------------------------------------------------------------
1 | 2014-04-24 12:32:26,713 - sohu - DEBUG - url crawled success in http://club.news.sohu.com/minjian/threads/p1?type=all&order=rtime
2 | 2014-04-25 09:02:50,783 - sohu - DEBUG - url crawled success in http://club.news.sohu.com/minjian/threads/p1?type=all&order=rtime
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## 公网爬虫系统
 2 | 
 3 | * 电商商品信息抓取：
 4 |     * 京东
 5 |     * 国美
 6 |     * 一号店
 7 |     * 当当
 8 | * 论坛内容：
 9 |   * 猫扑
10 |   * 搜狐
11 | * IP代理(公网免费代理ip)
12 | * 新浪爬虫
13 |     *   每日实时新闻
14 |     *   全微博数据抓取
15 |     *   全网微博评论数据抓取
16 | * 百度音乐爬虫
17 | * 豆瓣爬虫
18 |     *   电影(最新以及历史电影)
19 |     *   图书
20 | 


--------------------------------------------------------------------------------
/jd/computer/out/2014_06_03_16_comnputer:
--------------------------------------------------------------------------------
 1 | 华硕（ASUS） R409VC 14英寸笔记本 （i5-3230M 4G 7200转500G GT720M 2G独显 D刻 深灰色）
 2 | 商品名称：华硕R409VC
 3 | 商品编号：1079594
 4 | 品牌：华硕（ASUS）
 5 | 上架时间：2014-03-14 09:52:55
 6 | 商品毛重：3.3kg
 7 | 商品产地：中国大陆
 8 | 显卡：性能级独显触控：非触控厚度：正常厚度（>25mm）处理器：Intel i5尺寸：14英寸
 9 | 
10 | 
11 | 如果您发现商品信息不准确，欢迎纠错
12 | 


--------------------------------------------------------------------------------
/mop/mop.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | 
 3 | from apscheduler.scheduler import Scheduler
 4 | 
 5 | from mop_crawler import MopCrawl
 6 | 
 7 | scheduler = Scheduler(daemonic = False)
 8 | 
 9 | @scheduler.cron_schedule(hour=23,minute='55')
10 | def mop_spider():
11 |     mop = MopCrawl()
12 |     mop.main()
13 | 
14 | scheduler.start()


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | requests
 2 | Flask
 3 | gevent
 4 | gunicorn
 5 | flask-redis
 6 | pymongo
 7 | MySQL-python==1.2.5
 8 | redis
 9 | SQLAlchemy
10 | Crypto
11 | pycrypto
12 | flask_sqlalchemy
13 | happybase==1.1.0
14 | pyexecjs
15 | beautifulsoup4
16 | lxml
17 | chardet
18 | pyDes
19 | Pillow
20 | pyspider==0.3.9
21 | mysql-connector==2.1.4


--------------------------------------------------------------------------------
/music/music.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | 
 3 | from apscheduler.scheduler import Scheduler
 4 | 
 5 | from baidu_music import BaiduMusic
 6 | 
 7 | scheduler = Scheduler(daemonic = False)
 8 | 
 9 | @scheduler.cron_schedule(hour=23,minute='20')
10 | def music_spider():
11 |     music = BaiduMusic()
12 |     music.main()
13 | 
14 | scheduler.start()
15 | 


--------------------------------------------------------------------------------
/sohu/sohu.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | 
 3 | from apscheduler.scheduler import Scheduler
 4 | 
 5 | from sohu_spider import SohuSpider
 6 | 
 7 | scheduler = Scheduler(daemonic = False)
 8 | 
 9 | @scheduler.cron_schedule(hour=23, minute='55')
10 | def sohu_crawler():
11 |     sohu = SohuSpider()
12 |     sohu.main()
13 | scheduler.start()
14 | 
15 | 


--------------------------------------------------------------------------------
/baidu_word_freq/.idea/modules.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="ProjectModuleManager">
 4 |     <modules>
 5 |       <module fileurl="file://$PROJECT_DIR$/.idea/baidu_word_freq.iml" filepath="$PROJECT_DIR$/.idea/baidu_word_freq.iml" />
 6 |     </modules>
 7 |   </component>
 8 | </project>
 9 | 
10 | 


--------------------------------------------------------------------------------
/douban/douban.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | 
 3 | from apscheduler.scheduler import Scheduler
 4 | 
 5 | from douban_spider import DoubanCrawler
 6 | 
 7 | scheduler = Scheduler(daemonic = False)
 8 | 
 9 | @scheduler.cron_schedule(hour=22, minute='55')
10 | def douban_crawler():
11 |     douban = DoubanCrawler()
12 |     douban.main()
13 | scheduler.start()
14 | 
15 | 


--------------------------------------------------------------------------------
/douban/douban_book/result.py:
--------------------------------------------------------------------------------
 1 | #!-*- coding:utf-8 -*-
 2 | import logging
 3 | from pyspider.result.result_worker import OneResultWorker
 4 | # from result_db import db
 5 | 
 6 | logger = logging.getLogger("ResultWorker")
 7 | 
 8 | class DoubanResultWorker(OneResultWorker):
 9 | 
10 |     def on_result(self, task, result):
11 |         if not result:
12 |             return
13 |         # db._replace(**result)
14 | 


--------------------------------------------------------------------------------
/baidu_word_freq/.idea/baidu_word_freq.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="jdk" jdkName="Python 2.7.3 (/usr/bin/python2.7)" jdkType="Python SDK" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 | </module>
 9 | 
10 | 


--------------------------------------------------------------------------------
/baidu/out/115:
--------------------------------------------------------------------------------
1 | {"url": "http://baike.baidu.com/view/115.htm", "header": "\u91c7\u4fdd\u8d39", "content": "\u91c7\u8d2d\u53ca\u4fdd\u7ba1\u8d39\u7684\u7b80\u79f0\u4e00\u822c\u7528\u4e8e\u5efa\u7b51\u884c\u4e1a\u4e0d\u540c\u7684\u5730\u65b9\u6709\u4e0d\u540c\u7684\u91c7\u4fdd\u8d39\u7387\u7684\u89c4\u5b9a\u4e00\u822c\u8ba1\u7b97\u516c\u5f0f\u662f\u91c7\u4fdd\u8d39=(\u539f\u4ef7+\u5305\u88c5\u8d39+\u4f9b\u9500\u90e8\u95e8\u624b\u7eed\u8d39+\u8fd0\u6742\u8d39)X\u91c7\u8d2d\u53ca\u4fdd\u7ba1\u8d39\u7387"}


--------------------------------------------------------------------------------
/dangdang/book/log/failed_page_url:
--------------------------------------------------------------------------------
1 | div do not match pattern in url;http://category.dangdang.com/pg79-cp01.43.50.00.00.00.html
2 | div do not match pattern in url;http://category.dangdang.com/pg33-cp01.43.64.00.00.00.html
3 | div do not match pattern in url;http://category.dangdang.com/pg62-cp01.43.67.00.00.00.html
4 | div do not match pattern in url;http://category.dangdang.com/pg68-cp01.21.08.00.00.00.html
5 | div do not match pattern in url;http://category.dangdang.com/pg68-cp01.54.02.00.00.00.html
6 | 


--------------------------------------------------------------------------------
/douban/douban_book/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "resultdb": "mysql+resultdb://root:root@127.0.0.1:3306/resultdb",
 3 |     "taskdb": "mysql+taskdb://root:root@127.0.0.1:3306/taskdb",
 4 |     "projectdb": "mysql+projectdb://root:root@127.0.0.1:3306/projectdb",
 5 |     "message_queue": "redis://127.0.0.1:6379/0",
 6 |     "queue-maxsize": 600,
 7 |     "debug": true,
 8 |     "webui": {
 9 |         "port": 51001
10 |     },
11 |     "result_worker": {
12 |         "result_cls": "result.DoubanResultWorker"
13 |     }
14 | }


--------------------------------------------------------------------------------
/sohu/sys/hot_topic_url:
--------------------------------------------------------------------------------
 1 | http://club.news.sohu.com/minjian/threads
 2 | http://yule.club.sohu.com/bagua/threads
 3 | http://club.women.sohu.com/shopping/threads
 4 | http://club.cul.sohu.com/chuyushe/threads
 5 | http://club.baobao.sohu.com/mom_daugh/threads
 6 | http://club.sports.sohu.com/allgame/threads
 7 | http://club.chihe.sohu.com/food/threads
 8 | http://club.news.sohu.com/zz0580/threads
 9 | http://club.travel.sohu.com/togher/threads
10 | http://club.learning.sohu.com/szhengzhi/threads
11 | http://club.stock.sohu.com/stock/threads
12 | http://club.money.sohu.com/licai/threads
13 | http://club.women.sohu.com/zz0894/threads
14 | http://club.health.sohu.com/nutrition/threads
15 | 


--------------------------------------------------------------------------------
/yhd/food_milk/tst_rewrite.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | import  codecs
 3 | import os
 4 | PATH = os.path.dirname(os.path.abspath(__file__))
 5 | filename = os.path.join(PATH, 'item_id_bak/fruit_item_id')
 6 | with codecs.open(filename, encoding='utf-8') as f:
 7 |     item_id_list= f.readlines()
 8 |     item_id_set = set(item_id_list)
 9 |     list_length, set_length = len(item_id_list), len(item_id_set)
10 |     print list_length, set_length
11 |     # item_id_set = set(f.readlines())
12 |     # print len(item_id_list), len(item_id_set)
13 |     # item_id_set = set(f.readlines())
14 |     # with codecs.open(filename, mode='wb', encoding='utf-8') as wf:
15 |     #     wf.writelines(item_id_set)
16 | 


--------------------------------------------------------------------------------
/mop/sys/hot_topic_urls:
--------------------------------------------------------------------------------
 1 | http://tt.mop.com/topic/list_209_210_0_0.html
 2 | http://tt.mop.com/topic/list_1_8_0_0.html
 3 | http://tt.mop.com/topic/list_94_48_0_0.html
 4 | http://tt.mop.com/topic/list_70_19_0_0.html
 5 | http://tt.mop.com/topic/list_209_43_0_0.html
 6 | http://tt.mop.com/topic/list_213_17_0_0.html
 7 | http://tt.mop.com/topic/list_412_51_0_0.html
 8 | http://tt.mop.com/topic/list_394_13_0_0.html
 9 | http://tt.mop.com/topic/list_213_214_0_0.html
10 | http://tt.mop.com/topic/list_213_217_0_0.html
11 | http://tt.mop.com/topic/list_462_438_0_0.html
12 | http://tt.mop.com/topic/list_291_294_0_0.html
13 | http://tt.mop.com/topic/list_1_14_0_0.html
14 | http://tt.mop.com/topic/list_237_238_0_0.html
15 | 


--------------------------------------------------------------------------------
/jd/config.ini:
--------------------------------------------------------------------------------
 1 | [root]
 2 | root_url_pattern = http://item.jd.com/%s.html
 3 | [book]
 4 | crawled_id_file = crawled_id_url
 5 | item_id_filename = book_item_ids
 6 | [computer]
 7 | crawled_id_filename = crawled_id_url
 8 | item_id_filename = computer_item_id
 9 | item_content_filename = comnputer
10 | failed_id_filename = failed_id_url
11 | [electronic]
12 | item_content_filename = electronic
13 | item_id_filename = electronic_item_id
14 | crawled_id_filename = crawled_id_url
15 | failed_id_filename = failed_id_url
16 | [digital]
17 | item_id_filename = digital_item_id
18 | crawled_id_filename = crawled_id_url
19 | failed_id_filename = failed_id_url
20 | [phone]
21 | item_id_filename = phone_item_id
22 | crawled_id_filename = crawled_id_url
23 | failed_id_filename = failed_id_url
24 | 


--------------------------------------------------------------------------------
/yhd/food_milk/remove_repeat_ids.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | import os
 3 | import sys
 4 | import codecs
 5 | 
 6 | PATH = os.path.dirname(os.path.abspath(__file__))
 7 | walk_path = os.path.join(PATH, 'item_id_bak')
 8 | def check_length():
 9 |     for root, subdirs, files in os.walk(walk_path):
10 |         for id_file in files:
11 |             id_filename = os.path.join(walk_path,id_file)
12 |             with codecs.open(id_filename) as f:
13 |                 item_id_list = f.readlines()
14 |                 item_id_set = set(item_id_list)
15 |                 list_length, set_length = len(item_id_list), len(item_id_set)
16 |                 print '%s:%s ==> %s'%(id_file, list_length, set_length)
17 |                 # with codecs.open(id_filename, mode='wb', encoding='utf-8') as wf:
18 |                 #     wf.writelines(item_id_set)
19 | check_length()
20 | 


--------------------------------------------------------------------------------
/douban/douban_book/db/douban_book.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | -- Rebuilds douban_books from scratch: an existing table and all of its rows
 3 | -- are dropped before the table is created again.
 4 | DROP TABLE if EXISTS douban_books;
 5 | -- Column meanings, translated from the COMMENT strings below:
 6 | --   book          book title
 7 | --   book_id       unique id of the book
 8 | --   author_name   author name
 9 | --   publish_time  publication time
10 | --   score         Douban rating
11 | --   comment_count number of people who commented
12 | --   create_time   row creation time
13 | --   update_time   row update time (refreshed automatically on every UPDATE)
14 | -- Table comment: "Douban book crawler".
15 | -- NOTE(review): book_id is described as unique but carries no UNIQUE key, and
16 | -- score is an integer column; confirm both match what the crawler stores.
17 | CREATE TABLE `douban_books` (
18 |   `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
19 |   `book` varchar(32)  NOT NULL DEFAULT ''  COMMENT '图书名称',
20 |   `book_id` varchar(16)  NOT NULL DEFAULT ''  COMMENT '图书唯一id',
21 |   `author_name` varchar(32)  NOT NULL DEFAULT ''  COMMENT '作者名称',
22 |   `publish_time` varchar(32)  NOT NULL DEFAULT ''  COMMENT '出版时间',
23 |   `score` tinyint(4) NOT NULL DEFAULT 0  COMMENT '豆瓣评分',
24 |   `comment_count` int(10) NOT NULL DEFAULT 0  COMMENT '评论人数',
25 |   `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
26 |   `update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
27 |   PRIMARY KEY (`id`),
28 |   KEY `idx_s` (`score`),
29 |   KEY `idx_ccnt` (`comment_count`),
30 |   KEY `idx_uptime` (`update_time`)
31 | )ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='豆瓣图书爬虫';


--------------------------------------------------------------------------------
/baidu/sys/xici_proxy:
--------------------------------------------------------------------------------
 1 | 221.7.11.72:80
 2 | 221.7.11.75:80
 3 | 221.7.11.10:82
 4 | 61.135.153.22:80
 5 | 221.7.11.68:83
 6 | 221.7.11.11:81
 7 | 221.7.11.74:80
 8 | 221.7.11.18:80
 9 | 221.7.11.8:80
10 | 122.96.59.106:80
11 | 122.96.59.102:83
12 | 182.98.163.166:3128
13 | 121.10.120.135:8001
14 | 218.207.195.206:80
15 | 221.7.11.110:80
16 | 122.96.59.99:80
17 | 112.95.241.76:80
18 | 222.87.129.30:80
19 | 221.7.11.71:80
20 | 221.7.11.17:80
21 | 221.7.11.22:80
22 | 221.7.11.19:80
23 | 121.14.228.16:21320
24 | 202.108.50.75:80
25 | 221.7.11.108:80
26 | 180.96.61.63:80
27 | 119.188.46.42:8080
28 | 61.174.9.96:8080
29 | 211.151.50.179:81
30 | 58.68.246.12:18080
31 | 202.171.253.103:80
32 | 115.236.59.194:3128
33 | 183.141.64.53:80
34 | 221.7.11.70:80
35 | 106.3.40.249:8081
36 | 222.66.115.233:80
37 | 114.80.136.112:7780
38 | 221.7.11.9:80
39 | 114.112.69.21:81
40 | 221.7.11.25:80
41 | 117.25.129.238:8888
42 | 210.14.138.102:8080
43 | 118.144.147.240:18186
44 | 221.176.14.72:80
45 | 202.98.123.126:8080
46 | 180.153.32.93:8088
47 | 


--------------------------------------------------------------------------------
/sina/zip_file.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | import zipfile
 3 | import time
 4 | import os
 5 | 
 6 | PATH = os.path.dirname(os.path.abspath(__file__))
 7 | date_dir = time.strftime('%Y_%m_%d')
 8 | def zip_file():
 9 |     '''压缩data_dir目录下的shtml页面'''
10 |     src_file = os.path.join(PATH,'html', date_dir)
11 |     zipfilename = os.path.join(PATH, 'html', '%s.zip'%date_dir)
12 |     filelist = []
13 |     for root, dirs, files in os.walk(src_file):
14 |         for filename in files:
15 |            filelist.append(os.path.join(root, filename)) 
16 |     zf = zipfile.ZipFile(zipfilename, mode='w', compression=zipfile.ZIP_DEFLATED)
17 |     for fullpath_name in filelist:
18 |         arcname = fullpath_name[len(src_file):]
19 |         zf.write(fullpath_name, arcname)
20 |     zf.close()
21 | # zip_file()
22 | def remove_orginal_dir():
23 |     '''删除date_dir目录'''
24 |     date_dir_fullpath = os.path.join(PATH, 'html', date_dir)
25 |     print date_dir_fullpath
26 |     os.system('rm -r %s'%date_dir_fullpath)
27 | remove_orginal_dir()


--------------------------------------------------------------------------------
/baidu/_gevent.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | 
 3 | import time
 4 | import urllib2
 5 | import gevent
 6 | from bs4 import BeautifulSoup
 7 | import gevent.monkey
 8 | 
 9 | gevent.monkey.patch_socket()
10 | 
11 | hosts = ["http://www.baidu.com", "http://www.amazon.com","http://www.ibm.com",
12 |          "http://www.python.org","http://www.microsoft.com"]
13 | 
14 | def read(host):
15 |     try:
16 |         context = urllib2.urlopen(host)
17 |     except urllib2.URLError:
18 |         print "load %s failure." %host
19 |         return
20 |     try:
21 |         title = BeautifulSoup(context).title.string
22 |     except:
23 |         print "paser %s tile failure" %host
24 |         return
25 |     print "%s  : %s" %(host,title)
26 | 
27 | def concuyRead():
28 |     start = time.time()
29 |     threads = []
30 |     for i in range(30):
31 |         for host in hosts:
32 |             threads.append(gevent.spawn(read,host))
33 |     gevent.joinall(threads)
34 |     end = time.time()
35 |     print "Elapsed Time : %d" %(end-start)
36 | 
37 | if __name__ == '__main__':
38 |     concuyRead()
39 | 


--------------------------------------------------------------------------------
/jd/home_furniture_decoration_kitchenware/kitchenware/log/crawled_page_url:
--------------------------------------------------------------------------------
 1 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-1-1-1-72-4137-33.html
 2 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-2-1-1-72-4137-33.html
 3 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-3-1-1-72-4137-33.html
 4 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-4-1-1-72-4137-33.html
 5 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-5-1-1-72-4137-33.html
 6 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-6-1-1-72-4137-33.html
 7 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-7-1-1-72-4137-33.html
 8 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-8-1-1-72-4137-33.html
 9 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-9-1-1-72-4137-33.html
10 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-10-1-1-72-4137-33.html
11 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-11-1-1-72-4137-33.html
12 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-12-1-1-72-4137-33.html
13 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-13-1-1-72-4137-33.html
14 | http://list.jd.com/6196-6197-6199-0-0-0-0-0-0-0-1-1-14-1-1-72-4137-33.html
15 | 


--------------------------------------------------------------------------------
/douban/douban_book/db/load_config.py:
--------------------------------------------------------------------------------
 1 | #!-*- coding:utf-8 -*-
 2 | import os
 3 | import re
 4 | import codecs
 5 | LINE_PATTERN = re.compile(r'\A\s*(?P<key>.+?)\s*=\s*(?P<val>.*?)\s*\Z')
 6 | 
 7 | 
 8 | def load_config(config_file_path):
 9 |     if not os.path.isfile(config_file_path):
10 |         raise ValueError('Error: file not found %s' % config_file_path)
11 |     entries = {}
12 |     line_num = 0
13 |     for line in codecs.open(config_file_path, mode='r', encoding='utf_8', errors='ignore'):
14 |         line_num += 1
15 |         line = line.strip()
16 |         if not line:
17 |             continue     # skip empty line
18 |         if line[0] == ';' or line[0] == '#':
19 |             continue    # skip comment line
20 |         match = LINE_PATTERN.match(line)
21 |         if not match:
22 |             print('Warning(file %s, line %d): invalid entry' % (config_file_path, line_num))
23 |             continue
24 |         key = match.group('key')
25 |         if key in entries:
26 |             print('Warning(file %s, line %d): duplicated entry ignored for "%s"' % (config_file_path, line_num, key))
27 |             continue
28 |         entries[key] = match.group('val')
29 |     return entries
30 | 


--------------------------------------------------------------------------------
/baidu/sys/proxy_ip_port:
--------------------------------------------------------------------------------
 1 | 42.3.224.11:8080
 2 | 42.120.22.25:3128
 3 | 42.121.59.142:82
 4 | 58.22.0.55:80
 5 | 58.53.128.201:8888
 6 | 58.205.228.66:8000
 7 | 58.205.228.66:8080
 8 | 58.205.228.67:8000
 9 | 58.205.228.67:8080
10 | 58.215.52.159:8080
11 | 59.38.32.35:1111
12 | 59.46.72.245:8080
13 | 59.78.194.89:80
14 | 59.172.208.186:8080
15 | 60.18.147.109:8085
16 | 60.21.136.22:8080
17 | 61.53.64.37:8080
18 | 61.134.38.42:7280
19 | 61.153.236.30:8080
20 | 61.156.235.170:9999
21 | 61.156.235.172:9999
22 | 61.158.168.237:9999
23 | 61.158.219.226:8118
24 | 61.164.73.19:82
25 | 63.221.140.143:80
26 | 103.27.108.120:80
27 | 106.37.177.251:3128
28 | 106.120.108.163:3128
29 | 111.1.60.210:80
30 | 111.161.126.83:8080
31 | 111.161.126.87:8080
32 | 111.161.126.88:8080
33 | 111.161.126.90:8080
34 | 111.161.126.92:8080
35 | 111.161.126.93:8080
36 | 112.5.183.235:80
37 | 112.5.183.235:81
38 | 112.5.183.235:82
39 | 112.90.146.76:3128
40 | 112.124.3.76:80
41 | 112.124.8.200:8080
42 | 112.124.27.112:82
43 | 112.124.28.212:80
44 | 112.124.59.13:9090
45 | 112.124.103.195:80
46 | 112.125.17.5:80
47 | 112.253.6.182:8080
48 | 113.57.252.103:80
49 | 113.57.252.104:80
50 | 113.57.252.105:80
51 | 113.57.252.107:80
52 | 113.107.43.75:818
53 | 113.108.181.171:1127
54 | 


--------------------------------------------------------------------------------
/mop/sys/msg_urls:
--------------------------------------------------------------------------------
 1 | http://tt.mop.com/topic/list_209_210_0_1_2.html
 2 | http://tt.mop.com/topic/list_209_210_0_2_2.html
 3 | http://tt.mop.com/topic/list_1_8_0_1_2.html
 4 | http://tt.mop.com/topic/list_1_8_0_2_2.html
 5 | http://tt.mop.com/topic/list_94_48_0_1_2.html
 6 | http://tt.mop.com/topic/list_94_48_0_2_2.html
 7 | http://tt.mop.com/topic/list_70_19_0_1_2.html
 8 | http://tt.mop.com/topic/list_70_19_0_2_2.html
 9 | http://tt.mop.com/topic/list_209_43_0_1_2.html
10 | http://tt.mop.com/topic/list_209_43_0_2_2.html
11 | http://tt.mop.com/topic/list_213_17_0_1_2.html
12 | http://tt.mop.com/topic/list_213_17_0_2_2.html
13 | http://tt.mop.com/topic/list_412_51_0_1_2.html
14 | http://tt.mop.com/topic/list_412_51_0_2_2.html
15 | http://tt.mop.com/topic/list_394_13_0_1_2.html
16 | http://tt.mop.com/topic/list_394_13_0_2_2.html
17 | http://tt.mop.com/topic/list_213_214_0_1_2.html
18 | http://tt.mop.com/topic/list_213_214_0_2_2.html
19 | http://tt.mop.com/topic/list_213_217_0_1_2.html
20 | http://tt.mop.com/topic/list_213_217_0_2_2.html
21 | http://tt.mop.com/topic/list_462_438_0_1_2.html
22 | http://tt.mop.com/topic/list_462_438_0_2_2.html
23 | http://tt.mop.com/topic/list_291_294_0_1_2.html
24 | http://tt.mop.com/topic/list_291_294_0_2_2.html
25 | http://tt.mop.com/topic/list_1_14_0_1_2.html
26 | http://tt.mop.com/topic/list_1_14_0_2_2.html
27 | http://tt.mop.com/topic/list_237_238_0_1_2.html
28 | http://tt.mop.com/topic/list_237_238_0_2_2.html
29 | 


--------------------------------------------------------------------------------
/douban/sys/xici_proxy:
--------------------------------------------------------------------------------
 1 | 210.73.220.18:8088
 2 | 115.227.193.238:80
 3 | 60.5.252.76:80
 4 | 58.20.223.230:3128
 5 | 221.7.11.11:81
 6 | 58.20.127.90:3128
 7 | 211.151.50.179:81
 8 | 120.198.230.11:80
 9 | 122.96.59.102:83
10 | 122.96.59.106:80
11 | 115.238.243.202:80
12 | 210.14.138.102:8080
13 | 58.20.127.178:3128
14 | 122.96.59.99:80
15 | 221.10.40.234:843
16 | 183.141.68.197:80
17 | 112.4.17.152:8080
18 | 218.92.169.170:18186
19 | 121.14.145.96:8090
20 | 218.28.96.39:3128
21 | 61.164.73.19:82
22 | 58.53.128.201:8888
23 | 116.213.211.139:9090
24 | 210.39.18.236:8080
25 | 106.3.40.249:8081
26 | 222.66.115.233:80
27 | 218.64.255.253:3128
28 | 119.188.46.42:8080
29 | 114.80.136.112:7780
30 | 221.130.29.184:8888
31 | 117.25.129.238:8888
32 | 115.29.178.235:8088
33 | 220.161.209.150:80
34 | 180.166.67.100:8080
35 | 114.112.91.135:3128
36 | 222.87.129.29:80
37 | 202.98.123.126:8080
38 | 59.127.173.42:8888
39 | 202.202.0.163:3128
40 | 202.108.50.75:80
41 | 42.121.105.155:8888
42 | 118.26.57.14:80
43 | 218.108.170.166:82
44 | 125.39.66.67:80
45 | 122.226.120.4:80
46 | 115.29.164.195:8081
47 | 211.152.50.70:80
48 | 58.20.127.26:3128
49 | 115.236.59.194:3128
50 | 58.20.127.100:3128
51 | 125.39.66.66:80
52 | 58.20.127.106:3128
53 | 61.174.9.96:8080
54 | 140.206.86.68:8080
55 | 121.10.120.135:8001
56 | 58.22.0.54:80
57 | 115.29.3.163:80
58 | 112.95.241.76:80
59 | 202.96.155.251:8888
60 | 218.213.168.131:80
61 | 218.204.89.117:8123
62 | 58.221.250.246:18186
63 | 


--------------------------------------------------------------------------------
/dangdang/life/sys/life_item_id:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/60542570.html
 2 | http://product.dangdang.com/400926082.html
 3 | http://product.dangdang.com/400926055.html
 4 | http://product.dangdang.com/60542568.html
 5 | http://product.dangdang.com/60542573.html
 6 | http://product.dangdang.com/60542572.html
 7 | http://product.dangdang.com/60542569.html
 8 | http://product.dangdang.com/400926086.html
 9 | http://product.dangdang.com/400926070.html
10 | http://product.dangdang.com/400926080.html
11 | http://product.dangdang.com/60542571.html
12 | http://product.dangdang.com/60543050.html
13 | http://product.dangdang.com/60543048.html
14 | http://product.dangdang.com/60543057.html
15 | http://product.dangdang.com/60543055.html
16 | http://product.dangdang.com/60543059.html
17 | http://product.dangdang.com/60543052.html
18 | http://product.dangdang.com/60543049.html
19 | http://product.dangdang.com/60543051.html
20 | http://product.dangdang.com/60543053.html
21 | http://product.dangdang.com/60543060.html
22 | http://product.dangdang.com/60543054.html
23 | http://product.dangdang.com/60543056.html
24 | http://product.dangdang.com/60543058.html
25 | http://product.dangdang.com/1800382664.html
26 | http://product.dangdang.com/60542570.html
27 | http://product.dangdang.com/60542568.html
28 | http://product.dangdang.com/60542573.html
29 | http://product.dangdang.com/60542572.html
30 | http://product.dangdang.com/60542569.html
31 | http://product.dangdang.com/60542571.html
32 | 


--------------------------------------------------------------------------------
/jd/xici_proxy:
--------------------------------------------------------------------------------
 1 | 202.108.50.75:80
 2 | 61.135.153.22:80
 3 | 221.7.11.19:80
 4 | 42.121.105.155:8888
 5 | 117.59.217.214:80
 6 | 114.80.136.112:7780
 7 | 218.207.195.206:80
 8 | 218.207.195.221:80
 9 | 59.125.43.139:8080
10 | 221.7.11.22:80
11 | 125.39.70.182:8080
12 | 112.95.241.76:80
13 | 58.241.243.218:8083
14 | 58.22.0.55:83
15 | 114.112.91.135:3128
16 | 221.130.29.184:8888
17 | 121.14.228.16:21320
18 | 183.230.127.59:8088
19 | 210.14.138.102:8080
20 | 61.177.65.251:18186
21 | 58.20.127.100:3128
22 | 58.53.128.201:8888
23 | 115.29.164.195:8081
24 | 58.20.127.106:3128
25 | 220.161.209.150:80
26 | 58.221.250.246:18186
27 | 202.108.50.75:80
28 | 222.66.115.233:80
29 | 218.207.195.206:80
30 | 58.20.223.230:3128
31 | 221.7.11.11:81
32 | 58.53.128.201:8888
33 | 58.20.127.90:3128
34 | 58.22.106.230:8888
35 | 117.25.129.238:8888
36 | 58.68.246.12:18080
37 | 122.96.59.106:80
38 | 122.96.59.102:83
39 | 210.14.138.102:8080
40 | 221.7.11.71:80
41 | 122.96.59.99:80
42 | 222.87.129.30:80
43 | 61.135.153.22:80
44 | 114.112.69.21:81
45 | 115.29.168.245:18080
46 | 220.165.8.174:80
47 | 115.29.3.163:80
48 | 58.20.127.106:3128
49 | 221.130.29.184:8888
50 | 58.20.127.26:3128
51 | 61.174.9.96:8080
52 | 202.98.123.126:8080
53 | 211.139.93.87:18186
54 | 180.153.32.93:8088
55 | 182.98.163.166:3128
56 | 61.164.73.19:82
57 | 211.151.50.179:81
58 | 210.73.220.18:8088
59 | 106.3.40.249:8081
60 | 121.10.120.135:8001
61 | 58.215.139.136:8088
62 | 221.7.11.19:80
63 | 59.125.43.139:8080
64 | 221.7.11.22:80
65 | 112.95.241.76:80
66 | 58.241.243.218:8083
67 | 183.230.127.59:8088
68 | 61.177.65.251:18186
69 | 58.20.127.100:3128
70 | 220.161.209.150:80
71 | 58.221.250.246:18186
72 | 180.96.61.63:80
73 | 


--------------------------------------------------------------------------------
/douban/movie_html.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | import os
 3 | import re
 4 | import time
 5 | import urllib2
 6 | import codecs
 7 | 
 8 | PATH = os.path.dirname(os.path.abspath(__file__))
 9 | 
10 | def main():
11 |     item_url_filename = os.path.join(PATH, 'sys', 'total_item_url')
12 |     html_failed_filename = os.path.join(PATH, 'log', 'html_failed_url')
13 |     with codecs.open(item_url_filename, encoding='utf-8') as total_item_url_f,\
14 |     codecs.open(html_failed_filename, mode='wb', encoding='utf-8')as failed_wf:
15 |         url_list = [item.strip() for item in set(total_item_url_f.readlines())]
16 |         for url in url_list:
17 |             item_id = re.search('\d+', url).group()
18 |             try:
19 |                 html = urllib2.urlopen(url).read()
20 |             except BaseException:
21 |                 try:
22 |                     html = urllib2.urlopen(url).read()
23 |                 except BaseException:
24 |                     try:
25 |                         html = urllib2.urlopen(url).read()
26 |                     except BaseException:
27 |                         failed_wf.write(url+'\n')
28 |                         continue
29 |             html_filename = os.path.join(PATH, 'html', '%s.html'%str(item_id))
30 |             with open(html_filename, mode='wb') as wf:
31 |                 wf.write(html)
32 |             time.sleep(3)
33 | main()
34 | 
35 | def read_one_item_url():
36 |     url = 'http://movie.douban.com/subject/22265121/'
37 |     item_id = re.search('\d+', url).group()
38 |     filename = os.path.join(PATH, '%s.html'%item_id)
39 |     html = urllib2.urlopen(url).read()
40 |     with codecs.open(filename, mode='wb') as wf:
41 |         wf.write(html)
42 | # read_one_item_url()


--------------------------------------------------------------------------------
/proxy/you.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import urllib2
 4 | import os
 5 | import re
 6 | import time
 7 | import codecs
 8 | from bs4 import BeautifulSoup
 9 | 
10 | PATH = os.path.dirname(os.path.abspath(__file__))
11 | VARIFY_URL = "http://42.96.192.46/echo_ip"
12 | 
13 | def Varify_proxy(ip_port):
14 |     http_proxy = 'http://%s'%ip_port
15 |     proxy_hanlder = urllib2.ProxyHandler({'http':http_proxy})
16 |     opener = urllib2.build_opener(proxy_hanlder)
17 |     urllib2.install_opener(opener)
18 |     try:
19 |         ip = urllib2.urlopen(VARIFY_URL, timeout=10).read().strip()
20 |         if ip == ip_port.split(':')[0]:
21 |             print 'successed ip : %s'%ip
22 |             return True
23 |         else:
24 |             print 'failed ip : %s'%ip
25 |             return
26 |     except:
27 |         print 'timeout request...ip:%s'%ip_port
28 |         return
29 | 
30 | def baidu_varify(ip_port):
31 |     '''对百度进行过滤'''
32 |     url = 'http://baike.baidu.com/view/3570272.htm'
33 |     http_proxy = 'http://%s'%ip_port
34 |     proxy_hanlder = urllib2.ProxyHandler({'http':http_proxy})
35 |     opener = urllib2.build_opener(proxy_hanlder)
36 |     urllib2.install_opener(opener)
37 |     try:
38 |         html = urllib2.urlopen(url, timeout=10)
39 |         soup = BeautifulSoup(html)
40 |         para_level_list = soup.find_all('div', class_='para')
41 |         if not para_level_list:
42 |             return
43 |         else:
44 |             return True
45 |     except:
46 |         print 'timed item_id...in ip_port:%s'%ip_port
47 |         return
48 | def read_one_page():
49 |     url = 'http://www.youdaili.cn/Daili/guonei/list_1.html'
50 |     html = urllib2.urlopen(url).read()
51 |     soup = BeautifulSoup(html)
52 |     ul_level_str = soup.find('ul', class_='newslist_line')
53 |     li_level_srt = ul_level_str.find('li')
54 |     print li_level_srt.a['href']
55 | 
56 | read_one_page()


--------------------------------------------------------------------------------
/douban/douban_book/result_db.py:
--------------------------------------------------------------------------------
 1 | #!-*- coding:utf-8 -*-
 2 | import os
 3 | import json
 4 | import mysql.connector
 5 | from pyspider.database.mysql.resultdb import BaseDB
 6 | 
 7 | cur_path = os.path.dirname(os.path.abspath(__file__))
 8 | 
 9 | class DoubanResultDB(BaseDB):
10 |     __tablename__ = 'douban_books'
11 | 
12 |     def __init__(self, host='localhost', port=3306, database='resultdb',
13 |                  user='root', passwd=None):
14 |         self.config_jdata = self._load_config()
15 |         host = self.config_jdata.get('host') or host
16 |         port = self.config_jdata.get('port') or port
17 |         database = self.config_jdata.get('database') or database
18 |         user = self.config_jdata.get('user') or user
19 |         passwd = self.config_jdata.get('passwd') or passwd
20 |         self.database_name = database
21 |         self.conn = mysql.connector.connect(user=user, password=passwd,
22 |                                             host=host, port=port, autocommit=True,
23 |                                             database=database)
24 | 
25 |     def _load_config(self):
26 |         config_path = os.path.join(cur_path, 'conf/mysql.json')
27 |         con = open(config_path).read()
28 |         return json.loads(con)
29 | 
30 |     @property
31 |     def dbcur(self):
32 |         try:
33 |             if self.conn.unread_result:
34 |                 self.conn.get_rows()
35 |             return self.conn.cursor()
36 |         except (mysql.connector.OperationalError, mysql.connector.InterfaceError):
37 |             self.conn.ping(reconnect=True)
38 |             self.conn.database = self.database_name
39 |             return self.conn.cursor()
40 | 
41 | 
42 | db = DoubanResultDB()
43 | 
44 | if __name__  == "__main__":
45 |     data = {
46 |         'book': 'book',
47 |         'book_id': '12312',
48 |         'author_name': 'huafeng',
49 |         'score': '9.1',
50 |         'comment_count': '13312312',
51 |         'publish_time': '2'
52 |     }
53 |     db._replace(**data)
54 |     # sql = 'REPLACE INTO `douban_books` (`book_id`) VALUES (13312312)'
55 |     # douban_db.dbcur.execute(sql)


--------------------------------------------------------------------------------
/music/tst_music.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import os
 4 | import re
 5 | import urllib2
 6 | import codecs
 7 | import time
 8 | import random
 9 | from bs4 import BeautifulSoup
10 | 
11 | PATH = os.path.dirname(os.path.abspath(__file__))
12 | 
13 | def gen_item_url():
14 |     day_hot_url = "http://music.baidu.com/top/dayhot"
15 |     new_url = "http://music.baidu.com/top/new"
16 |     url_root = 'http://music.baidu.com/'
17 |     url_compile_filter = re.compile(r'/song/[\d]+')
18 |     html = urllib2.urlopen(new_url).read()
19 |     soup = BeautifulSoup(html)
20 |     span_level_list = soup.find_all('span', class_='song-title')
21 |     # span_level_str = '''<span class="song-title " style="width: 240px;">
22 |     # <a href="/song/14385500" title="时间都去哪儿了">时间都去哪儿了</a>
23 |     # <a class="mv-icon" href="/mv/14385500"
24 |     # target="_blank" title="歌曲MV"></a></span>'''
25 |     whole_url_list = [item.a['href'] for item in span_level_list]
26 |     item_url_list = ["".join((url_root, url)) for url in whole_url_list if url_compile_filter.match(url)]
27 |     print item_url_list, len(item_url_list)
28 | # gen_item_url()
29 | def parse_one_page():
30 |     url = 'http://music.baidu.com/song/118422882'
31 |     html = urllib2.urlopen(url).read()
32 |     soup = BeautifulSoup(html)
33 |     div_level_str = soup.find('div', id='lyricCont')
34 |     music_con = div_level_str.text
35 |     splited_con = music_con.split('\n')
36 |     print len(splited_con),splited_con[20]
37 |     for i in splited_con:
38 |         print i
39 | # parse_one_page()
40 | def filter_unformal_url():
41 |     unformal_url = "http://y.baidu.com/gotoartist/song/23648?play_song=23648&pst=songList"
42 |     formal_url = '/song/118170108'
43 |     url_compile_filter = re.compile(r'/song/[\d]+')
44 |     match = url_compile_filter.match(formal_url)
45 |     if match:
46 |         print match.group()
47 | # filter_unformal_url()
48 | def write_music_content_into_file():
49 |     timestamp = time.strftime('%Y_%m_%d_%H%M%S_baidu_music.txt')
50 |     filename = os.path.join(PATH, 'out', timestamp)
51 |     print filename
52 | # write_music_content_into_file()
53 | 


--------------------------------------------------------------------------------
/mop/sys/comment_urls:
--------------------------------------------------------------------------------
 1 | http://tt.mop.com/topic/list_209_210_0_1_1.html
 2 | http://tt.mop.com/topic/list_209_210_0_2_1.html
 3 | http://tt.mop.com/topic/list_209_210_0_3_1.html
 4 | http://tt.mop.com/topic/list_1_8_0_1_1.html
 5 | http://tt.mop.com/topic/list_1_8_0_2_1.html
 6 | http://tt.mop.com/topic/list_1_8_0_3_1.html
 7 | http://tt.mop.com/topic/list_94_48_0_1_1.html
 8 | http://tt.mop.com/topic/list_94_48_0_2_1.html
 9 | http://tt.mop.com/topic/list_94_48_0_3_1.html
10 | http://tt.mop.com/topic/list_70_19_0_1_1.html
11 | http://tt.mop.com/topic/list_70_19_0_2_1.html
12 | http://tt.mop.com/topic/list_70_19_0_3_1.html
13 | http://tt.mop.com/topic/list_209_43_0_1_1.html
14 | http://tt.mop.com/topic/list_209_43_0_2_1.html
15 | http://tt.mop.com/topic/list_209_43_0_3_1.html
16 | http://tt.mop.com/topic/list_213_17_0_1_1.html
17 | http://tt.mop.com/topic/list_213_17_0_2_1.html
18 | http://tt.mop.com/topic/list_213_17_0_3_1.html
19 | http://tt.mop.com/topic/list_412_51_0_1_1.html
20 | http://tt.mop.com/topic/list_412_51_0_2_1.html
21 | http://tt.mop.com/topic/list_412_51_0_3_1.html
22 | http://tt.mop.com/topic/list_394_13_0_1_1.html
23 | http://tt.mop.com/topic/list_394_13_0_2_1.html
24 | http://tt.mop.com/topic/list_394_13_0_3_1.html
25 | http://tt.mop.com/topic/list_213_214_0_1_1.html
26 | http://tt.mop.com/topic/list_213_214_0_2_1.html
27 | http://tt.mop.com/topic/list_213_214_0_3_1.html
28 | http://tt.mop.com/topic/list_213_217_0_1_1.html
29 | http://tt.mop.com/topic/list_213_217_0_2_1.html
30 | http://tt.mop.com/topic/list_213_217_0_3_1.html
31 | http://tt.mop.com/topic/list_462_438_0_1_1.html
32 | http://tt.mop.com/topic/list_462_438_0_2_1.html
33 | http://tt.mop.com/topic/list_462_438_0_3_1.html
34 | http://tt.mop.com/topic/list_291_294_0_1_1.html
35 | http://tt.mop.com/topic/list_291_294_0_2_1.html
36 | http://tt.mop.com/topic/list_291_294_0_3_1.html
37 | http://tt.mop.com/topic/list_1_14_0_1_1.html
38 | http://tt.mop.com/topic/list_1_14_0_2_1.html
39 | http://tt.mop.com/topic/list_1_14_0_3_1.html
40 | http://tt.mop.com/topic/list_237_238_0_1_1.html
41 | http://tt.mop.com/topic/list_237_238_0_2_1.html
42 | http://tt.mop.com/topic/list_237_238_0_3_1.html
43 | 


--------------------------------------------------------------------------------
/baidu_word_freq/.idea/workspace.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="ChangeListManager">
 4 |     <option name="TRACKING_ENABLED" value="true" />
 5 |     <option name="SHOW_DIALOG" value="false" />
 6 |     <option name="HIGHLIGHT_CONFLICTS" value="true" />
 7 |     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
 8 |     <option name="LAST_RESOLUTION" value="IGNORE" />
 9 |   </component>
10 |   <component name="ChangesViewManager" flattened_view="true" show_ignored="false" />
11 |   <component name="CreatePatchCommitExecutor">
12 |     <option name="PATCH_PATH" value="" />
13 |   </component>
14 |   <component name="DaemonCodeAnalyzer">
15 |     <disable_hints />
16 |   </component>
17 |   <component name="ProjectLevelVcsManager" settingsEditedManually="false">
18 |     <OptionsSetting value="true" id="Add" />
19 |     <OptionsSetting value="true" id="Remove" />
20 |     <OptionsSetting value="true" id="Checkout" />
21 |     <OptionsSetting value="true" id="Update" />
22 |     <OptionsSetting value="true" id="Status" />
23 |     <OptionsSetting value="true" id="Edit" />
24 |     <ConfirmationsSetting value="0" id="Add" />
25 |     <ConfirmationsSetting value="0" id="Remove" />
26 |   </component>
27 |   <component name="ProjectReloadState">
28 |     <option name="STATE" value="0" />
29 |   </component>
30 |   <component name="RunManager">
31 |     <list size="0" />
32 |   </component>
33 |   <component name="ShelveChangesManager" show_recycled="false" />
34 |   <component name="TaskManager">
35 |     <task active="true" id="Default" summary="Default task">
36 |       <created>1397473323555</created>
37 |       <updated>1397473323555</updated>
38 |     </task>
39 |     <servers />
40 |   </component>
41 |   <component name="VcsContentAnnotationSettings">
42 |     <option name="myLimit" value="2678400000" />
43 |   </component>
44 |   <component name="VcsManagerConfiguration">
45 |     <option name="myTodoPanelSettings">
46 |       <TodoPanelSettings />
47 |     </option>
48 |   </component>
49 |   <component name="XDebuggerManager">
50 |     <breakpoint-manager />
51 |   </component>
52 | </project>
53 | 
54 | 


--------------------------------------------------------------------------------
/dangdang/baby_kid/baby/sys/baby_item_id:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/60313937.html
 2 | http://product.dangdang.com/60313936.html
 3 | http://product.dangdang.com/20647147.html
 4 | http://product.dangdang.com/20647146.html
 5 | http://product.dangdang.com/60203040.html
 6 | http://product.dangdang.com/60203039.html
 7 | http://product.dangdang.com/60313938.html
 8 | http://product.dangdang.com/60304433.html
 9 | http://product.dangdang.com/20647154.html
10 | http://product.dangdang.com/20226095.html
11 | http://product.dangdang.com/60067062.html
12 | http://product.dangdang.com/60566394.html
13 | http://product.dangdang.com/60304434.html
14 | http://product.dangdang.com/60142495.html
15 | http://product.dangdang.com/60313940.html
16 | http://product.dangdang.com/60552181.html
17 | http://product.dangdang.com/60067061.html
18 | http://product.dangdang.com/60313945.html
19 | http://product.dangdang.com/60555576.html
20 | http://product.dangdang.com/60203038.html
21 | http://product.dangdang.com/60313935.html
22 | http://product.dangdang.com/60067063.html
23 | http://product.dangdang.com/60200958.html
24 | http://product.dangdang.com/60272250.html
25 | http://product.dangdang.com/60272249.html
26 | http://product.dangdang.com/60567884.html
27 | http://product.dangdang.com/60272248.html
28 | http://product.dangdang.com/20226094.html
29 | http://product.dangdang.com/60332633.html
30 | http://product.dangdang.com/60555584.html
31 | http://product.dangdang.com/20161237.html
32 | http://product.dangdang.com/20226097.html
33 | http://product.dangdang.com/60555577.html
34 | http://product.dangdang.com/60272247.html
35 | http://product.dangdang.com/20226096.html
36 | http://product.dangdang.com/60034385.html
37 | http://product.dangdang.com/1291165605.html
38 | http://product.dangdang.com/60555575.html
39 | http://product.dangdang.com/1291108305.html
40 | http://product.dangdang.com/60555578.html
41 | http://product.dangdang.com/60032991.html
42 | http://product.dangdang.com/60083212.html
43 | http://product.dangdang.com/60566393.html
44 | http://product.dangdang.com/60561100.html
45 | http://product.dangdang.com/60555574.html
46 | http://product.dangdang.com/60332625.html
47 | http://product.dangdang.com/1291140005.html
48 | http://product.dangdang.com/60555585.html
49 | 


--------------------------------------------------------------------------------
/proxy/proxy.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import urllib2
 4 | import urllib
 5 | import os
 6 | import re
 7 | import socket
 8 | import time
 9 | 
10 | 
11 | def VerifyIp():
12 |     proxy = {'domain':"115.29.3.163", 'port':"80"} # invalid
13 |     url = 'http://42.96.192.46/echo_ip'
14 |     loc_ip = urllib.urlopen(url).read().strip()
15 |     print "loc_ip is :%s"%loc_ip
16 | 
17 |     start_time = time.time()
18 |     http_proxy = 'http://%s:%s/' % (proxy['domain'], proxy['port'])
19 |     proxy_support = urllib2.ProxyHandler({'http': http_proxy})
20 |     opener = urllib2.build_opener(proxy_support)
21 |     urllib2.install_opener(opener)
22 |     request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31'})
23 |     ip = urllib2.urlopen(url).read().strip()
24 |     if re.match(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ip) and ip != loc_ip:
25 |         print "yep, valid proxy"
26 |     else:
27 |         print  "sorry, invalid proxy"
28 |     end_time = time.time()
29 |     print end_time-start_time
30 | VerifyIp()
31 | 
32 | # url = 'http://42.96.192.46/echo_ip'
33 | # loc_ip = urllib.urlopen(url).read().strip()
34 | # print loc_ip
35 | 
36 | def _19lou_proxy_request():
37 |     path = "/home/huafeng/PycharmProjects/spider/19lou/page_url"
38 |     filename = os.path.join(path, 'whole_page_urls')
39 | 
40 |     proxy = {'domain':"5.45.67.109", 'port':"3128"}
41 |     http_proxy = 'http://%s:%s/' % (proxy['domain'], proxy['port'])
42 |     proxy_support = urllib2.ProxyHandler({'http':http_proxy})
43 |     opener = urllib2.build_opener(proxy_support)
44 |     urllib2.install_opener(opener)
45 | 
46 |     with open(filename) as f:
47 |         # for url in f.readlines():
48 |         for url in  [item.strip() for item in f.readlines()]:
49 |             req = urllib2.Request(url)#, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31'})
50 |             try:
51 |                 html = urllib2.urlopen(req, timeout=5).read().decode('gbk')
52 |                 print html
53 |                 # time.sleep(5)
54 |             except Exception, e:
55 |                 continue
56 |             print url
57 | 
58 | # _19lou_proxy_request()


--------------------------------------------------------------------------------
/dangdang/computer/sys/computer_item_id:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/1043432305.html
 2 | http://product.dangdang.com/1057908922.html
 3 | http://product.dangdang.com/1135115605.html
 4 | http://product.dangdang.com/1058054122.html
 5 | http://product.dangdang.com/1184350622.html
 6 | http://product.dangdang.com/1232175806.html
 7 | http://product.dangdang.com/1446446708.html
 8 | http://product.dangdang.com/60571738.html
 9 | http://product.dangdang.com/1168951005.html
10 | http://product.dangdang.com/60561695.html
11 | http://product.dangdang.com/1077089205.html
12 | http://product.dangdang.com/60563336.html
13 | http://product.dangdang.com/1154947821.html
14 | http://product.dangdang.com/1231079505.html
15 | http://product.dangdang.com/1088160507.html
16 | http://product.dangdang.com/1201605606.html
17 | http://product.dangdang.com/1203941206.html
18 | http://product.dangdang.com/1156170005.html
19 | http://product.dangdang.com/1068087406.html
20 | http://product.dangdang.com/1379089906.html
21 | http://product.dangdang.com/1011270012.html
22 | http://product.dangdang.com/60569129.html
23 | http://product.dangdang.com/1052354821.html
24 | http://product.dangdang.com/1115249321.html
25 | http://product.dangdang.com/1332081006.html
26 | http://product.dangdang.com/60559033.html
27 | http://product.dangdang.com/1151675207.html
28 | http://product.dangdang.com/1081070005.html
29 | http://product.dangdang.com/1305108905.html
30 | http://product.dangdang.com/1249138106.html
31 | http://product.dangdang.com/1249139706.html
32 | http://product.dangdang.com/1384650106.html
33 | http://product.dangdang.com/1092327506.html
34 | http://product.dangdang.com/1192917502.html
35 | http://product.dangdang.com/1266268505.html
36 | http://product.dangdang.com/1155295921.html
37 | http://product.dangdang.com/1069361805.html
38 | http://product.dangdang.com/1178970822.html
39 | http://product.dangdang.com/60552193.html
40 | http://product.dangdang.com/60556490.html
41 | http://product.dangdang.com/1313963107.html
42 | http://product.dangdang.com/1271342208.html
43 | http://product.dangdang.com/1228122905.html
44 | http://product.dangdang.com/1229043505.html
45 | http://product.dangdang.com/60547556.html
46 | http://product.dangdang.com/1270651308.html
47 | http://product.dangdang.com/1143044111.html
48 | http://product.dangdang.com/1210099205.html
49 | 


--------------------------------------------------------------------------------
/dangdang/food/sys/food_item_id:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/1292836808.html
 2 | http://product.dangdang.com/1247826808.html
 3 | http://product.dangdang.com/1033414421.html
 4 | http://product.dangdang.com/1457668908.html
 5 | http://product.dangdang.com/1027733106.html
 6 | http://product.dangdang.com/1392218408.html
 7 | http://product.dangdang.com/1247803708.html
 8 | http://product.dangdang.com/1051491122.html
 9 | http://product.dangdang.com/1045264621.html
10 | http://product.dangdang.com/1462039808.html
11 | http://product.dangdang.com/1247812508.html
12 | http://product.dangdang.com/1247825908.html
13 | http://product.dangdang.com/1462330308.html
14 | http://product.dangdang.com/1277452208.html
15 | http://product.dangdang.com/1277491008.html
16 | http://product.dangdang.com/1473498908.html
17 | http://product.dangdang.com/1262367108.html
18 | http://product.dangdang.com/1292817108.html
19 | http://product.dangdang.com/1027931006.html
20 | http://product.dangdang.com/1280534508.html
21 | http://product.dangdang.com/1262351508.html
22 | http://product.dangdang.com/1247816108.html
23 | http://product.dangdang.com/1380812808.html
24 | http://product.dangdang.com/1247828408.html
25 | http://product.dangdang.com/1413505808.html
26 | http://product.dangdang.com/1247804508.html
27 | http://product.dangdang.com/1380839408.html
28 | http://product.dangdang.com/1473475608.html
29 | http://product.dangdang.com/1247809008.html
30 | http://product.dangdang.com/1473532208.html
31 | http://product.dangdang.com/1473472908.html
32 | http://product.dangdang.com/1108626206.html
33 | http://product.dangdang.com/1473512308.html
34 | http://product.dangdang.com/1413746308.html
35 | http://product.dangdang.com/1462054008.html
36 | http://product.dangdang.com/1106580321.html
37 | http://product.dangdang.com/1296000008.html
38 | http://product.dangdang.com/1277475908.html
39 | http://product.dangdang.com/1381156408.html
40 | http://product.dangdang.com/1247812308.html
41 | http://product.dangdang.com/1277491408.html
42 | http://product.dangdang.com/1247807008.html
43 | http://product.dangdang.com/1247827808.html
44 | http://product.dangdang.com/1462112708.html
45 | http://product.dangdang.com/1463620608.html
46 | http://product.dangdang.com/1247819308.html
47 | http://product.dangdang.com/1461936808.html
48 | http://product.dangdang.com/1247818408.html
49 | 


--------------------------------------------------------------------------------
/jd/read_failed_page_url.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import os
 4 | import re
 5 | import time
 6 | import codecs
 7 | import urllib2
 8 | 
 9 | end_url_with_pagesize = True
10 | ip_port = '218.207.195.206:80'
11 | PATH = r'/mnt/data/spiders/jd/health'
12 | ITEM_FILENAME = 'freshfood_item_id'
13 | 
14 | def get_whole_item_id():
15 |     http_handler = urllib2.ProxyHandler({'http':'http://%s'%ip_port})
16 |     opener = urllib2.build_opener(http_handler)
17 |     urllib2.install_opener(opener)
18 |     first_failed_url_filename = os.path.join(PATH, 'log', 'failed_item_id')
19 |     crawled_page_url_filename = os.path.join(PATH, 'log', 're_crawled_page_url')
20 |     failed_page_url_filename = os.path.join(PATH, 'log', 're_failed_page_url')
21 |     item_id_filename = os.path.join(PATH, 'page_url', ITEM_FILENAME)
22 |     with codecs.open(first_failed_url_filename, encoding='utf-8') as first_failed_url_f,\
23 |     codecs.open(crawled_page_url_filename, mode='wb', encoding='utf-8') as crawled_page_wf, \
24 |     codecs.open(failed_page_url_filename, mode='wb', encoding='utf-8') as failed_page_url_wf, \
25 |     codecs.open(item_id_filename, mode='a', encoding='utf-8') as item_id_wf:
26 |         for page_url in [item.split(';')[1].strip() for item in first_failed_url_f.readlines()]:
27 |             try:
28 |                 html = urllib2.urlopen(page_url.strip(), timeout=15).read()
29 |             except:
30 |                 try:
31 |                     html = urllib2.urlopen(page_url.strip(), timeout=15).read()
32 |                 except:
33 |                     failed_page_url_wf.write('timeout in url;%s'%page_url)
34 |                     continue
35 |             if end_url_with_pagesize:
36 |                 item_id_list = re.findall(r'''sku=\\"(\d+)\\" selfservice''', html)
37 |             else:
38 |                 item_id_list = re.findall(r"sku='(\d+)'><div class=", html)
39 |             if not item_id_list:
40 |                 failed_page_url_wf.write('not match id_str in url;%s'%page_url)
41 |                 continue
42 |             temp_list_for_write = [item+'\n' for item in item_id_list]
43 |             # item_id_wf.writelines(temp_list_for_write)
44 |             # crawled_page_wf.write(page_url)
45 |             print temp_list_for_write
46 |             time.sleep(3)
47 | get_whole_item_id()


--------------------------------------------------------------------------------
/dangdang/baby_kid/kid/sys/kid_item_id:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/1435159408.html
 2 | http://product.dangdang.com/1373259608.html
 3 | http://product.dangdang.com/1213144322.html
 4 | http://product.dangdang.com/1428223108.html
 5 | http://product.dangdang.com/1301491906.html
 6 | http://product.dangdang.com/1396747808.html
 7 | http://product.dangdang.com/1382369608.html
 8 | http://product.dangdang.com/1076092512.html
 9 | http://product.dangdang.com/1444792208.html
10 | http://product.dangdang.com/1297388708.html
11 | http://product.dangdang.com/1221985722.html
12 | http://product.dangdang.com/1225515022.html
13 | http://product.dangdang.com/1221990822.html
14 | http://product.dangdang.com/1179528722.html
15 | http://product.dangdang.com/1307992106.html
16 | http://product.dangdang.com/1124066808.html
17 | http://product.dangdang.com/1085410406.html
18 | http://product.dangdang.com/1217851822.html
19 | http://product.dangdang.com/1229444222.html
20 | http://product.dangdang.com/1148672012.html
21 | http://product.dangdang.com/410019556.html
22 | http://product.dangdang.com/1282125106.html
23 | http://product.dangdang.com/1455257808.html
24 | http://product.dangdang.com/1381311608.html
25 | http://product.dangdang.com/1082590106.html
26 | http://product.dangdang.com/1449050508.html
27 | http://product.dangdang.com/1194833722.html
28 | http://product.dangdang.com/1379710306.html
29 | http://product.dangdang.com/410017791.html
30 | http://product.dangdang.com/410017959.html
31 | http://product.dangdang.com/1371418502.html
32 | http://product.dangdang.com/1354927506.html
33 | http://product.dangdang.com/1293280805.html
34 | http://product.dangdang.com/1296754605.html
35 | http://product.dangdang.com/1150529212.html
36 | http://product.dangdang.com/1293196505.html
37 | http://product.dangdang.com/1154314212.html
38 | http://product.dangdang.com/1190518522.html
39 | http://product.dangdang.com/1161322612.html
40 | http://product.dangdang.com/1301506706.html
41 | http://product.dangdang.com/1291339807.html
42 | http://product.dangdang.com/1159482712.html
43 | http://product.dangdang.com/1293200805.html
44 | http://product.dangdang.com/1076067512.html
45 | http://product.dangdang.com/1223641022.html
46 | http://product.dangdang.com/1318093807.html
47 | http://product.dangdang.com/1256096307.html
48 | http://product.dangdang.com/1194954211.html
49 | 


--------------------------------------------------------------------------------
/dangdang/living_auto/living/sys/living_item_id:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/1209075722.html
 2 | http://product.dangdang.com/1040595106.html
 3 | http://product.dangdang.com/1326728807.html
 4 | http://product.dangdang.com/1074710622.html
 5 | http://product.dangdang.com/1031846206.html
 6 | http://product.dangdang.com/60563840.html
 7 | http://product.dangdang.com/1087200522.html
 8 | http://product.dangdang.com/1245879906.html
 9 | http://product.dangdang.com/1074715022.html
10 | http://product.dangdang.com/1040606606.html
11 | http://product.dangdang.com/1074720922.html
12 | http://product.dangdang.com/1040596306.html
13 | http://product.dangdang.com/1208601407.html
14 | http://product.dangdang.com/1296590107.html
15 | http://product.dangdang.com/1378511908.html
16 | http://product.dangdang.com/1031410606.html
17 | http://product.dangdang.com/1378636308.html
18 | http://product.dangdang.com/1378545608.html
19 | http://product.dangdang.com/1378684908.html
20 | http://product.dangdang.com/1297556807.html
21 | http://product.dangdang.com/1456782608.html
22 | http://product.dangdang.com/1207167907.html
23 | http://product.dangdang.com/1040604406.html
24 | http://product.dangdang.com/410251344.html
25 | http://product.dangdang.com/1292340807.html
26 | http://product.dangdang.com/1333244006.html
27 | http://product.dangdang.com/1378536408.html
28 | http://product.dangdang.com/1378659108.html
29 | http://product.dangdang.com/1378698908.html
30 | http://product.dangdang.com/1378558008.html
31 | http://product.dangdang.com/1378689708.html
32 | http://product.dangdang.com/1309650307.html
33 | http://product.dangdang.com/1033331106.html
34 | http://product.dangdang.com/1298507206.html
35 | http://product.dangdang.com/1093434422.html
36 | http://product.dangdang.com/1378675108.html
37 | http://product.dangdang.com/1378531008.html
38 | http://product.dangdang.com/1229415122.html
39 | http://product.dangdang.com/1293971505.html
40 | http://product.dangdang.com/410251342.html
41 | http://product.dangdang.com/60272207.html
42 | http://product.dangdang.com/60264251.html
43 | http://product.dangdang.com/1245762906.html
44 | http://product.dangdang.com/1265456506.html
45 | http://product.dangdang.com/410251186.html
46 | http://product.dangdang.com/60563841.html
47 | http://product.dangdang.com/1291612107.html
48 | http://product.dangdang.com/1456771908.html
49 | 


--------------------------------------------------------------------------------
/dangdang/beauty_adult/adult/sys/adult_item_id:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/1300766708.html
 2 | http://product.dangdang.com/1262439205.html
 3 | http://product.dangdang.com/1300811708.html
 4 | http://product.dangdang.com/1217809806.html
 5 | http://product.dangdang.com/1164369811.html
 6 | http://product.dangdang.com/1300774508.html
 7 | http://product.dangdang.com/1300729908.html
 8 | http://product.dangdang.com/1043063508.html
 9 | http://product.dangdang.com/1134177621.html
10 | http://product.dangdang.com/1471728301.html
11 | http://product.dangdang.com/1147670301.html
12 | http://product.dangdang.com/1043091008.html
13 | http://product.dangdang.com/1035360906.html
14 | http://product.dangdang.com/1131230621.html
15 | http://product.dangdang.com/1088187406.html
16 | http://product.dangdang.com/1332967606.html
17 | http://product.dangdang.com/1043154012.html
18 | http://product.dangdang.com/1165649811.html
19 | http://product.dangdang.com/1164370111.html
20 | http://product.dangdang.com/1300784108.html
21 | http://product.dangdang.com/1066033806.html
22 | http://product.dangdang.com/1389907102.html
23 | http://product.dangdang.com/1241614202.html
24 | http://product.dangdang.com/1132865612.html
25 | http://product.dangdang.com/1049550412.html
26 | http://product.dangdang.com/1066034506.html
27 | http://product.dangdang.com/1070724512.html
28 | http://product.dangdang.com/1262439005.html
29 | http://product.dangdang.com/1122839921.html
30 | http://product.dangdang.com/1053568112.html
31 | http://product.dangdang.com/1283924001.html
32 | http://product.dangdang.com/1090847112.html
33 | http://product.dangdang.com/1300798308.html
34 | http://product.dangdang.com/1066034406.html
35 | http://product.dangdang.com/1148181605.html
36 | http://product.dangdang.com/1132865412.html
37 | http://product.dangdang.com/1164371811.html
38 | http://product.dangdang.com/1046218812.html
39 | http://product.dangdang.com/1053104112.html
40 | http://product.dangdang.com/1016472512.html
41 | http://product.dangdang.com/1300816108.html
42 | http://product.dangdang.com/1300797008.html
43 | http://product.dangdang.com/1262439105.html
44 | http://product.dangdang.com/1245770008.html
45 | http://product.dangdang.com/1168229201.html
46 | http://product.dangdang.com/1061937522.html
47 | http://product.dangdang.com/1283974301.html
48 | http://product.dangdang.com/1142358807.html
49 | 


--------------------------------------------------------------------------------
/dangdang/beauty_adult/beauty/sys/beauty_item_id:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/1340739002.html
 2 | http://product.dangdang.com/1017868311.html
 3 | http://product.dangdang.com/1034037511.html
 4 | http://product.dangdang.com/1225377622.html
 5 | http://product.dangdang.com/1010612622.html
 6 | http://product.dangdang.com/1003882321.html
 7 | http://product.dangdang.com/1159275307.html
 8 | http://product.dangdang.com/1340878002.html
 9 | http://product.dangdang.com/60439346.html
10 | http://product.dangdang.com/60329196.html
11 | http://product.dangdang.com/1056424022.html
12 | http://product.dangdang.com/1069151122.html
13 | http://product.dangdang.com/1105608622.html
14 | http://product.dangdang.com/1469671508.html
15 | http://product.dangdang.com/1341745702.html
16 | http://product.dangdang.com/60030278.html
17 | http://product.dangdang.com/1455264201.html
18 | http://product.dangdang.com/1340876302.html
19 | http://product.dangdang.com/1014881107.html
20 | http://product.dangdang.com/1141888507.html
21 | http://product.dangdang.com/1152565222.html
22 | http://product.dangdang.com/1055627207.html
23 | http://product.dangdang.com/1031719721.html
24 | http://product.dangdang.com/1073086207.html
25 | http://product.dangdang.com/1022693908.html
26 | http://product.dangdang.com/1286797708.html
27 | http://product.dangdang.com/1073683107.html
28 | http://product.dangdang.com/1136384808.html
29 | http://product.dangdang.com/1356167908.html
30 | http://product.dangdang.com/1017321606.html
31 | http://product.dangdang.com/1233264608.html
32 | http://product.dangdang.com/1114607311.html
33 | http://product.dangdang.com/1119888521.html
34 | http://product.dangdang.com/1197455906.html
35 | http://product.dangdang.com/1073429107.html
36 | http://product.dangdang.com/1015559107.html
37 | http://product.dangdang.com/1022993108.html
38 | http://product.dangdang.com/1022398008.html
39 | http://product.dangdang.com/1264651106.html
40 | http://product.dangdang.com/1109058722.html
41 | http://product.dangdang.com/1229877806.html
42 | http://product.dangdang.com/1276947305.html
43 | http://product.dangdang.com/1280579108.html
44 | http://product.dangdang.com/1178184008.html
45 | http://product.dangdang.com/1237773601.html
46 | http://product.dangdang.com/1031301007.html
47 | http://product.dangdang.com/1338754602.html
48 | http://product.dangdang.com/1340996002.html
49 | 


--------------------------------------------------------------------------------
/dangdang/living_auto/auto/sys/auto_item_id:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/1084910312.html
 2 | http://product.dangdang.com/1025782406.html
 3 | http://product.dangdang.com/1111204412.html
 4 | http://product.dangdang.com/1302741805.html
 5 | http://product.dangdang.com/1095558412.html
 6 | http://product.dangdang.com/1084993912.html
 7 | http://product.dangdang.com/1078150907.html
 8 | http://product.dangdang.com/1155786812.html
 9 | http://product.dangdang.com/1111239412.html
10 | http://product.dangdang.com/1296384007.html
11 | http://product.dangdang.com/1193878106.html
12 | http://product.dangdang.com/1096475412.html
13 | http://product.dangdang.com/1183745906.html
14 | http://product.dangdang.com/1374057002.html
15 | http://product.dangdang.com/1151934106.html
16 | http://product.dangdang.com/1469729501.html
17 | http://product.dangdang.com/1146091708.html
18 | http://product.dangdang.com/1074921706.html
19 | http://product.dangdang.com/1057624906.html
20 | http://product.dangdang.com/1378140506.html
21 | http://product.dangdang.com/1011962421.html
22 | http://product.dangdang.com/1275974106.html
23 | http://product.dangdang.com/1085864112.html
24 | http://product.dangdang.com/1378392406.html
25 | http://product.dangdang.com/1084904012.html
26 | http://product.dangdang.com/1132971512.html
27 | http://product.dangdang.com/1305082807.html
28 | http://product.dangdang.com/1148689712.html
29 | http://product.dangdang.com/1206100706.html
30 | http://product.dangdang.com/1117976012.html
31 | http://product.dangdang.com/1130937412.html
32 | http://product.dangdang.com/1093961708.html
33 | http://product.dangdang.com/1146354712.html
34 | http://product.dangdang.com/1264575606.html
35 | http://product.dangdang.com/1099939206.html
36 | http://product.dangdang.com/1044003322.html
37 | http://product.dangdang.com/1007353521.html
38 | http://product.dangdang.com/1145015005.html
39 | http://product.dangdang.com/1317633106.html
40 | http://product.dangdang.com/1058059406.html
41 | http://product.dangdang.com/1122742802.html
42 | http://product.dangdang.com/1189665806.html
43 | http://product.dangdang.com/1183229207.html
44 | http://product.dangdang.com/1050625907.html
45 | http://product.dangdang.com/1050619707.html
46 | http://product.dangdang.com/1136636411.html
47 | http://product.dangdang.com/1189219206.html
48 | http://product.dangdang.com/1085868612.html
49 | 


--------------------------------------------------------------------------------
/dangdang/mobile_digital/digital/sys/digital_item_id:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/1149770812.html
 2 | http://product.dangdang.com/1171891421.html
 3 | http://product.dangdang.com/1110724912.html
 4 | http://product.dangdang.com/1151779012.html
 5 | http://product.dangdang.com/1228452622.html
 6 | http://product.dangdang.com/1155390221.html
 7 | http://product.dangdang.com/1311362205.html
 8 | http://product.dangdang.com/1328064106.html
 9 | http://product.dangdang.com/1245567506.html
10 | http://product.dangdang.com/1101722421.html
11 | http://product.dangdang.com/1149095007.html
12 | http://product.dangdang.com/1149251512.html
13 | http://product.dangdang.com/1274165505.html
14 | http://product.dangdang.com/1228781822.html
15 | http://product.dangdang.com/1133697612.html
16 | http://product.dangdang.com/1290730006.html
17 | http://product.dangdang.com/1157502212.html
18 | http://product.dangdang.com/1108049422.html
19 | http://product.dangdang.com/1158826122.html
20 | http://product.dangdang.com/1145044812.html
21 | http://product.dangdang.com/1118924821.html
22 | http://product.dangdang.com/1134548321.html
23 | http://product.dangdang.com/1023781821.html
24 | http://product.dangdang.com/1219240122.html
25 | http://product.dangdang.com/1279533607.html
26 | http://product.dangdang.com/60557084.html
27 | http://product.dangdang.com/1110303105.html
28 | http://product.dangdang.com/1241074807.html
29 | http://product.dangdang.com/1139817612.html
30 | http://product.dangdang.com/1105818812.html
31 | http://product.dangdang.com/1101724321.html
32 | http://product.dangdang.com/1103637822.html
33 | http://product.dangdang.com/1142619712.html
34 | http://product.dangdang.com/1181760407.html
35 | http://product.dangdang.com/1136852612.html
36 | http://product.dangdang.com/1009923812.html
37 | http://product.dangdang.com/1043517805.html
38 | http://product.dangdang.com/60567110.html
39 | http://product.dangdang.com/1120051405.html
40 | http://product.dangdang.com/1240131106.html
41 | http://product.dangdang.com/1128357912.html
42 | http://product.dangdang.com/1151770721.html
43 | http://product.dangdang.com/1265624807.html
44 | http://product.dangdang.com/1110469105.html
45 | http://product.dangdang.com/1183363522.html
46 | http://product.dangdang.com/1174509507.html
47 | http://product.dangdang.com/1311446705.html
48 | http://product.dangdang.com/1131096321.html
49 | 


--------------------------------------------------------------------------------
/baidu/out/30:
--------------------------------------------------------------------------------
1 | {"url": "http://baike.baidu.com/view/30.htm", "header": "it", "content": "\u539f\u610f\u5728\u82f1\u8bed\u4e2d\u6307\u4ee3\u7269\u7684\u7b2c\u4e09\u4eba\u79f0\u5355\u6570\u53e6\u6709\u5176\u4ed6\u5355\u8bcd\u7684\u7f29\u5199\u4f8b\u5982\u4fe1\u606f\u6280\u672fInformation Technology\u5373\u65f6\u7ffb\u8bd1 instant translation\u521b\u65b0\u6280\u672finnovative technology\u7b49itpron.1.\u6307\u5df2\u63d0\u53ca\u6216\u5fc3\u76ee\u4e2d\u7684\u4eba\u6216\u4e8b\u7269\u8fd9\u90a3\u5b832.\u6307\u65e0\u751f\u547d\u7269\u52a8\u690d\u7269\u6027\u522b\u4e0d\u8be6\u7684\u5e7c\u5b69\u7b49\u5b833.\u4f5c\u65e0\u4eba\u79f0\u52a8\u8bcd\u7684\u4e3b\u8bed\u8868\u793a\u65f6\u95f4\u6c14\u5019\u8ddd\u79bb\u7b494.\u7f6e\u4e8e\u53e5\u9996\u6216\u53e5\u4e2d\u5f15\u5bfc\u540e\u9762\u7684\u77ed\u8bed\u6216\u4ece\u53e55.\u4f5c\u4e3a\u5f62\u5f0f\u4e0a\u7684\u4e3b\u8bed\u6216\u5bbe\u8bed\u7528\u4e8e\u8868\u793a\u5f3a\u8c03\u7684\u53e5\u578b\u4e2d6.\u7528\u4e8e\u67d0\u4e9b\u52a8\u8bcd\u4ecb\u8bcd\u540e\u9762\u8bcd\u4e49\u542b\u7cca\u6784\u6210\u4e60\u60ef\u8bedn.[U]1.\u6349\u8ff7\u85cf\u7b49\u6e38\u620f\u6349\u4eba\u8005\u731c\u7684\u4eba2.\u53e3\u4ec5\u7528\u4e8eginandit\u610f\u5927\u5229\u82e6\u827e\u91523.\u53e3\u5173\u952e\u91cd\u8981\u65f6\u523b4.\u53e3\u6027\u611f\u6027\u4ea45.\u7b28\u86cbthingn.\u4e8b\u4ef6\u5f62\u52bf\u4e1c\u897f\u4e8b\u7269\u5bb6\u4f19\u4e8b\u4e1a\u590d\u6570\u5f62\u5f0fthingsIT = Information Technology\u4fe1\u606f\u6280\u672fIT = internet Technology\u4e92\u8054\u7f51\u6280\u672fIT = International Tolerance\u6807\u51c6\u516c\u5dee\u56fd\u9645\u5141\u8bb8\u6b8b\u7559\u91cfIT = information theory\u4fe1\u606f\u8bbaIT = instant translation\u5373\u65f6\u7ffb\u8bd1IT = initial test\u521d\u59cb\u6d4b\u8bd5IT = integration testing\u96c6\u6210\u6d4b\u8bd5IT = income tax\u6536\u5165\u6240\u5f97\u7a0eIT = Imperial Tabacco\u5e1d\u56fd\u70df\u8349\u96c6\u56e2IT = inclusive tour\u5305\u529e\u65c5\u884cIT = individual therapy\u4e2a\u522b\u6cbb\u7597IT = innovative 
technology\u521b\u65b0\u6280\u672fIT = inspiration technology\u6fc0\u53d1\u7075\u611f\u7684\u79d1\u6280IT= International trade\u56fd\u9645\u8d38\u6613IT= intertuberal diameter\u5750\u9aa8\u7ed3\u8282\u95f4\u5f84\u53c8\u79f0\u51fa\u53e3\u6a2a\u5f84transverse outletTOIT= \u610f\u5927\u5229\u56fd\u9645\u57df\u540d\u7f29\u5199\u4e5f\u662f\u6574\u4e2a\u8ba1\u7b97\u673a\u884c\u4e1a\u5de5\u4f5c\u8005\u7684\u804c\u4e1a\u6216\u8005\u4f01\u4e1a\u8ba1\u7b97\u673a\u4ea7\u54c1\u603b\u79f0"}


--------------------------------------------------------------------------------
/proxy/tst_proxy.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import urllib2
 4 | import os
 5 | import re
 6 | import time
 7 | import codecs
 8 | from bs4 import BeautifulSoup
 9 | 
10 | PATH = os.path.dirname(os.path.abspath(__file__))
11 | 
12 | def Varify_proxy(ip_port):
13 |     url = "http://42.96.192.46/echo_ip"
14 |     http_proxy = 'http://%s'%ip_port
15 |     proxy_hanlder = urllib2.ProxyHandler({'http':http_proxy})
16 |     opener = urllib2.build_opener(proxy_hanlder)
17 |     urllib2.install_opener(opener)
18 |     try:
19 |         ip = urllib2.urlopen(url, timeout=10).read().strip()
20 |         if ip == ip_port.split(':')[0]:
21 |             print 'successed ip : %s'%ip
22 |             return True
23 |         else:
24 |             print 'failed ip : %s'%ip
25 |             return False
26 |     except:
27 |         print 'timeout request...ip:%s'%ip_port
28 |         return False
29 | # Varify_proxy('14.1.43.82:80')
30 | 
31 | def gen_proxy_ip():
32 |     start_time = time.time()
33 |     url = "http://www.youdaili.cn/Daili/http/2013.html"
34 |     html = urllib2.urlopen(url).read().decode('utf-8')
35 |     soup = BeautifulSoup(html)
36 |     div_level = soup.find('div', 'cont_font')
37 |     proxy_text = div_level.p.text
38 |     print len(proxy_text)
39 | 
40 |     # html = '''1.179.147.2:8080@HTTP#泰国
41 |     #         1.230.127.54:8080@HTTP#韩国
42 |     #         5.101.130.95:80@HTTP#俄罗斯
43 |     #         5.223.112.253:8080@HTTP#伊朗
44 |     #         14.1.43.82:80@HTTP#新西兰
45 |     #         14.18.17.166:80@HTTP#广东省广州市 上海网宿科技股份有限公司电信CDN节点
46 |     #         14.18.242.147:8080@HTTP#广东省广州市 电信'''
47 |     #
48 |     # ip_port_list = re.findall('\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\:[\d]+', html)
49 |     # # print ip_port_list, len(ip_port_list)
50 |     #
51 |     # for ip_port in ip_port_list:
52 |     #     Varify_proxy(ip_port)
53 |         # splited_ip_port = ip_port.split(':')
54 |         # assert len(splited_ip_port) is 2
55 |         # print splited_ip_port
56 |         # proxy_dic['ip'] = splited_ip_port[0]
57 |         # proxy_dic['prot'] = splited_ip_port[1]
58 | 
59 | 
60 |     # filename = os.path.join(PATH, 'page_url', 'proxy_ip_port')
61 |     # ip_port_list = [item+'\n' for item in ip_port_list]
62 |     # with codecs.open(filename, mode='wb', encoding='utf-8') as wf:
63 |     #     wf.writelines(ip_port_list)
64 |     end_time = time.time()
65 |     print end_time - start_time
66 | gen_proxy_ip()


--------------------------------------------------------------------------------
/baidu/xici_proxy.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import urllib2
 4 | import os
 5 | import re
 6 | import time
 7 | import codecs
 8 | from bs4 import BeautifulSoup
 9 | 
10 | PATH = os.path.dirname(os.path.abspath(__file__))
11 | 
12 | def request_in_baidu(ip_port):
13 |     start_time = time.time()
14 |     url = 'http://baike.baidu.com/view/3570272.htm'
15 |     http_proxy = 'http://%s'%ip_port
16 |     proxy_hanlder = urllib2.ProxyHandler({'http':http_proxy})
17 |     opener = urllib2.build_opener(proxy_hanlder)
18 |     urllib2.install_opener(opener)
19 | 
20 |     try:
21 |         html = urllib2.urlopen(url, timeout=10)
22 |         soup = BeautifulSoup(html)
23 |         para_level_list = soup.find_all('div', class_='para')
24 |         end_time = time.time()
25 |         interval = end_time - start_time
26 |         if para_level_list and interval < 5:
27 |             print 'success ip_port:%s'%ip_port
28 |             return True
29 |     except:
30 |         # print 'timed item_id...in ip_port:%s'%ip_port
31 |         return
32 | 
33 | def gen_proxy():
34 |     url_pattern = "http://www.xici.net.co/nn/%s"
35 |     url_list = [url_pattern%str(i) for i in range(1,6)]
36 |     http_proxy_list = []
37 |     for url in url_list:
38 |         try:
39 |             html = urllib2.urlopen(url, timeout=15).read()
40 |         except:
41 |             time.sleep(120)
42 |             try:
43 |                 html = urllib2.urlopen(url, timeout=15).read()
44 |             except:
45 |                 continue
46 |         soup = BeautifulSoup(html)
47 |         tr_level_list = soup.find_all('tr')
48 |         td_level_list = [item.find_all('td') for item in tr_level_list]
49 |         ip_port_type_list = [(param[1].text, param[2].text, param[5].text) for param in td_level_list if len(param)>5]
50 |         # print ip_port_type_list, len(ip_port_type_list)
51 |         matched_ip_port_list = [item for item in ip_port_type_list if item[-1] != 'HTTPS']
52 |         # print matched_ip_port_list, len(matched_ip_port_list)
53 |         com_str_list = [":".join((item[0], item[1])) for item in matched_ip_port_list]
54 |         http_proxy_list.extend(com_str_list)
55 | 
56 |     #过滤已被百度查封的IP
57 |     com_str_list = [item for item in http_proxy_list if request_in_baidu(item)]
58 |     filename = os.path.join(PATH, 'sys', 'xici_proxy')
59 |     with codecs.open(filename, mode='wb', encoding='utf-8') as wf:
60 |         com_str_list = [item+'\n' for item in com_str_list]
61 |         wf.writelines(com_str_list)
62 | gen_proxy()
63 | 
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/jd/varify_proxy.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import urllib2
 4 | import os
 5 | import re
 6 | import codecs
 7 | import time
 8 | import random
 9 | from bs4 import BeautifulSoup
10 | SLEEP_INTERVAL = random.randint(5,10)
11 | 
12 | PATH = os.path.dirname(os.path.abspath(__file__))
13 | 
14 | def varify(ip_port):
15 |     url = 'http://item.jd.com/1013330.html'
16 |     proxy_hanlder = urllib2.ProxyHandler({'http':'http://%s'%ip_port})
17 |     opener = urllib2.build_opener(proxy_hanlder)
18 |     urllib2.install_opener(opener)
19 |     try:
20 |         start_time = time.time()
21 |         html = urllib2.urlopen(url, timeout=10).read()
22 |         soup = BeautifulSoup(html)
23 |         content = soup.find('div', id='product-detail-1')
24 |         end_time = time.time()
25 |         time_consume = end_time - start_time
26 |         if not content:
27 |             print 'invalid ip_port:%s'%ip_port
28 |             return
29 |         elif content and time_consume < 1:
30 |             print 'success ip_port:%s'%ip_port
31 |             print end_time - start_time
32 |             return True
33 |     except:
34 |         print 'timed item_id...in ip_port:%s'%ip_port
35 |         return
36 | # varify('')
37 | 
38 | def write_proxy_into_file(http_proxy_list):
39 |     com_str_list = [item+'\n' for item in http_proxy_list if varify(item)]
40 |     filename = os.path.join(PATH, 'xici_proxy')
41 |     with codecs.open(filename, mode='a', encoding='utf-8') as wf:
42 |         wf.writelines(com_str_list)
43 | 
44 | def gen_proxy():
45 |     url_pattern = "http://www.xici.net.co/nn/%s"
46 |     url_list = [url_pattern%str(i) for i in range(1,6)]
47 |     http_proxy_list = []
48 |     for url in url_list:
49 |         try:
50 |             html = urllib2.urlopen(url, timeout=15).read()
51 |         except:
52 |             time.sleep(60)
53 |             try:
54 |                 html = urllib2.urlopen(url, timeout=15).read()
55 |             except:
56 |                 continue
57 |         soup = BeautifulSoup(html)
58 |         tr_level_list = soup.find_all('tr')
59 |         td_level_list = [item.find_all('td') for item in tr_level_list]
60 |         ip_port_type_list = [(param[1].text, param[2].text, param[5].text) for param in td_level_list if len(param)>5]
61 |         matched_ip_port_list = [item for item in ip_port_type_list if item[-1] != 'HTTPS']
62 |         com_str_list = [":".join((item[0], item[1])) for item in matched_ip_port_list]
63 |         http_proxy_list.extend(com_str_list)
64 |     write_proxy_into_file(http_proxy_list)
65 | if __name__ == "__main__":
66 |     gen_proxy()
67 | 
68 | 


--------------------------------------------------------------------------------
/proxy/xici.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import urllib2
 4 | import os
 5 | import re
 6 | import codecs
 7 | import time
 8 | import random
 9 | from bs4 import BeautifulSoup
10 | SLEEP_INTERVAL = random.randint(5,10)
11 | 
12 | PATH = os.path.dirname(os.path.abspath(__file__))
13 | 
14 | def request_in_douban(ip_port):
15 |     '''过滤豆瓣被封的IP'''
16 |     url = 'http://movie.douban.com/subject/6786002/'
17 |     http_proxy = 'http://%s'%ip_port
18 |     proxy_hanlder = urllib2.ProxyHandler({'http':http_proxy})
19 |     opener = urllib2.build_opener(proxy_hanlder)
20 |     urllib2.install_opener(opener)
21 |     try:
22 |         html = urllib2.urlopen(url, timeout=15)
23 |         soup = BeautifulSoup(html)
24 | 
25 |         title = soup.find('span', property='v:itemreviewed')
26 | 
27 |         if not title:
28 |             print 'invalid ip_port:%s'%ip_port
29 |             return
30 |         else:
31 |             print 'success ip_port:%s'%ip_port
32 |             return True
33 |     except:
34 |         print 'timed item_id...in ip_port:%s'%ip_port
35 |         return
36 | 
37 | def write_proxy_into_file(http_proxy_list):
38 |     #过滤已被豆瓣查封的IP
39 |     com_str_list = [item+'\n' for item in http_proxy_list if request_in_douban(item)]
40 |     filename = os.path.join(PATH, 'sys', 'xici_proxy')
41 |     with codecs.open(filename, mode='wb', encoding='utf-8') as wf:
42 |         wf.writelines(com_str_list)
43 | 
44 | def gen_proxy():
45 |     url_pattern = "http://www.xici.net.co/nn/%s"
46 |     url_list = [url_pattern%str(i) for i in range(1,4)]
47 |     http_proxy_list = []
48 |     for url in url_list:
49 |         try:
50 |             html = urllib2.urlopen(url, timeout=15).read()
51 |         except:
52 |             time.sleep(60)
53 |             try:
54 |                 html = urllib2.urlopen(url, timeout=15).read()
55 |             except:
56 |                 continue
57 |         soup = BeautifulSoup(html)
58 |         tr_level_list = soup.find_all('tr')
59 |         td_level_list = [item.find_all('td') for item in tr_level_list]
60 |         ip_port_type_list = [(param[1].text, param[2].text, param[5].text) for param in td_level_list if len(param)>5]
61 |         # print ip_port_type_list, len(ip_port_type_list)
62 |         matched_ip_port_list = [item for item in ip_port_type_list if item[-1] != 'HTTPS']
63 |         # print matched_ip_port_list, len(matched_ip_port_list)
64 |         com_str_list = [":".join((item[0], item[1])) for item in matched_ip_port_list]
65 |         http_proxy_list.extend(com_str_list)
66 | 
67 |     write_proxy_into_file(http_proxy_list)
68 |     # time.sleep(SLEEP_INTERVAL)
69 | 
70 | gen_proxy()
71 | 
72 | 


--------------------------------------------------------------------------------
/douban/xici_proxy.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import urllib2
 4 | import os
 5 | import re
 6 | import codecs
 7 | import time
 8 | import random
 9 | from bs4 import BeautifulSoup
10 | SLEEP_INTERVAL = random.randint(5,10)
11 | 
12 | PATH = os.path.dirname(os.path.abspath(__file__))
13 | 
14 | def request_in_douban(ip_port):
15 |     '''过滤豆瓣被封的IP'''
16 |     url = 'http://movie.douban.com/subject/6786002/'
17 |     http_proxy = 'http://%s'%ip_port
18 |     proxy_hanlder = urllib2.ProxyHandler({'http':http_proxy})
19 |     opener = urllib2.build_opener(proxy_hanlder)
20 |     urllib2.install_opener(opener)
21 |     try:
22 |         html = urllib2.urlopen(url, timeout=10)
23 |         soup = BeautifulSoup(html)
24 | 
25 |         title = soup.find('span', property='v:itemreviewed')
26 | 
27 |         if not title:
28 |             print 'invalid ip_port:%s'%ip_port
29 |             return
30 |         else:
31 |             print 'success ip_port:%s'%ip_port
32 |             return True
33 |     except:
34 |         print 'timed item_id...in ip_port:%s'%ip_port
35 |         return
36 | 
37 | def write_proxy_into_file(http_proxy_list):
38 |     #过滤已被豆瓣查封的IP
39 |     com_str_list = [item+'\n' for item in http_proxy_list if request_in_douban(item)]
40 |     filename = os.path.join(PATH, 'sys', 'xici_proxy')
41 |     with codecs.open(filename, mode='wb', encoding='utf-8') as wf:
42 |         wf.writelines(com_str_list)
43 | 
44 | def gen_proxy():
45 |     url_pattern = "http://www.xici.net.co/nn/%s"
46 |     url_list = [url_pattern%str(i) for i in range(1,6)]
47 |     http_proxy_list = []
48 |     for url in url_list:
49 |         try:
50 |             html = urllib2.urlopen(url, timeout=15).read()
51 |         except:
52 |             time.sleep(60)
53 |             try:
54 |                 html = urllib2.urlopen(url, timeout=15).read()
55 |             except:
56 |                 continue
57 |         soup = BeautifulSoup(html)
58 |         tr_level_list = soup.find_all('tr')
59 |         td_level_list = [item.find_all('td') for item in tr_level_list]
60 |         ip_port_type_list = [(param[1].text, param[2].text, param[5].text) for param in td_level_list if len(param)>5]
61 |         # print ip_port_type_list, len(ip_port_type_list)
62 |         matched_ip_port_list = [item for item in ip_port_type_list if item[-1] != 'HTTPS']
63 |         # print matched_ip_port_list, len(matched_ip_port_list)
64 |         com_str_list = [":".join((item[0], item[1])) for item in matched_ip_port_list]
65 |         http_proxy_list.extend(com_str_list)
66 | 
67 |     write_proxy_into_file(http_proxy_list)
68 |     # time.sleep(SLEEP_INTERVAL)
69 | 
70 | gen_proxy()
71 | 
72 | 


--------------------------------------------------------------------------------
/dangdang/jewellery_glass_watch/jewellery/sys/jewellery_item_id:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/60545118.html
 2 | http://product.dangdang.com/60329284.html
 3 | http://product.dangdang.com/60561956.html
 4 | http://product.dangdang.com/1135936507.html
 5 | http://product.dangdang.com/60326136.html
 6 | http://product.dangdang.com/60217143.html
 7 | http://product.dangdang.com/60275012.html
 8 | http://product.dangdang.com/60095332.html
 9 | http://product.dangdang.com/60296404.html
10 | http://product.dangdang.com/60312091.html
11 | http://product.dangdang.com/1102856607.html
12 | http://product.dangdang.com/1103019307.html
13 | http://product.dangdang.com/60543004.html
14 | http://product.dangdang.com/60561747.html
15 | http://product.dangdang.com/60307526.html
16 | http://product.dangdang.com/60556879.html
17 | http://product.dangdang.com/60079298.html
18 | http://product.dangdang.com/60324171.html
19 | http://product.dangdang.com/60328975.html
20 | http://product.dangdang.com/60563407.html
21 | http://product.dangdang.com/1226033608.html
22 | http://product.dangdang.com/1103073107.html
23 | http://product.dangdang.com/1205883106.html
24 | http://product.dangdang.com/60543166.html
25 | http://product.dangdang.com/60545362.html
26 | http://product.dangdang.com/1219193022.html
27 | http://product.dangdang.com/60329283.html
28 | http://product.dangdang.com/60293398.html
29 | http://product.dangdang.com/60311152.html
30 | http://product.dangdang.com/60542979.html
31 | http://product.dangdang.com/60559721.html
32 | http://product.dangdang.com/1103111307.html
33 | http://product.dangdang.com/60554268.html
34 | http://product.dangdang.com/1104669412.html
35 | http://product.dangdang.com/60313678.html
36 | http://product.dangdang.com/60564171.html
37 | http://product.dangdang.com/1175804808.html
38 | http://product.dangdang.com/60304263.html
39 | http://product.dangdang.com/60542966.html
40 | http://product.dangdang.com/1062967608.html
41 | http://product.dangdang.com/60556746.html
42 | http://product.dangdang.com/60542968.html
43 | http://product.dangdang.com/1137365805.html
44 | http://product.dangdang.com/1025107506.html
45 | http://product.dangdang.com/60571706.html
46 | http://product.dangdang.com/1062940608.html
47 | http://product.dangdang.com/1365805202.html
48 | http://product.dangdang.com/1159089022.html
49 | http://product.dangdang.com/1080116522.html
50 | http://product.dangdang.com/1108646412.html
51 | http://product.dangdang.com/1066589907.html
52 | http://product.dangdang.com/1173284507.html
53 | http://product.dangdang.com/60323221.html
54 | http://product.dangdang.com/1112093507.html
55 | http://product.dangdang.com/60020047.html
56 | http://product.dangdang.com/60079299.html
57 | http://product.dangdang.com/60567912.html
58 | http://product.dangdang.com/60547504.html
59 | 


--------------------------------------------------------------------------------
/jd/read_config.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | import re
 3 | import urllib2
 4 | import urllib
 5 | import time
 6 | import cookielib
 7 | import ConfigParser
 8 | from bs4 import BeautifulSoup
 9 | 
10 | def read_config():
11 |     config = ConfigParser.ConfigParser()
12 |     print config.read('config.ini')
13 | 
14 |     spiders_name = config.sections()
15 | 
16 |     crawled_id_filename = config.get('computer','crawled_id_filename')
17 |     print crawled_id_filename
18 | # read_config()
19 | def read_one_item_id():
20 |     url = 'http://list.jd.com/6233-6236-6254-0-0-0-0-0-0-0-1-1-625-1-1-72-4137-33.html'
21 |     ip_port = '218.207.195.206:80'
22 |     enable_proxy = False
23 |     start_time = time.time()
24 |     http_hanlder = urllib2.ProxyHandler({'http':'http://%s'%ip_port})
25 |     null_http_hanlder = urllib2.ProxyHandler({})
26 |     if enable_proxy:
27 |         opener = urllib2.build_opener(http_hanlder)
28 |     else:
29 |         opener = urllib2.build_opener(null_http_hanlder)
30 |     urllib2.install_opener(opener)
31 |     html = urllib2.urlopen(url).read()
32 |     item_id_list = re.findall(r"sku='(\d+)'><div class=", html)
33 |     print item_id_list
34 |     # soup = BeautifulSoup(html, 'html5lib')
35 |     # print soup.find('div', id='name').text
36 |     # print soup.find('div', id='product-detail-1').text
37 |     end_time = time.time()
38 |     print end_time - start_time
39 | read_one_item_id()
40 | 
def xiaomi():
    """POST a login form to the Xiaomi account endpoint and print the reply.

    The request goes through an opener that carries a cookie jar and an
    ``https`` proxy.

    NOTE(review): the account name and password are hard-coded below;
    consider loading them from outside the source tree.
    """
    login_url = 'https://account.xiaomi.com/pass/serviceLoginAuth2'
    ip_port = '218.203.54.8:80'

    cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
    proxy_support = urllib2.ProxyHandler({'https': 'https://%s' % ip_port})
    opener = urllib2.build_opener(cookie_support, proxy_support)

    form_fields = {
                'user' : 'hexinwei@baiwenbao.com',
                'pwd' : 'www.komoxo.com',
                'callback' : 'https://account.xiaomi.com' ,
                'sid' : 'passport',
                'hidden' : '',
                'qs' : '%3Fsid%3Dpassport',
                '_sign' : 'KKkRvCpZoDC+gLdeyOsdMhwV0Xg='
            }
    request = urllib2.Request(login_url, urllib.urlencode(form_fields))
    print(opener.open(request).read())
61 | # xiaomi()
def regular():
    """Demo: print the first digit run of a log line, then its url part.

    The sample line has the form ``<message>;<url>``.
    """
    sample = 'not match name_str in url;http://item.jd.com/1017375.html'
    first_number = re.search('\d+', sample)
    print(first_number.group())
    url_part = sample.split(';')[1]
    print(url_part.strip())
66 | 
def variable_scope():
    """Demo: a name bound inside a loop body keeps its value across iterations.

    ``var`` is rebound at i == 0, 2 and 5; every iteration prints ``i`` and
    the current ``var`` separated by a space.
    """
    rebind_at = {0: 'a', 2: 'b', 5: 'c'}
    for i in range(10):
        if i in rebind_at:
            var = rebind_at[i]
        print('%s %s' % (i, var))
76 | # variable_scope()


--------------------------------------------------------------------------------
/jd/read_failed_item_id.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import os
 4 | import re
 5 | import time
 6 | import codecs
 7 | import urllib2
 8 | import ConfigParser
 9 | from bs4 import BeautifulSoup
10 | 
# Directory of this script; log/ and item_id/ paths are built from it.
PATH = os.path.dirname(os.path.abspath(__file__))
# Name of the config.ini section whose options this script uses.
SPIDER_IN_CONFIG = 'computer'
# config.ini is looked up in the parent directory of this script.
CONFIG_PATH = os.path.dirname(PATH)


config = ConfigParser.ConfigParser()
config_filename = os.path.join(CONFIG_PATH, 'config.ini')
config.read(config_filename)
# SPIDER_IN_CONFIG is already a str, so it is passed as the section name directly.
crawled_id_in_log = config.get(SPIDER_IN_CONFIG, 'crawled_id_filename')
root_url_pattern = config.get('root', 'root_url_pattern')
item_id_filename = config.get(SPIDER_IN_CONFIG, 'item_id_filename')
item_content_filename = config.get(SPIDER_IN_CONFIG, 'item_content_filename')
failed_id_in_log = config.get(SPIDER_IN_CONFIG, 'failed_id_filename')
24 | 
25 | 
def write_con_into_file(con_list):
    """Append lines to the current hour's content file under ``item_id/``.

    The file is ``<PATH>/item_id/<YYYY_MM_DD_HH>_<item_content_filename>``,
    opened in append mode with UTF-8 encoding. The ``item_id`` directory is
    created when it does not exist yet.

    :param con_list: iterable of text lines to append; they are written
        as-is, so each one should already end with a newline.
    """
    # Only the date pattern goes through strftime; the configured name is
    # appended afterwards so a '%' in it is not read as a format directive.
    timestamp = time.strftime('%Y_%m_%d_%H_') + item_content_filename
    out_dir = os.path.join(PATH, 'item_id')
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    filename = os.path.join(out_dir, timestamp)
    with codecs.open(filename, 'a', encoding='utf-8') as wf:
        wf.writelines(con_list)
31 | 
def read_failed_id():
    """Re-crawl the item pages listed in the failed-id log.

    Every line of ``log/<failed_id_in_log>`` is expected to look like
    ``<message>;<url>``. Each url is fetched through a fixed HTTP proxy.
    When the page contains ``div#name``, the item name (and the text of
    ``div#product-detail-1`` when present) is passed to
    ``write_con_into_file`` and the url is appended to
    ``log/crawled_failed_id``.

    Lines without a url, urls that cannot be fetched or parsed, and pages
    without ``div#name`` are skipped so one bad entry does not stop the run.
    """
    failed_id_filename = os.path.join(PATH, 'log', failed_id_in_log)
    crawled_failed_id = os.path.join(PATH, 'log', 'crawled_failed_id')
    ip_port = '115.28.50.204:80'
    http_hanlder = urllib2.ProxyHandler({'http':'http://%s'%ip_port})
    opener = urllib2.build_opener(http_hanlder)
    with codecs.open(failed_id_filename, encoding='utf-8') as failed_id_f,\
    codecs.open(crawled_failed_id, mode='a', encoding='utf-8') as crawled_id_af:
        for line in failed_id_f.readlines():
            fields = line.split(';')
            if len(fields) < 2:
                # Blank or malformed line: there is no url to retry.
                continue
            url = fields[1].strip()
            if not url:
                continue
            try:
                # A timeout keeps one dead url from hanging the whole run.
                html = opener.open(url, timeout=30).read()
                soup = BeautifulSoup(html, 'html5lib')
            except Exception:
                print('failed to fetch url:%s' % url)
                continue
            name_str = soup.find('div', id='name')
            if name_str is None:
                continue
            item_msg_list = [name_str.text.strip() + '\n']
            detail_info = soup.find('div', id='product-detail-1')
            if detail_info is not None:
                item_msg_list.append(detail_info.text.strip() + '\n')
            # The name alone is still recorded when the detail block is absent.
            write_con_into_file(item_msg_list)
            crawled_id_af.write('%s\n'%url)
66 | read_failed_id()
67 | 


--------------------------------------------------------------------------------
/baidu_word_freq/multi_thread_get_freq.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import os
 4 | import re
 5 | import codecs
 6 | import time
 7 | import random
 8 | import urllib2
 9 | import gevent
10 | import gevent.monkey
11 | from math import ceil, floor
12 | from bs4 import BeautifulSoup
13 | gevent.monkey.patch_all()
14 | PATH = os.path.dirname(os.path.abspath(__file__))
class FreqCrawler:
    """Look up Baidu result counts for a word list using gevent greenlets.

    Words are read from ``sys/word_without_freq_0528.txt``; ``word<TAB>count``
    lines are appended to ``out/local_multi_thread_word_freq``.
    """

    def __init__(self):
        self.words_list = []
        self._gen_word_list()
        # Result lines of the batch currently being processed.
        self.word_freq_list = []

    def _gen_word_list(self):
        """Load the words (one per line, whitespace-stripped) into ``words_list``."""
        filename = os.path.join(PATH, 'sys', 'word_without_freq_0528.txt')
        with open(filename) as f:
            self.words_list.extend([item.strip() for item in f.readlines()])

    def read_item_url(self, word):
        """Query Baidu for ``word`` and append ``word<TAB>count`` to the batch.

        The count is every digit found in the first ``span.nums`` element,
        joined together; ``0`` is recorded when no such element exists.
        The request is tried twice; if both attempts fail a message is
        printed and nothing is recorded for the word.
        """
        url_pattern = 'http://www.baidu.com/s?wd="%s"&fr=wenku'
        num_pattern = re.compile(r'\d+')

        url = url_pattern%word
        html = None
        for _attempt in range(2):
            try:
                html = urllib2.urlopen(url, timeout=30).read()
                break
            except Exception:
                continue
        if html is None:
            print('request timed out in url:%s' % url)
            return
        soup = BeautifulSoup(html)
        span_level_str_list = soup.find_all('span', class_='nums')

        if span_level_str_list:
            num_text = span_level_str_list[0].get_text()
            num = "".join(num_pattern.findall(num_text))
            com_str = "\t".join((word, str(num))) + '\n'
        else:
            com_str = "\t".join((word, "0")) + '\n'
        self.word_freq_list.append(com_str)

    def write_wrod_freq(self):
        """Append the current batch of result lines to the output file."""
        filename = os.path.join(PATH, 'out', 'local_multi_thread_word_freq')
        with open(filename, mode='a') as wf:
            wf.writelines(self.word_freq_list)

    def main(self, thread_count=10000):
        """Process every word in batches of at most ``thread_count`` greenlets.

        After each batch the collected lines are written out, the batch
        buffer is cleared and the crawler pauses for two seconds.

        :param thread_count: maximum number of greenlets spawned per batch.
        """
        words_count = len(self.words_list)
        for batch_start in range(0, words_count, thread_count):
            # Slicing clamps at the end of the list, so the last (short)
            # batch no longer indexes past the available words.
            batch = self.words_list[batch_start:batch_start + thread_count]
            # A fresh list per batch: finished greenlets are not re-joined.
            threads = [gevent.spawn(self.read_item_url, word) for word in batch]
            gevent.joinall(threads)

            self.write_wrod_freq()
            self.word_freq_list[:] = []

            time.sleep(2)
75 | 
76 | freqspider = FreqCrawler()
77 | freqspider.main()
78 | 
79 | 


--------------------------------------------------------------------------------
/dangdang/sports/sys/clothing_item_id:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/1012135505.html
 2 | http://product.dangdang.com/1012080705.html
 3 | http://product.dangdang.com/1297853407.html
 4 | http://product.dangdang.com/1311625806.html
 5 | http://product.dangdang.com/1132240205.html
 6 | http://product.dangdang.com/1035697907.html
 7 | http://product.dangdang.com/1314318607.html
 8 | http://product.dangdang.com/1114969605.html
 9 | http://product.dangdang.com/1094086521.html
10 | http://product.dangdang.com/1078854807.html
11 | http://product.dangdang.com/1088650207.html
12 | http://product.dangdang.com/1162475705.html
13 | http://product.dangdang.com/1083436021.html
14 | http://product.dangdang.com/1051151321.html
15 | http://product.dangdang.com/1064596121.html
16 | http://product.dangdang.com/1118759806.html
17 | http://product.dangdang.com/1264271807.html
18 | http://product.dangdang.com/1038699708.html
19 | http://product.dangdang.com/1264364307.html
20 | http://product.dangdang.com/1088668807.html
21 | http://product.dangdang.com/1276380105.html
22 | http://product.dangdang.com/1154795208.html
23 | http://product.dangdang.com/1260640608.html
24 | http://product.dangdang.com/1270156907.html
25 | http://product.dangdang.com/1182221508.html
26 | http://product.dangdang.com/1230306508.html
27 | http://product.dangdang.com/1219681107.html
28 | http://product.dangdang.com/1190667608.html
29 | http://product.dangdang.com/1442569108.html
30 | http://product.dangdang.com/1285227308.html
31 | http://product.dangdang.com/1456602208.html
32 | http://product.dangdang.com/1098143111.html
33 | http://product.dangdang.com/1180639805.html
34 | http://product.dangdang.com/1256439108.html
35 | http://product.dangdang.com/1189912311.html
36 | http://product.dangdang.com/1199640122.html
37 | http://product.dangdang.com/1265501307.html
38 | http://product.dangdang.com/1196122211.html
39 | http://product.dangdang.com/1200987422.html
40 | http://product.dangdang.com/1302212906.html
41 | http://product.dangdang.com/1222455022.html
42 | http://product.dangdang.com/1228458022.html
43 | http://product.dangdang.com/1262657806.html
44 | http://product.dangdang.com/1247744506.html
45 | http://product.dangdang.com/1227795206.html
46 | http://product.dangdang.com/1463602508.html
47 | http://product.dangdang.com/1170608611.html
48 | http://product.dangdang.com/1153672708.html
49 | http://product.dangdang.com/1264820307.html
50 | http://product.dangdang.com/1130304407.html
51 | http://product.dangdang.com/1191252322.html
52 | http://product.dangdang.com/1284877606.html
53 | http://product.dangdang.com/1195006622.html
54 | http://product.dangdang.com/1300604105.html
55 | http://product.dangdang.com/1314317407.html
56 | http://product.dangdang.com/1249293405.html
57 | http://product.dangdang.com/1251436305.html
58 | http://product.dangdang.com/1202294222.html
59 | 


--------------------------------------------------------------------------------
/dangdang/jewellery_glass_watch/glass_watch/sys/glass_watch_item_id:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/1151272511.html
 2 | http://product.dangdang.com/1064866006.html
 3 | http://product.dangdang.com/1349716101.html
 4 | http://product.dangdang.com/1179184111.html
 5 | http://product.dangdang.com/1017009307.html
 6 | http://product.dangdang.com/1257773806.html
 7 | http://product.dangdang.com/1121110311.html
 8 | http://product.dangdang.com/1066332506.html
 9 | http://product.dangdang.com/1207797701.html
10 | http://product.dangdang.com/1137460505.html
11 | http://product.dangdang.com/1139506805.html
12 | http://product.dangdang.com/1150937811.html
13 | http://product.dangdang.com/1151163011.html
14 | http://product.dangdang.com/1150890911.html
15 | http://product.dangdang.com/1115083011.html
16 | http://product.dangdang.com/1139495905.html
17 | http://product.dangdang.com/1179870506.html
18 | http://product.dangdang.com/1114453711.html
19 | http://product.dangdang.com/1150919411.html
20 | http://product.dangdang.com/1064675206.html
21 | http://product.dangdang.com/1288485608.html
22 | http://product.dangdang.com/1211512201.html
23 | http://product.dangdang.com/1166169421.html
24 | http://product.dangdang.com/1064997606.html
25 | http://product.dangdang.com/1229202608.html
26 | http://product.dangdang.com/1170321611.html
27 | http://product.dangdang.com/1303053708.html
28 | http://product.dangdang.com/1450537201.html
29 | http://product.dangdang.com/1095087521.html
30 | http://product.dangdang.com/1180075306.html
31 | http://product.dangdang.com/1130592721.html
32 | http://product.dangdang.com/1073440211.html
33 | http://product.dangdang.com/1180073806.html
34 | http://product.dangdang.com/1100715811.html
35 | http://product.dangdang.com/1072840206.html
36 | http://product.dangdang.com/1169093511.html
37 | http://product.dangdang.com/1211432406.html
38 | http://product.dangdang.com/1031560822.html
39 | http://product.dangdang.com/1430726508.html
40 | http://product.dangdang.com/1259657407.html
41 | http://product.dangdang.com/1074535921.html
42 | http://product.dangdang.com/1124428921.html
43 | http://product.dangdang.com/1132443411.html
44 | http://product.dangdang.com/1131337811.html
45 | http://product.dangdang.com/1026744511.html
46 | http://product.dangdang.com/1481587508.html
47 | http://product.dangdang.com/1378006908.html
48 | http://product.dangdang.com/1027525301.html
49 | http://product.dangdang.com/1176704001.html
50 | http://product.dangdang.com/1261681801.html
51 | http://product.dangdang.com/1112919811.html
52 | http://product.dangdang.com/1184593811.html
53 | http://product.dangdang.com/1060832401.html
54 | http://product.dangdang.com/1222150208.html
55 | http://product.dangdang.com/1246802807.html
56 | http://product.dangdang.com/1059882401.html
57 | http://product.dangdang.com/1024968001.html
58 | http://product.dangdang.com/1027410701.html
59 | 


--------------------------------------------------------------------------------
/dangdang/book/sys/topic_url:
--------------------------------------------------------------------------------
 1 | http://category.dangdang.com/cp01.05.00.00.00.00.html
 2 | http://category.dangdang.com/cp01.43.00.00.00.00.html
 3 | http://category.dangdang.com/cp01.22.00.00.00.00.html
 4 | http://category.dangdang.com/cp01.21.00.00.00.00.html
 5 | http://category.dangdang.com/cp01.36.00.00.00.00.html
 6 | http://category.dangdang.com/cp01.07.00.00.00.00.html
 7 | http://category.dangdang.com/cp01.28.00.00.00.00.html
 8 | http://category.dangdang.com/cp01.03.00.00.00.00.html
 9 | http://category.dangdang.com/cp01.45.00.00.00.00.html
10 | http://category.dangdang.com/cp01.54.00.00.00.00.html
11 | http://category.dangdang.com/cp01.47.00.00.00.00.html
12 | http://category.dangdang.com/cp01.38.00.00.00.00.html
13 | http://category.dangdang.com/cp01.56.00.00.00.00.html
14 | http://category.dangdang.com/cp01.41.00.00.00.00.html
15 | http://category.dangdang.com/cp01.09.00.00.00.00.html
16 | http://category.dangdang.com/cp01.25.00.00.00.00.html
17 | http://category.dangdang.com/cp01.27.00.00.00.00.html
18 | http://category.dangdang.com/cp01.31.00.00.00.00.html
19 | http://category.dangdang.com/cp01.18.00.00.00.00.html
20 | http://category.dangdang.com/cp01.34.00.00.00.00.html
21 | http://category.dangdang.com/cp01.01.00.00.00.00.html
22 | http://category.dangdang.com/cp01.30.00.00.00.00.html
23 | http://category.dangdang.com/cp01.10.00.00.00.00.html
24 | http://category.dangdang.com/cp01.32.00.00.00.00.html
25 | http://category.dangdang.com/cp01.17.00.00.00.00.html
26 | http://category.dangdang.com/cp01.55.00.00.00.00.html
27 | http://category.dangdang.com/cp01.12.00.00.00.00.html
28 | http://category.dangdang.com/cp01.49.00.00.00.00.html
29 | http://category.dangdang.com/cp01.63.00.00.00.00.html
30 | http://category.dangdang.com/cp01.24.00.00.00.00.html
31 | http://category.dangdang.com/cp01.06.00.00.00.00.html
32 | http://category.dangdang.com/cp01.52.00.00.00.00.html
33 | http://category.dangdang.com/cp01.26.00.00.00.00.html
34 | http://category.dangdang.com/cp01.15.00.00.00.00.html
35 | http://category.dangdang.com/cp01.11.00.00.00.00.html
36 | http://category.dangdang.com/cp01.16.00.00.00.00.html
37 | http://category.dangdang.com/cp01.62.00.00.00.00.html
38 | http://category.dangdang.com/cp01.04.00.00.00.00.html
39 | http://category.dangdang.com/cp01.20.00.00.00.00.html
40 | http://category.dangdang.com/cp01.58.00.00.00.00.html
41 | http://category.dangdang.com/cp01.19.00.00.00.00.html
42 | http://category.dangdang.com/cp01.66.00.00.00.00.html
43 | http://category.dangdang.com/cp01.14.00.00.00.00.html
44 | http://category.dangdang.com/cp01.50.00.00.00.00.html
45 | http://category.dangdang.com/cp01.23.00.00.00.00.html
46 | http://category.dangdang.com/cp01.59.00.00.00.00.html
47 | http://category.dangdang.com/cp01.71.00.00.00.00.html
48 | http://category.dangdang.com/cp01.69.00.00.00.00.html
49 | http://category.dangdang.com/cp01.70.00.00.00.00.html
50 | 


--------------------------------------------------------------------------------
/baidu_word_freq/clear_data.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import os
 4 | import codecs
 5 | import re
 6 | 
 7 | path = os.path.dirname(os.path.abspath(__file__))
 8 | 
 9 | def pure_freq():
10 | 
11 |     filename = os.path.join(path, 'out/baidu_word_freq')
12 |     if not os.path.isfile(filename):
13 |         raise ValueError('No such file:%s'%filename)
14 |     word_freq_dic  = {}
15 |     with codecs.open(filename,encoding="utf-8") as f:
16 |         for line in f.readlines():
17 |             line = line.strip()
18 |             splited_line = line.split("\t")
19 |             if len(splited_line) is not 2:
20 |                 print len(splited_line),splited_line
21 |                 raise ValueError("length is splited line is not 2 in file:%s"%filename)
22 |             word = splited_line[0]
23 |             freq_comma = splited_line[1]
24 |             word_freq_dic[word] = freq_comma
25 | 
26 |     pure_freq_list = ["".join(re.findall("[\d]+", item)) for item in word_freq_dic.values()]
27 | 
28 |     word_freq_list = ['\t'.join(item)+'\n' for item in map(lambda x,y:(x,y), word_freq_dic.keys(),pure_freq_list)]
29 |     # com_str_list = ['\t'.join(item)+'\n' for item in word_freq_list]
30 |     # for word_fre in word_freq_list:
31 |     #     print word_fre
32 |     with codecs.open('baidu_HZOut.txt', mode='wb', encoding="utf-8")as wf:
33 |         wf.writelines(word_freq_list)
34 | # pure_freq()
35 | 
def make_word_in_order():
    """Write the Baidu frequencies in the word order of ``sys/HZOut.txt``.

    ``sys/HZOut.txt`` (UTF-16) supplies the word order: the first
    tab-separated field of every line that is neither blank nor starts with
    ``;``, keeping only the first occurrence of each word.
    ``sys/baidu_HZOut.txt`` (UTF-8, ``word<TAB>freq``) supplies the
    frequencies. The result, one ``word<TAB>freq`` line per word, is written
    to ``baidu_words_freq`` next to this script, and the number of lines is
    printed.

    :raises ValueError: if ``sys/HZOut.txt`` does not exist.
    :raises KeyError: if a word has no entry in the frequency file.
    """
    filename = os.path.join(path, 'sys/HZOut.txt')
    if not os.path.isfile(filename):
        raise ValueError('no such file:%s'%filename)

    word_in_order_list = []
    temp_word_set = set()
    with codecs.open(filename, encoding="utf-16") as f:
        for line in f.readlines():
            line = line.strip()
            if not line or line.startswith(';'):
                continue
            word = line.split("\t")[0]
            if word not in temp_word_set:
                temp_word_set.add(word)
                word_in_order_list.append(word)

    baidu_freq_file = os.path.join(path, 'sys/baidu_HZOut.txt')
    pure_word_freq_dic = {}
    with codecs.open(baidu_freq_file, encoding="utf-8") as baidu:
        for line in baidu.readlines():
            splited_line = line.rstrip('\r\n').split('\t')
            if len(splited_line) < 2:
                # Blank or malformed line: nothing to record.
                continue
            # Strip the frequency so the output newline is added explicitly
            # below and does not depend on the input's line endings.
            pure_word_freq_dic[splited_line[0]] = splited_line[1].strip()

    com_str_list = ["\t".join((item, pure_word_freq_dic[item])) + '\n'
                    for item in word_in_order_list]
    print(len(com_str_list))

    write_filename = os.path.join(path, "baidu_words_freq")
    with codecs.open(write_filename, mode="wb", encoding="utf-8") as wf:
        wf.writelines(com_str_list)
69 | 
70 | make_word_in_order()
71 | 
72 | 


--------------------------------------------------------------------------------
/baidu/tst_bd.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #encoding:utf-8
 3 | 
 4 | import os
 5 | import re
 6 | import time
 7 | import gevent
 8 | import codecs
 9 | import random
10 | import urllib2
11 | import simplejson
12 | import gevent.monkey
13 | from bs4 import BeautifulSoup
14 | from math import ceil
15 | # gevent.monkey.patch_all()
16 | 
17 | PATH = os.path.dirname(__file__)
18 | 
def parse_html(url,filecount,proxy):
    """Fetch one Baike page through ``proxy`` and store it as JSON.

    The JSON object has the keys ``header`` (text of ``div.lemmaTitleH1``),
    ``url`` and ``content`` (concatenated text of all ``div.para``
    elements) and is written to ``out/<filecount>`` under ``PATH``.
    Nothing is written when the request fails or the page has no
    ``div.lemmaTitleH1``; a message is printed instead.

    :param url: page url to fetch.
    :param filecount: output file name (a string) inside ``out/``.
    :param proxy: ``ip:port`` of the HTTP proxy to use for this request.
    """
    proxy_hanlder = urllib2.ProxyHandler({'http': 'http://%s' % proxy})
    # Use this opener directly instead of urllib2.install_opener(): the
    # installed opener is process-wide, so every call would overwrite the
    # proxy used by all other urllib2.urlopen callers.
    opener = urllib2.build_opener(proxy_hanlder)
    try:
        # The timeout keeps a dead proxy from blocking forever.
        html = opener.open(url, timeout=30).read()
    except Exception:
        print('request timed item_id ...in url:%s' % url)
        return
    soup = BeautifulSoup(html)
    header_str = soup.find('div', class_='lemmaTitleH1')
    if not header_str:
        print('url not match pattern:%s' % url)
        return
    para_level_list = soup.find_all('div', class_='para')
    content_json_dict = {
        'header': header_str.text,
        'url': url,
        'content': "".join([para.text for para in para_level_list]),
    }
    json_obj = simplejson.dumps(content_json_dict)
    filename = os.path.join(PATH, 'out', filecount)
    with codecs.open('%s'%filename, mode='wb', encoding='utf-8')as f:
        f.write(json_obj)
45 | # parse_html()
46 | 
def read_with_proxy(section_count, thread_count=200):
    """Crawl one section of consecutive Baike page ids, spread over proxies.

    Section ``k`` covers the ids ``k*thread_count + 1`` through
    ``(k+1)*thread_count``. One greenlet running ``parse_html`` is spawned
    per id; proxies are read from ``sys/xici_proxy`` (one ``ip:port`` per
    line) and each proxy serves a contiguous run of ids. The elapsed time
    in seconds is printed at the end.

    :param section_count: zero-based index of the section to crawl.
    :param thread_count: number of ids (and greenlets) per section.
    :raises ValueError: if the proxy file contains no proxy.
    """
    filename = os.path.join(PATH, 'sys', 'xici_proxy')
    with open(filename) as f:
        # Blank lines are not usable proxies.
        proxy_ip_list = [item.strip() for item in f.readlines() if item.strip()]
    proxy_count = len(proxy_ip_list)
    if not proxy_count:
        raise ValueError('no proxy found in file:%s' % filename)
    start = time.time()
    url_pattern = "http://baike.baidu.com/view/%s.htm"

    threads_per_proxy = int(ceil(thread_count/float(proxy_count)))
    first_id = 1 + thread_count*section_count

    threads = []
    for offset in range(thread_count):
        page_id = first_id + offset
        # Zero-based offset: the largest index is
        # (thread_count-1)//threads_per_proxy, which stays below proxy_count.
        ip_port = proxy_ip_list[offset // threads_per_proxy]
        url = url_pattern%str(page_id)
        threads.append(gevent.spawn(parse_html, url, str(page_id), ip_port))
    gevent.joinall(threads,timeout=90)
    end = time.time()
    print("elapsed time : %d" % (end - start))
68 | 
69 | # read_with_proxy()
def baike_content(max_id=100000000):
    """Crawl Baike page ids section by section, pausing between sections.

    ``read_with_proxy`` is called once per section of 200 ids, for
    ``max_id // 200`` sections; after each section the crawler sleeps a
    random 5-8 seconds.

    :param max_id: highest page id to cover (default: 100000000).
    """
    ids_per_section = 200  # ids handled by one read_with_proxy call
    for section in range(max_id // ids_per_section):
        read_with_proxy(section)
        # Throttle between sections; sleeping once after the loop had no effect.
        time.sleep(random.randint(5,8))
75 | # baike_content()
def parse_page_url():
    """Fetch one Tieba forum page and print the title link of its first thread.

    The page is decoded as GBK; the first ``li.j_thread_list.clearfix``
    inside ``ul#thread_list`` is searched for an ``a.j_th_tit`` element,
    which is printed.
    """
    url = "http://tieba.baidu.com/f?kw=%B5%F6%D3%E3"
    page = urllib2.urlopen(url).read().decode('gbk')
    thread_list = BeautifulSoup(page).find('ul', id='thread_list')
    rows = thread_list.find_all('li', class_='j_thread_list clearfix')
    first_row = rows[0]
    print(first_row.find('a', class_='j_th_tit'))
85 | parse_page_url()


--------------------------------------------------------------------------------
/sohu/sys/realtime_page_url:
--------------------------------------------------------------------------------
 1 | http://club.news.sohu.com/minjian/threads/p1?type=all&order=rtime
 2 | http://club.news.sohu.com/minjian/threads/p2?type=all&order=rtime
 3 | http://club.news.sohu.com/minjian/threads/p3?type=all&order=rtime
 4 | http://club.news.sohu.com/minjian/threads/p4?type=all&order=rtime
 5 | http://club.news.sohu.com/minjian/threads/p5?type=all&order=rtime
 6 | http://club.news.sohu.com/minjian/threads/p6?type=all&order=rtime
 7 | http://club.news.sohu.com/minjian/threads/p7?type=all&order=rtime
 8 | http://club.news.sohu.com/minjian/threads/p8?type=all&order=rtime
 9 | http://club.news.sohu.com/minjian/threads/p9?type=all&order=rtime
10 | http://club.news.sohu.com/minjian/threads/p10?type=all&order=rtime
11 | http://club.news.sohu.com/minjian/threads/p11?type=all&order=rtime
12 | http://club.news.sohu.com/minjian/threads/p12?type=all&order=rtime
13 | http://yule.club.sohu.com/bagua/threads/p1?type=all&order=rtime
14 | http://yule.club.sohu.com/bagua/threads/p2?type=all&order=rtime
15 | http://yule.club.sohu.com/bagua/threads/p3?type=all&order=rtime
16 | http://club.women.sohu.com/shopping/threads/p1?type=all&order=rtime
17 | http://club.women.sohu.com/shopping/threads/p2?type=all&order=rtime
18 | http://club.women.sohu.com/shopping/threads/p3?type=all&order=rtime
19 | http://club.women.sohu.com/shopping/threads/p4?type=all&order=rtime
20 | http://club.women.sohu.com/shopping/threads/p5?type=all&order=rtime
21 | http://club.women.sohu.com/shopping/threads/p6?type=all&order=rtime
22 | http://club.women.sohu.com/shopping/threads/p7?type=all&order=rtime
23 | http://club.cul.sohu.com/chuyushe/threads/p1?type=all&order=rtime
24 | http://club.cul.sohu.com/chuyushe/threads/p2?type=all&order=rtime
25 | http://club.cul.sohu.com/chuyushe/threads/p3?type=all&order=rtime
26 | http://club.baobao.sohu.com/mom_daugh/threads/p1?type=all&order=rtime
27 | http://club.baobao.sohu.com/mom_daugh/threads/p2?type=all&order=rtime
28 | http://club.baobao.sohu.com/mom_daugh/threads/p3?type=all&order=rtime
29 | http://club.baobao.sohu.com/mom_daugh/threads/p4?type=all&order=rtime
30 | http://club.baobao.sohu.com/mom_daugh/threads/p5?type=all&order=rtime
31 | http://club.baobao.sohu.com/mom_daugh/threads/p6?type=all&order=rtime
32 | http://club.sports.sohu.com/allgame/threads/p1?type=all&order=rtime
33 | http://club.sports.sohu.com/allgame/threads/p2?type=all&order=rtime
34 | http://club.chihe.sohu.com/food/threads/p1?type=all&order=rtime
35 | http://club.travel.sohu.com/togher/threads/p1?type=all&order=rtime
36 | http://club.learning.sohu.com/szhengzhi/threads/p1?type=all&order=rtime
37 | http://club.learning.sohu.com/szhengzhi/threads/p2?type=all&order=rtime
38 | http://club.learning.sohu.com/szhengzhi/threads/p3?type=all&order=rtime
39 | http://club.learning.sohu.com/szhengzhi/threads/p4?type=all&order=rtime
40 | http://club.stock.sohu.com/stock/threads/p1?type=all&order=rtime
41 | http://club.women.sohu.com/zz0894/threads/p1?type=all&order=rtime
42 | http://club.women.sohu.com/zz0894/threads/p2?type=all&order=rtime
43 | http://club.women.sohu.com/zz0894/threads/p3?type=all&order=rtime
44 | http://club.women.sohu.com/zz0894/threads/p4?type=all&order=rtime
45 | http://club.health.sohu.com/nutrition/threads/p1?type=all&order=rtime
46 | http://club.health.sohu.com/nutrition/threads/p2?type=all&order=rtime
47 | 


--------------------------------------------------------------------------------
/dangdang/book/sys/item_id_url:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/20689512.html
 2 | http://product.dangdang.com/22924035.html
 3 | http://product.dangdang.com/23445223.html
 4 | http://product.dangdang.com/23443584.html
 5 | http://product.dangdang.com/21127786.html
 6 | http://product.dangdang.com/23415029.html
 7 | http://product.dangdang.com/23202545.html
 8 | http://product.dangdang.com/23230089.html
 9 | http://product.dangdang.com/23196331.html
10 | http://product.dangdang.com/9266019.html
11 | http://product.dangdang.com/23347131.html
12 | http://product.dangdang.com/23377894.html
13 | http://product.dangdang.com/20856746.html
14 | http://product.dangdang.com/23257461.html
15 | http://product.dangdang.com/23040215.html
16 | http://product.dangdang.com/21127783.html
17 | http://product.dangdang.com/23280077.html
18 | http://product.dangdang.com/23312919.html
19 | http://product.dangdang.com/23462460.html
20 | http://product.dangdang.com/21055821.html
21 | http://product.dangdang.com/23393558.html
22 | http://product.dangdang.com/23323608.html
23 | http://product.dangdang.com/23301995.html
24 | http://product.dangdang.com/23252749.html
25 | http://product.dangdang.com/23454534.html
26 | http://product.dangdang.com/22859699.html
27 | http://product.dangdang.com/23426820.html
28 | http://product.dangdang.com/23448752.html
29 | http://product.dangdang.com/23466694.html
30 | http://product.dangdang.com/23197013.html
31 | http://product.dangdang.com/23401231.html
32 | http://product.dangdang.com/23339791.html
33 | http://product.dangdang.com/23435432.html
34 | http://product.dangdang.com/23448786.html
35 | http://product.dangdang.com/23472161.html
36 | http://product.dangdang.com/23391113.html
37 | http://product.dangdang.com/23414971.html
38 | http://product.dangdang.com/21127784.html
39 | http://product.dangdang.com/23251289.html
40 | http://product.dangdang.com/20920745.html
41 | http://product.dangdang.com/22909637.html
42 | http://product.dangdang.com/20767133.html
43 | http://product.dangdang.com/23407125.html
44 | http://product.dangdang.com/23331621.html
45 | http://product.dangdang.com/23456570.html
46 | http://product.dangdang.com/23324182.html
47 | http://product.dangdang.com/23469478.html
48 | http://product.dangdang.com/22505861.html
49 | http://product.dangdang.com/22883486.html
50 | http://product.dangdang.com/22883485.html
51 | http://product.dangdang.com/23449181.html
52 | http://product.dangdang.com/20996321.html
53 | http://product.dangdang.com/23366069.html
54 | http://product.dangdang.com/23237148.html
55 | http://product.dangdang.com/22928206.html
56 | http://product.dangdang.com/22912341.html
57 | http://product.dangdang.com/21065161.html
58 | http://product.dangdang.com/23317693.html
59 | http://product.dangdang.com/22750023.html
60 | http://product.dangdang.com/23421668.html
61 | http://product.dangdang.com/23396738.html
62 | http://product.dangdang.com/21127782.html
63 | http://product.dangdang.com/21127787.html
64 | http://product.dangdang.com/22727496.html
65 | http://product.dangdang.com/22910622.html
66 | http://product.dangdang.com/21112305.html
67 | http://product.dangdang.com/23350012.html
68 | http://product.dangdang.com/1115507906.html
69 | http://product.dangdang.com/22892522.html
70 | http://product.dangdang.com/20937459.html
71 | http://product.dangdang.com/22749480.html
72 | http://product.dangdang.com/23447641.html
73 | 


--------------------------------------------------------------------------------
/douban/douban_book/douban_books.py:
--------------------------------------------------------------------------------
 1 | #!-*- coding:utf-8 -*-
 2 | import re
 3 | import json
 4 | import codecs
 5 | import random
 6 | import time
 7 | import logging
 8 | from bs4 import BeautifulSoup
 9 | from result_db import db
10 | from pyspider.libs.base_handler import *
11 | logger = logging.getLogger("douban")
12 | 
13 | 
class Handler(BaseHandler):
    """pyspider handler that walks the Douban book tag pages.

    ``on_start`` fetches the tag index, ``topic_page`` schedules the listing
    pages of every tag, ``tag_detail_page`` extracts one record per listed
    book, and ``on_result`` stores each record through ``db._replace``.
    """

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Schedule the tag index page."""
        start_url = 'https://book.douban.com/tag/'
        self.crawl(start_url, callback=self.topic_page, force_update=True)

    @config(age=3 * 24 * 60 * 60)
    def topic_page(self, response):
        """Schedule the first 50 listing pages of every tag on the index."""
        html = response.content
        soup = BeautifulSoup(html, 'lxml')
        root_url = 'https://book.douban.com'
        for tag_col in soup.find_all('table', class_='tagCol'):
            for td in tag_col.find_all('td'):
                link = td.find('a')
                href = link.get('href') if link is not None else None
                if not href:
                    # Cell without a tag link: nothing to crawl.
                    continue
                tag_url = root_url + href
                for index in range(50):  # each topic can be paged up to 50 pages, 20 records per page
                    tag_page_url = '%s?start=%s&type=S' % (tag_url, index*20)
                    time.sleep(random.randint(0, 3))
                    self.crawl(tag_page_url, callback=self.tag_detail_page, force_update=True)

    @config(priority=2)
    def tag_detail_page(self, response):
        """Emit one result per book on a listing page.

        A callback's return value yields a single result per page, so every
        book is sent as its own message; ``on_message`` turns each message
        into a result that reaches ``on_result``. Items that cannot be
        parsed are logged and skipped without dropping the rest of the page.
        """
        try:
            soup = BeautifulSoup(response.content, 'lxml')
            subject_item_list = soup.find_all('li', class_='subject-item')
        except Exception as e:
            logger.error('=--------------err_msg=%s' % e)
            return
        for index, subject_item in enumerate(subject_item_list):
            try:
                data = self._parse_subject_item(subject_item)
            except Exception as e:
                logger.error('=--------------err_msg=%s' % e)
                continue
            # The url fragment keeps the message tasks of one page distinct.
            self.send_message(self.project_name, data,
                              url='%s#%s' % (response.url, data['book_id'] or index))

    @staticmethod
    def _parse_subject_item(subject_item):
        """Build the record dict for one ``li.subject-item`` element."""
        score = subject_item.find('span', class_='rating_nums').text
        comment_count = re.search('\d+', subject_item.find('span', class_='pl').text).group()
        title_tag = subject_item.find('h2', class_="")
        book_name = title_tag.text.strip()
        book_id_url = title_tag.a.get('href')
        book_id = re.search('\d+', book_id_url).group() if book_id_url else ''
        pub_info_list = subject_item.find('div', class_='pub').text.split('/')
        author_name, publish_time = pub_info_list[0], pub_info_list[-2]
        return {
            'book': book_name,
            'book_id': book_id,
            'author_name': author_name.strip(),
            'score': score,
            'comment_count':comment_count,
            'publish_time': publish_time
        }

    def on_message(self, project, msg):
        """Return a message sent by ``tag_detail_page`` as this task's result."""
        return msg

    def on_result(self, result):
        """Log the result and store it with ``db._replace``; empty results are ignored."""
        super(Handler, self).on_result(result)
        logger.info('------------------------%s' % result)
        assert self.task['taskid']
        assert self.task['project']
        assert self.task['url']
        if not result:
            return
        try:
            db._replace(**result)
        except BaseException as e:
            logger.info('exec db failed, err_msg=%s' % e)


--------------------------------------------------------------------------------
/baidu/filter_proxy.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import urllib2
 4 | import os
 5 | import re
 6 | import time
 7 | import codecs
 8 | from bs4 import BeautifulSoup
 9 | 
10 | PATH = os.path.dirname(os.path.abspath(__file__))
11 | VARIFY_URL = "http://42.96.192.46/echo_ip"
12 | 
def Varify_proxy(ip_port):
    """Check that a request sent through the proxy ``ip_port`` is seen as coming from it.

    Fetches ``VARIFY_URL`` through the proxy and compares the body it returns
    with the IP part of ``ip_port`` ("ip:port").

    Returns True when they match, False otherwise (mismatch, timeout or any
    other request error).

    The proxy is used through a private opener instead of
    ``urllib2.install_opener`` so that later ``urllib2.urlopen`` calls in the
    process are not silently routed through the last proxy that was tested.
    """
    http_proxy = 'http://%s' % ip_port
    proxy_hanlder = urllib2.ProxyHandler({'http': http_proxy})
    opener = urllib2.build_opener(proxy_hanlder)
    try:
        response = opener.open(VARIFY_URL, timeout=10)
        try:
            ip = response.read().strip()
        finally:
            response.close()
    except Exception:
        # Exception (not a bare except) so Ctrl-C still interrupts the run.
        print('timeout request...ip:%s' % ip_port)
        return False
    if ip == ip_port.split(':')[0]:
        print('successed ip : %s' % ip)
        return True
    print('failed ip : %s' % ip)
    return False
29 | # Varify_proxy('14.1.43.82:80')
30 | 
def request_in_baidu(ip_port):
    """Check that a Baidu Baike page can be fetched and parsed through the proxy ``ip_port``.

    Returns True when the fetched page contains at least one
    ``<div class="para">`` element, False otherwise (no such element, timeout
    or any other error).

    The proxy is used through a private opener; the process-wide default
    opener is left untouched.
    """
    url = 'http://baike.baidu.com/view/3570272.htm'
    http_proxy = 'http://%s' % ip_port
    proxy_hanlder = urllib2.ProxyHandler({'http': http_proxy})
    opener = urllib2.build_opener(proxy_hanlder)
    try:
        response = opener.open(url, timeout=10)
        try:
            soup = BeautifulSoup(response)
        finally:
            response.close()
        para_level_list = soup.find_all('div', class_='para')
    except Exception:
        # Exception (not a bare except) so Ctrl-C still interrupts the run.
        print('timed item_id...in ip_port:%s' % ip_port)
        return False
    return bool(para_level_list)
48 | # print request_in_baidu('111.47.92.226:8080')
def gen_proxy_ip():
    """Download the three youdaili proxy listing pages and print their HTML.

    Pages that cannot be fetched or decoded are reported and skipped.  The
    extraction/verification step that used to follow is currently commented
    out below this function body.
    """
    # The original list was missing a comma, so the 2nd and 3rd URLs were
    # concatenated into a single invalid URL.
    url_list = [
        "http://www.youdaili.cn/Daili/guonei/2039.html",
        "http://www.youdaili.cn/Daili/guonei/2039_2.html",
        "http://www.youdaili.cn/Daili/guonei/2039_3.html",
    ]
    for url in url_list:
        try:
            html = urllib2.urlopen(url, timeout=10).read().decode('utf-8')
            print(html)
            soup = BeautifulSoup(html)  # kept for the disabled parsing code below
        except Exception:
            print('request timed item_id...')
            continue
62 | 
63 |     #     div_level = soup.find('div', 'cont_font')
64 |     #     proxy_text = div_level.p.text
65 |     #     # html = '''1.179.147.2:8080@HTTP#泰国
66 |     #     #         14.18.242.147:8080@HTTP#广东省广州市 电信'''
67 |     #
68 |     #     ip_port_list = re.findall('\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\:[\d]+', proxy_text)
69 |     #
70 |     #     for ip_port in ip_port_list:
71 |     #         if Varify_proxy(ip_port):
72 |     #             if request_in_baidu(ip_port):
73 |     #                 proxy_list.append(ip_port)
74 |     #             else:
75 |     #                 continue
76 |     #         else:
77 |     #             continue
78 |     #
79 |     # filename = os.path.join(PATH, 'page_url', 'proxy_ip_port')
80 |     # proxy_list = [item+'\n' for item in proxy_list]
81 |     # with codecs.open(filename, mode='wb', encoding='utf-8') as wf:
82 |     #     wf.writelines(proxy_list)
83 | 
84 | # gen_proxy_ip()
def xici_proxy_check():
    """Run ``request_in_baidu`` on every line of ``sys/xici_proxy`` and print the verdict."""
    proxy_path = os.path.join(PATH, 'sys', 'xici_proxy')
    with open(proxy_path) as proxy_file:
        candidates = [line.strip() for line in proxy_file.readlines()]
        for candidate in candidates:
            if request_in_baidu(candidate):
                template = 'success :%s'
            else:
                template = 'invalid ip_port:%s'
            print(template % candidate)
93 | xici_proxy_check()


--------------------------------------------------------------------------------
/douban/douban_single_thread.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import os
 4 | import re
 5 | import time
 6 | import codecs
 7 | import urllib2
 8 | import random
 9 | from bs4 import BeautifulSoup
10 | 
11 | PATH = os.path.dirname(os.path.abspath(__file__))
12 | SLEEP_INTERVAL = random.randint(5,8)
class DoubanHistory:
    """Sequential crawler for the Douban movie pages listed in ``sys/total_item_url``.

    For every URL it collects the movie title, the link texts found in the
    ``#info`` block and the summary text.  The collected strings are buffered
    in ``movie_info_list`` and written to timestamped files under ``out/``.
    URLs that could not be fetched or parsed are written to
    ``log/timeout_item_urls`` at the end of the run.
    """

    def __init__(self):
        self.item_url_list = []      # URLs to crawl, loaded from disk
        self.timeout_url_list = []   # URLs that failed to download or had no title
        self.movie_info_list = []    # buffered output lines
        self._gen_item_url()

    def _gen_item_url(self):
        """Load the URL list (one URL per line) from ``sys/total_item_url``."""
        filename = os.path.join(PATH, 'sys', 'total_item_url')
        with codecs.open(filename, encoding='utf-8') as f:
            self.item_url_list.extend([item.strip() for item in f.readlines()])

    def write_content_into_file(self):
        """Write the buffered lines to ``out/<timestamp>.txt``, one entry per line."""
        timestamp = time.strftime("%Y_%m_%d_%H%M%S.txt")
        filename = os.path.join(PATH, 'out', timestamp)
        with codecs.open(filename, mode='wb', encoding='utf-8') as wf:
            temp_list = ["".join((item, '\n')) for item in self.movie_info_list]
            wf.writelines(temp_list)

    def _flush(self):
        """Write the buffer to disk and empty it; does nothing when the buffer is empty."""
        if self.movie_info_list:
            self.write_content_into_file()
            self.movie_info_list[:] = []

    def _fetch(self, url):
        """Download ``url`` (15 s timeout); retry once after 60 s; return None if both attempts fail."""
        try:
            return urllib2.urlopen(url, timeout=15).read()
        except Exception:
            time.sleep(60)
        try:
            return urllib2.urlopen(url, timeout=15).read()
        except Exception:
            return None

    def main(self):
        """Crawl every URL, flushing the buffer whenever it exceeds 5000 entries and once at the end."""
        for url in self.item_url_list:
            html = self._fetch(url)
            if html is None:
                self.timeout_url_list.append(url)
                continue
            soup = BeautifulSoup(html)

            # Movie title; a page without one is recorded as failed.
            title_tag = soup.find('span', property='v:itemreviewed')
            if title_tag is None:
                self.timeout_url_list.append(url)
                continue
            self.movie_info_list.append(title_tag.text)

            # Names linked from the info block (best effort, optional).
            info_div = soup.find('div', id='info')
            if info_div is not None:
                a_level_list = info_div.find_all('a', text=re.compile(r'[^\w]+'))
                self.movie_info_list.extend([item.text for item in a_level_list])

            # Movie summary (optional).
            con_div_level_str = soup.find('div', id='link-report')
            if con_div_level_str is not None:
                # Prefer the span with an empty class; otherwise fall back to
                # the "all hidden" span.
                further_content = con_div_level_str.select('span[class=""]')
                if further_content:
                    summary_tag = further_content[0]
                else:
                    summary_tag = con_div_level_str.find('span', class_='all hidden')
                # Neither span present: skip the summary instead of crashing the crawl.
                if summary_tag is not None:
                    self.movie_info_list.append(summary_tag.text.strip())

            if len(self.movie_info_list) > 5000:
                self._flush()
            time.sleep(SLEEP_INTERVAL)

        # Previously whatever remained below the 5000-entry threshold was
        # never written, losing the tail of every run.
        self._flush()

        if self.timeout_url_list:
            filename = os.path.join(PATH, 'log', 'timeout_item_urls')
            with codecs.open(filename, mode='wb', encoding='utf-8') as wf:
                temp_list = ["".join((item, '\n')) for item in self.timeout_url_list]
                wf.writelines(temp_list)
89 | 
90 | douban = DoubanHistory()
91 | douban.main()


--------------------------------------------------------------------------------
/baidu_word_freq/get_word_freq.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import urllib2
 4 | import os
 5 | import re
 6 | import codecs
 7 | import time
 8 | from bs4 import BeautifulSoup
 9 | 
10 | PATH = os.path.dirname(os.path.abspath(__file__))
class CrawlFreq:
    """Queries Baidu for a list of words and prints "<word>\\t<result count>" for each."""

    def __init__(self):
        self.words_list = []
        self._gen_words_list()

    def _gen_words_list(self):
        """Load the words (one per line) from ``sys/wrod_without_freq.txt``."""
        filename = os.path.join(PATH, 'sys', 'wrod_without_freq.txt')
        with codecs.open(filename) as f:
            temp_word_list = [item.strip() for item in f.readlines()]
            self.words_list.extend(temp_word_list)

    def _fetch(self, url):
        """Download ``url`` (15 s timeout), trying twice; return None when both attempts fail."""
        for _ in range(2):
            try:
                return urllib2.urlopen(url, timeout=15).read()
            except Exception:
                # Exception (not a bare except) so Ctrl-C still stops the crawl.
                pass
        return None

    def read_item_url(self):
        """Look every word up on Baidu and print its result count (0 when none is shown)."""
        timeout_url_list = []
        url_pattern = 'http://www.baidu.com/s?wd="%s"&fr=wenku'
        num_pattern = re.compile(r'\d+')
        filename = os.path.join(PATH, 'out', 'words_freq_0528')
        with codecs.open(filename, mode='a')as f:
            # NOTE(review): this hard-coded list replaces the words loaded from
            # disk in __init__ -- looks like a leftover from a re-run; confirm.
            self.words_list = [u'\u67cf\u79c0\u6f2b\u753b', u'\u5e2e\u4ec0\u4e48\u5fd9\u6bcf\u4e2a', u'\u5305\u5c71\u8001\u5988', u'\u523a\u9752\u6709\u8138', u'\u5361\u9e7f\u519c\u5e84', u'\u8ba9\u6cea\u5316\u4f5c\u76f8\u601d\u5440', u'\u5916\u59d3\u5144\u5f1f', u'\u665a\u4e0a\u4e0d\u7761', u'\u559c\u6b22\u505a\u7684', u'\u559c\u9a6c\u62c9\u96c5\u5c71', u'\u76f8\u8c8c\u6b66\u50e7', u'\u6cbf\u6c5f\u5927\u9053', u'\u4e00\u6839\u5934\u53d1', u'\u4e00\u89c9\u7761\u9192', u'\u4e00\u8def\u6b4c\u5531', u'\u4e00\u76f4\u4e0b\u53bb', u'\u4e00\u5b57\u4e00\u952e\u7ae0\u9c7c\u8f93\u5165', u'\u610f\u601d\u4e00\u4e0b', u'\u4f18\u8c08\u5b9d\u5b9d\u4e0d\u9519\u7684', u'\u6709\u8bdd\u597d\u597d\u8bf4', u'\u53c8\u4e0d\u597d\u5403', u'\u53c8\u4e0d\u80fd\u5403', u'\u4e0e\u4e0d\u7b11\u732b\u60f3\u53bb', u'\u5728\u7ed9\u6211\u8bf4', u'\u5728\u5bb6\u5462\u554a', u'\u5728\u54ea\u91cc\u4f4f', u'\u7cdf\u8001\u5934\u5b50', u'\u600e\u4e48\u4e0d\u56de\u6211', u'\u600e\u4e48\u8fc7\u53bb', u'\u600e\u4e48\u4e00\u4e0b', u'\u627e\u5230\u6ca1\u6709', u'\u627e\u5230\u4f60\u4e86', u'\u8fd9\u4e48\u660e\u663e', u'\u81ea\u5c0a\u5b59\u5b59', u'\u6700\u6f6e\u6728\u4e43\u4f0a', u'\u6700\u8001\u60c5\u503a', u'\u6700\u5f3a\u723d\u7ea6']
            for word in self.words_list:
                url = url_pattern%word.encode('gbk')
                html = self._fetch(url)
                if html is None:
                    timeout_url_list.append(url)
                    continue
                soup = BeautifulSoup(html)
                span_level_str_list = soup.find_all('span', class_='nums')
                if span_level_str_list:
                    # Concatenate all digit groups of the first "nums" span,
                    # which drops the thousands separators.
                    num_text = span_level_str_list[0].get_text()
                    num = "".join(num_pattern.findall(num_text))
                    com_str = "\t".join((word, str(num))) + '\n'
                else:
                    com_str = "\t".join((word, "0")) + '\n'
                print(com_str.strip())
                # Writing to the output file and the per-request delay are disabled:
                # f.write(com_str.encode('gbk'))
                # time.sleep(2)
52 |                 
53 |         # if timeout_url_list:
54 |         #     timeout_filename = os.path.join(PATH, 'timeout_url')
55 |         #     with codecs.open(timeout_filename, mode='wb') as wf:
56 |         #         timeout_url_list = [item+'\n' for item in timeout_url_list]
57 |         #         wf.writelines(timeout_url_list)
58 | 
59 | if __name__ == "__main__":
60 |     freq_crawler = CrawlFreq()
61 |     freq_crawler.read_item_url()
62 | 
63 | 


--------------------------------------------------------------------------------
/jd/book_jd/book_name.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import os
 4 | import re
 5 | import time
 6 | import random
 7 | import codecs
 8 | import urllib2
 9 | import xici_proxy
10 | from bs4 import BeautifulSoup
11 | 
12 | PATH = os.path.dirname(os.path.abspath(__file__))
def parse_topic_url():
    """Collect the category links of the JD book sort page.

    Prints and returns the slice ``[32:-2]`` of the links found inside
    ``<em>`` elements of ``div#booksort``; the printed line also shows the
    number of distinct links found.
    """
    book_url = "http://book.jd.com/booksort.html"
    page = urllib2.urlopen(book_url).read()
    sort_div = BeautifulSoup(page).find('div', id='booksort')
    all_links = []
    for em_tag in sort_div.find_all('em'):
        all_links.append(em_tag.a['href'])
    selected = all_links[32:-2]
    print('%s %s' % (selected, len(set(all_links))))
    return selected
23 | # parse_topic_url()
def gen_whole_page_url():
    """Expand every topic URL into its paginated listing URLs.

    Writes one page URL per line to ``sys/book_name_whole_page_url``.  Topic
    URLs that could not be downloaded go to ``log/timeout_topic_url`` and
    topic URLs that were redirected go to ``log/redirect_page_url`` (that
    file used to be opened but never written).  A redirected topic is still
    processed, as before.
    """
    topic_url_list = parse_topic_url()
    book_page_url_filename = os.path.join(PATH, 'sys', 'book_name_whole_page_url')
    redirect_url_filename = os.path.join(PATH, 'log', 'redirect_page_url')
    timeout_url_filename = os.path.join(PATH, 'log', 'timeout_topic_url')
    end_url_pattern = '?s=15&t=1&p=%s'
    with codecs.open(book_page_url_filename, mode='wb', encoding='utf-8') as wf_page_url, \
            codecs.open(redirect_url_filename, mode='wb', encoding='utf-8') as wf_redirect_url, \
            codecs.open(timeout_url_filename, mode='wb', encoding='utf-8') as wf_timeout_url:
        for count, topic_url in enumerate(topic_url_list, 1):
            try:
                response = urllib2.urlopen(topic_url, timeout=10)
                try:
                    final_url = response.geturl()
                    # read() can time out too, so it belongs inside the try.
                    html = response.read()
                finally:
                    response.close()
            except Exception:
                wf_timeout_url.write(topic_url+'\n')
                print('timed out item_id in url:%s' % topic_url)
                continue
            if final_url != topic_url:
                wf_redirect_url.write(topic_url+'\n')
                print('redirect page hrer in url:%s' % topic_url)

            soup = BeautifulSoup(html)
            max_page_str = soup.find('div', class_='pagin pagin-m')
            page_span = max_page_str.span if max_page_str is not None else None
            if page_span is None:
                print('max_page_str do not match regular expression in url:%s' % topic_url)
                continue
            # The span text is split on '/' and the last part is taken as the page count.
            page_size = page_span.text.split('/')[-1]
            print('%s %s' % (count, page_size))
            try:
                total_pages = int(page_size)
            except ValueError:
                # Unexpected pager text: skip this topic instead of aborting the run.
                print('max_page_str do not match regular expression in url:%s' % topic_url)
                continue
            page_url_list = [
                ''.join((topic_url, end_url_pattern % page_num)) + '\n'
                for page_num in range(1, total_pages + 1)
            ]
            wf_page_url.writelines(page_url_list)
57 | # gen_whole_page_url()
def read_topic_page_url_to_get_pagesize():
    """Fetch one fixed JD listing page, print its page count and every paginated URL."""
    page_url = 'http://list.jd.com/1713-3265-3429.html'
    soup = BeautifulSoup(urllib2.urlopen(page_url).read())

    # Item links of the product list; collected but not used further here.
    product_div = soup.find('div', id='plist')
    item_links = product_div.find_all(
        'a', href=re.compile('http://item'), class_=None, title=True, target='_blank')
    item_url_list = set(link['href'] for link in item_links)

    pager = soup.find('div', class_='pagin pagin-m')
    if not pager:
        print('max_page_str is null')
        return
    page_size = pager.span.text.split('/')[-1]
    print(page_size)
    suffix = '?s=15&t=1&p=%s'
    last_page = int(page_size)
    page_num = 1
    while page_num <= last_page:
        print(page_url + suffix % page_num)
        page_num += 1
79 | # read_topic_page_url_to_get_pagesize()


--------------------------------------------------------------------------------
/jd/book_jd/xici_proxy.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import os
 4 | import re
 5 | import time
 6 | import codecs
 7 | import urllib2
 8 | from bs4 import BeautifulSoup
 9 | 
10 | PATH = os.path.dirname(os.path.abspath(__file__))
11 | 
def Varify_proxy(ip_port):
    """Check whether the proxy ``ip_port`` can still load a JD listing page (i.e. is not banned).

    Fetches a fixed JD listing URL through the proxy and looks for product
    ``div.item`` elements carrying a ``sku`` attribute inside ``div#plist``.

    Returns True when at least one product is found, False otherwise
    (request error, timeout, undecodable page, or a page without products).
    Surrounding whitespace in ``ip_port`` (e.g. the newline of a line read
    from the proxy file) is ignored.
    """
    ip_port = ip_port.strip()
    url = 'http://list.jd.com/1713-3260-3338-0-0-0-0-0-0-0-1-1-2.html'
    proxy_hanlder = urllib2.ProxyHandler({'http':'http://%s'%ip_port})
    opener = urllib2.build_opener(proxy_hanlder)
    # NOTE(review): install_opener() changes the process-wide default opener, so
    # every later urllib2.urlopen() goes through this proxy.  Kept because other
    # modules import this one and may rely on it -- confirm before removing.
    urllib2.install_opener(opener)
    try:
        resopnse = urllib2.urlopen(url, timeout=8).read().decode('gbk')
    except urllib2.URLError as e:
        reason = getattr(e, 'reason', None)
        if reason:
            print(reason)
        return False
    except Exception:
        print('timed item_id...in ip_port:%s'%ip_port)
        return False

    soup = BeautifulSoup(resopnse)
    div_level_str = soup.find('div', id='plist')
    if div_level_str is None:
        # No product list at all: exactly the blocked case this function is
        # meant to detect; it used to crash with AttributeError here.
        return False
    div_item_list = div_level_str.find_all('div', class_='item')
    item_id_list = [item.get('sku') for item in div_item_list if item.get('sku')]

    if item_id_list:
        print('success ip_port:%s'%ip_port)
        return True
    return False
43 | 
def write_proxy_into_file(http_proxy_list):
    """Keep the proxies accepted by ``Varify_proxy`` and write them, one per line, to ``sys/xici_proxy``."""
    accepted_lines = []
    for proxy in http_proxy_list:
        if Varify_proxy(proxy):
            accepted_lines.append(proxy + '\n')
    target = os.path.join(PATH, 'sys', 'xici_proxy')
    with codecs.open(target, mode='wb', encoding='utf-8') as out:
        out.writelines(accepted_lines)
50 | 
def check_exists_proxy_file():
    """Re-verify the proxies stored in ``sys/xici_proxy`` and rewrite the file with the valid ones.

    Each line is stripped before it is handed to ``Varify_proxy`` (the raw
    lines carry their trailing newline) and blank lines are skipped.  The
    number of surviving proxies is printed.
    """
    proxy_filename = os.path.join(PATH, 'sys', 'xici_proxy')
    with codecs.open(proxy_filename, encoding='utf-8') as proxy_filename_f:
        candidates = [line.strip() for line in proxy_filename_f.readlines()]
    # Verify after the file is closed; it is rewritten in place below.
    new_proxy_list = [ip_port + '\n' for ip_port in candidates
                      if ip_port and Varify_proxy(ip_port)]
    with codecs.open(proxy_filename, mode='wb', encoding='utf-8') as new_proxy_filename_wf:
        new_proxy_filename_wf.writelines(new_proxy_list)
    print(len(new_proxy_list))
60 | 
def get_valid_proxy(proxy_list):
    """Return a list with the entries of ``proxy_list`` that ``Varify_proxy`` accepts, in order."""
    valid = []
    for candidate in proxy_list:
        if Varify_proxy(candidate):
            valid.append(candidate)
    return valid
65 | 
def gen_proxy():
    """Scrape the first three xici listing pages and store the verified proxies.

    All listing pages are downloaded first; only then are the candidates
    checked with ``Varify_proxy`` and written, one ``ip:port`` per line, to
    ``sys/xici_proxy``.  Fetching first matters because ``Varify_proxy``
    installs each tested proxy as the default opener, which would otherwise
    be used for the remaining listing pages.  When no candidate could be
    collected the existing file is left untouched instead of being truncated.
    """
    url_pattern = "http://www.xici.net.co/nn/%s"
    url_list = [url_pattern%str(i) for i in range(1,4)]

    candidates = []
    for url in url_list:
        try:
            html = urllib2.urlopen(url, timeout=15).read()
        except Exception:
            # One retry after a minute, then give up on this page.
            time.sleep(60)
            try:
                html = urllib2.urlopen(url, timeout=15).read()
            except Exception:
                continue
        soup = BeautifulSoup(html)
        td_level_list = [item.find_all('td') for item in soup.find_all('tr')]
        # Table cells 1, 2 and 5 are read as IP, port and protocol type.
        ip_port_type_list = [(param[1].text, param[2].text, param[5].text)
                             for param in td_level_list if len(param)>5]
        candidates.extend(
            ":".join((item[0], item[1]))
            for item in ip_port_type_list if item[-1] != 'HTTPS'
        )

    if not candidates:
        return

    filename = os.path.join(PATH, 'sys', 'xici_proxy')
    with codecs.open(filename, mode='wb', encoding='utf-8') as wf:
        for ip_port in candidates:
            if Varify_proxy(ip_port):
                wf.write(ip_port + '\n')
89 | 
90 | if __name__ == "__main__":
91 |     # gen_proxy()
92 |     check_exists_proxy_file()


--------------------------------------------------------------------------------
/baidu_word_freq/tst_freq.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'huafeng'
  2 | #coding:utf-8
  3 | import urllib2
  4 | import os
  5 | import re
  6 | import codecs
  7 | import time
  8 | from bs4 import BeautifulSoup
  9 | 
 10 | PATH = os.path.dirname(os.path.abspath(__file__))
 11 | class CrawlFreq:
 12 | 
 13 |     def __init__(self):
 14 |         self.words_list = []
 15 |         self._gen_words_list()
 16 | 
 17 |     def _gen_words_list(self):
 18 |         filename = os.path.join(PATH, 'sys', 'wrod_without_freq.txt')
 19 |         with codecs.open(filename) as f:
 20 |             temp_word_list = [item.strip() for item in f.readlines()]
 21 |             self.words_list.extend(temp_word_list)
 22 | 
 23 |     def read_item_url(self):
 24 |         timeout_url_list = []
 25 |         url_pattern = 'http://www.baidu.com/s?wd="%s"&fr=wenku'
 26 |         num_pattern = re.compile(r'\d+')
 27 |         filename = os.path.join(PATH, 'out', 'words_freq_0507')
 28 |         with codecs.open(filename, mode='wb')as f:
 29 |             for word in self.words_list:
 30 |                 url = url_pattern%word
 31 |                 try:
 32 |                     html = urllib2.urlopen(url, timeout=15).read()
 33 |                 except:
 34 |                     try:
 35 |                         html = urllib2.urlopen(url, timeout=15).read()
 36 |                     except:
 37 |                         timeout_url_list.append(url)
 38 |                         continue
 39 |                 soup = BeautifulSoup(html)
 40 |                 span_level_str_list = soup.find_all('span', class_='nums')
 41 |                 if span_level_str_list:
 42 |                     span_level_str = span_level_str_list[0]
 43 |                     num_text = span_level_str.get_text()
 44 |                     num = "".join(num_pattern.findall(num_text))
 45 |                     com_str = "\t".join((word, str(num))) + '\n'
 46 |                 else:
 47 |                     com_str = "\t".join((word, "0")) + '\n'
 48 |                 f.write(com_str)
 49 |                 time.sleep(2)
 50 | 
 51 |         if timeout_url_list:
 52 |             timeout_filename = os.path.join(PATH, 'timeout_url')
 53 |             with codecs.open(timeout_filename, mode='wb') as wf:
 54 |                 timeout_url_list = [item+'\n' for item in timeout_url_list]
 55 |                 wf.writelines(timeout_url_list)
 56 | # freq_crawler = CrawlFreq()
 57 | # freq_crawler.read_item_url()
 58 | 
 59 | def use_num_pattern():
 60 |     s = "百度为您找到相关结果约8,080,000个"
 61 |     num_list = re.findall(r'\d+',s)
 62 |     num = "".join(num_list)
 63 |     print num
 64 | def read_unformal_page():
 65 |     url = 'http://www.baidu.com/s?wd="一字一键章鱼输入"&fr=wenku'
 66 |     html = urllib2.urlopen(url).read()
 67 |     soup = BeautifulSoup(html)
 68 |     span_level_str_list = soup.find_all('span', class_='nums')
 69 |     print span_level_str_list
 70 | # read_unformal_page()
 71 | def get_omit_word():
 72 |     source_filename = os.path.join(PATH, 'sys', 'word_without_freq_0528.txt')
 73 |     with codecs.open(source_filename) as f:
 74 |         word_set = set([item.strip() for item in f.readlines()])
 75 |         # for word in word_set:
 76 |         #     print type(word)
 77 | 
 78 |     des_filename = os.path.join(PATH, 'out', 'words_freq_0528')
 79 |     with codecs.open(des_filename) as rf:
 80 |         word_list = set([item.split('\t')[0] for item in rf.readlines()])
 81 | 
 82 |     # print len(word_list), len(word_set)
 83 |     # param = word_set-word_list
 84 |     # print list(param)[0].decode('gbk')
 85 | 
 86 |     for word in word_set:
 87 |         if word not in word_list:
 88 |             print word.decode('gbk')
 89 | # get_omit_word()
 90 | def check_zero_num():
 91 |     filename = os.path.join(PATH, 'out', 'words_freq_0528')
 92 |     with open(filename) as f:
 93 |         zero_freq_list = []
 94 |         for line in f.readlines():
 95 |             splited_line = line.split('\t')
 96 |             word = splited_line[0]
 97 |             freq = splited_line[1].strip()
 98 |             if freq == '0':
 99 |                 print line.decode('gbk')
100 |                 zero_freq_list.append(word.decode('gbk'))
101 |         print zero_freq_list, len(zero_freq_list)
102 | # check_zero_num()


--------------------------------------------------------------------------------
/jd/electronic_jd/log/elec_failed_url:
--------------------------------------------------------------------------------
 1 | div do not match pattern in url;http://item.jd.com/1125519418.html
 2 | div do not match pattern in url;http://item.jd.com/1096499132.html
 3 | div do not match pattern in url;http://item.jd.com/1014261178.html
 4 | div do not match pattern in url;http://item.jd.com/1052779534.html
 5 | div do not match pattern in url;http://item.jd.com/1051365121.html
 6 | div do not match pattern in url;http://item.jd.com/1052636010.html
 7 | div do not match pattern in url;http://item.jd.com/1052714871.html
 8 | div do not match pattern in url;http://item.jd.com/1052747289.html
 9 | div do not match pattern in url;http://item.jd.com/1052785449.html
10 | div do not match pattern in url;http://item.jd.com/1053007633.html
11 | div do not match pattern in url;http://item.jd.com/1053011719.html
12 | div do not match pattern in url;http://item.jd.com/1053021423.html
13 | div do not match pattern in url;http://item.jd.com/1053050579.html
14 | div do not match pattern in url;http://item.jd.com/1056009851.html
15 | div do not match pattern in url;http://item.jd.com/1062536835.html
16 | div do not match pattern in url;http://item.jd.com/1064935912.html
17 | div do not match pattern in url;http://item.jd.com/1066380089.html
18 | div do not match pattern in url;http://item.jd.com/1073344556.html
19 | div do not match pattern in url;http://item.jd.com/1082487043.html
20 | div do not match pattern in url;http://item.jd.com/646259.html
21 | div do not match pattern in url;http://item.jd.com/968372.html
22 | div do not match pattern in url;http://item.jd.com/968371.html
23 | div do not match pattern in url;http://item.jd.com/968373.html
24 | div do not match pattern in url;http://item.jd.com/1049616669.html
25 | div do not match pattern in url;http://item.jd.com/1049604169.html
26 | div do not match pattern in url;http://item.jd.com/1049713716.html
27 | div do not match pattern in url;http://item.jd.com/1049901825.html
28 | div do not match pattern in url;http://item.jd.com/1049864281.html
29 | div do not match pattern in url;http://item.jd.com/1049998153.html
30 | div do not match pattern in url;http://item.jd.com/1067645985.html
31 | div do not match pattern in url;http://item.jd.com/1067645986.html
32 | div do not match pattern in url;http://item.jd.com/1025080.html
33 | div do not match pattern in url;http://item.jd.com/1000297153.html
34 | div do not match pattern in url;http://item.jd.com/1010819419.html
35 | div do not match pattern in url;http://item.jd.com/1132494073.html
36 | div do not match pattern in url;http://item.jd.com/600316.html
37 | div do not match pattern in url;http://item.jd.com/600318.html
38 | div do not match pattern in url;http://item.jd.com/854174.html
39 | div do not match pattern in url;http://item.jd.com/1065790112.html
40 | div do not match pattern in url;http://item.jd.com/1068210793.html
41 | div do not match pattern in url;http://item.jd.com/1068210794.html
42 | div do not match pattern in url;http://item.jd.com/1068210795.html
43 | div do not match pattern in url;http://item.jd.com/1010471.html
44 | div do not match pattern in url;http://item.jd.com/271696.html
45 | div do not match pattern in url;http://item.jd.com/481325.html
46 | div do not match pattern in url;http://item.jd.com/1003116056.html
47 | div do not match pattern in url;http://item.jd.com/1041219087.html
48 | div do not match pattern in url;http://item.jd.com/1062607890.html
49 | div do not match pattern in url;http://item.jd.com/1091726881.html
50 | div do not match pattern in url;http://item.jd.com/1091729610.html
51 | div do not match pattern in url;http://item.jd.com/1031478255.html
52 | div do not match pattern in url;http://item.jd.com/1052374448.html
53 | div do not match pattern in url;http://item.jd.com/1059701191.html
54 | div do not match pattern in url;http://item.jd.com/904326.html
55 | div do not match pattern in url;http://item.jd.com/848334.html
56 | div do not match pattern in url;http://item.jd.com/518467.html
57 | div do not match pattern in url;http://item.jd.com/904327.html
58 | div do not match pattern in url;http://item.jd.com/1031373.html
59 | div do not match pattern in url;http://item.jd.com/925068.html
60 | div do not match pattern in url;http://item.jd.com/842360.html
61 | div do not match pattern in url;http://item.jd.com/988689.html
62 | div do not match pattern in url;http://item.jd.com/881212.html
63 | div do not match pattern in url;http://item.jd.com/881190.html
64 | 


--------------------------------------------------------------------------------
/baidu/out/50:
--------------------------------------------------------------------------------
1 | {"url": "http://baike.baidu.com/view/50.htm", "header": "\u955c\u50cf\u7ad9\u70b9", "content": "\u955c\u50cf\uff0c\u539f\u610f\u662f\u5149\u5b66\u91cc\u6307\u7684\u7269\u4f53\u5728\u955c\u9762\u4e2d\u6240\u6210\u4e4b\u50cf\u3002\u5f15\u7528\u5230\u8ba1\u7b97\u673a\u7f51\u7edc\u4e0a\uff0c\u4e00\u4e2a\u955c\u50cf\u7ad9\u70b9\uff08\u6216\u79f0\u955c\u50cf\uff09\u662f\u6307\u53e6\u4e00\u4e2a\u7ad9\u70b9\u5185\u5bb9\u7684\u62f7\u8d1d\u3002\u955c\u50cf\u901a\u5e38\u7528\u4e8e\u4e3a\u76f8\u540c\u4fe1\u606f\u5185\u5bb9\u63d0\u4f9b\u4e0d\u540c\u7684\u6e90\uff0c\u7279\u522b\u662f\u5728\u4e0b\u8f7d\u91cf\u5927\u7684\u65f6\u5019\u63d0\u4f9b\u4e86\u4e00\u79cd\u53ef\u9760\u7684\u7f51\u7edc\u8fde\u63a5\u3002\u5236\u4f5c\u955c\u50cf\u662f\u4e00\u79cd\u6587\u4ef6\u540c\u6b65\u7684\u8fc7\u7a0b\u3002\u521b\u5efa\u955c\u50cf\u7684\u76ee\u7684\u901a\u5e38\u6709\u4ee5\u4e0b\u51e0\u4e2a\uff1a\u4fdd\u5b58\u7f51\u9875\u4fe1\u606f\uff0c\u7279\u522b\u662f\u5728\u4e00\u4e2a\u7f51\u7ad9\u9762\u4e34\u5173\u7ad9\u7684\u65f6\u5019\u3002\u63d0\u9ad8\u7528\u6237\u5728\u67d0\u4e2a\u5730\u533a\u7684\u4e0b\u8f7d\u901f\u5ea6\u3002\u8b6c\u5982\u4e00\u4e2a\u7f8e\u56fd\u7f51\u7ad9\u7684\u4e2d\u56fd\u955c\u50cf\u53ef\u4ee5\u4f7f\u6765\u81ea\u4e2d\u56fd\u7684\u7528\u6237\u76f4\u63a5\u4ece\u8fd9\u4e2a\u4e2d\u56fd\u7684\u955c\u50cf\u8bbf\u95ee\uff0c\u4ece\u800c\u52a0\u5feb\u4e86\u901f\u5ea6\u3002\u8fd9\u53ef\u4ee5\u770b\u4f5c\u662f\u4e00\u79cd\u5168\u7403\u8303\u56f4\u7684\u7f13\u5b58\u3002\u5bf9\u4e0d\u53ef\u7528\u7684\u4fe1\u606f\u63d0\u4f9b\u8fde\u63a5\u9014\u5f84\u3002\u4f8b\u5982\uff0c2002\u5e74\u7684\u65f6\u5019\u4e2d\u56fd\u5bf9Google\u5c01\u9501\u7684\u65f6\u5019\uff0c\u955c\u50cf\u7ad9\u70b9google\u6210\u4e3a\u4e86\u6709\u6548\u7684\u7ed5\u8fc7\u5c01\u9501\u7684\u9014\u5f84\u3002\u4fdd\u5b58\u5386\u53f2\u6027\u7684\u6570\u636e\u3001\u4fe1\u606f\uff0c\u9632\u6b62\u4e22\u5931\u3002\u5e73\u8861\u7f51\u7ad9\u7684\u6d41\u91cf\u8d1f\u8f7d\u3002\u4f8b\u5982\uff0c\u4e00\u4e2aLinux\u7684\u53d1\u884c\u7248\
u7684ISO\u955c\u50cf\u6587\u4ef6\u7684\u5927\u91cf\u4e0b\u8f7d\u53ef\u80fd\u4f1a\u5bfc\u81f4\u4e3b\u7ad9\u8fc7\u8377\u800c\u4e0b\u7ebf\uff0c\u800c\u955c\u50cf\u7ad9\u70b9\u5219\u53ef\u4ee5\u5728\u591a\u4e2a\u670d\u52a1\u5668\u5206\u62c5\u6d41\u91cf\uff0c\u4ece\u800c\u4fdd\u8bc1\u7ad9\u70b9\u7684\u6301\u7eed\u5728\u7ebf\u3002\u5bf9\u56e0\u6d41\u91cf\u6fc0\u589e\u800c\u4e0b\u7ebf\u7684\u7ad9\u70b9\u63d0\u4f9b\u4e34\u65f6\u7684\u8bbf\u95ee\u3002\u4fbf\u4e8e\u4e0d\u540c\u5730\u533a\u7684\u4eba\u83b7\u5f97\u66f4\u5feb\u7684\u8bbf\u95ee\u901f\u5ea6\u3002\u907f\u5f00\u5bf9\u4e8e\u4e3b\u7f51\u7ad9\u7684\u5ba1\u67e5\u548c\u5c4f\u853d\u3002\u521b\u5efa\u955c\u50cf\u7ad9\u70b9\u901a\u5e38\u662f\u7531\u4e8e\u7f51\u7edc\u5e26\u5bbd\u9650\u5236\u3001\u5c01\u9501\u6216\u662f\u5176\u4ed6\u539f\u56e0\uff0c\u5bfc\u81f4\u65e0\u6cd5\u5b9e\u73b0\u5bf9\u4e3b\u7ad9\u70b9\u7684\u6b63\u5e38\u8bbf\u95ee\u3002\u8fd9\u65f6\u901a\u8fc7\u5c06\u4e3b\u7ad9\u70b9\u7684\u4fe1\u606f\u8d44\u6e90\u79fb\u690d\u8f6c\u79fb\u5230\u76f8\u5bf9\u5bb9\u6613\u8bbf\u95ee\u7684\u672c\u5730\u670d\u52a1\u5668\uff0c\u4ee5\u63d0\u9ad8\u7528\u6237\u7684\u8bbf\u95ee\u6548\u7387\u3002\u5e38\u89c1\u4e8e\u5185\u90e8\u7f51\u3001\u6821\u56ed\u7f51\u6216\u662f\u5176\u4ed6\u6709\u8f83\u5927\u5b58\u50a8\u8bbe\u5907\u7684\u670d\u52a1\u5668\u3002\u8f83\u5c0f\u7684\u7f51\u7ad9\u6709\u65f6\u4e5f\u4ee5\u955c\u50cf\u6765\u5e94\u5bf9\u5c01\u9501\u3001\u5c4f\u853d\uff08\u53c2\u89c1\u9632\u706b\u957f\u57ce\uff09\u3002\u901a\u8fc7\u955c\u50cf\u7ad9\u70b9\uff0c\u4e5f\u53ef\u4ee5\u5b9e\u73b0\u591a\u76ee\u6807\u591a\u7ebf\u7a0b\u7684\u5feb\u901f\u4e0b\u8f7d\u3002\u4f8b\u5982GetSmart2\u548cGetRight\u6b63\u662f\u4f7f\u7528\u4e86\u8fd9\u79cd\u9ad8\u7ea7\u7684\u591a\u7ebf\u7a0b\u4e0b\u8f7d\u6280\u672f\u3002\u00a0\u00a0\u955c\u50cf\u7ad9\u70b9\u955c\u50cf\u7ad9\u70b9\u662f\u901a\u8fc7\u4e3b\u670d\u52a1\u5668\u589e\u52a0\u8f6c\u79fb\u5b58\u50a8\u5730\u5740\u6765\u5b9e\u73b0\u4fe1\u606f\u7684\u5f02\u5730\u5907\u4efd\u3002\u901a\u5e38\u4e00\u4e2a\
u955c\u50cf\u4f1a\u5b9a\u671f\u8bbf\u95ee\u4e3b\u7f51\u7ad9\uff0c\u4ee5\u66f4\u65b0\u5176\u5185\u5bb9\u3002\u955c\u50cf\u4e5f\u6709\u5206\u4e00\u7ea7\u3001\u4e8c\u7ea7\u7b49\u7b49\u3002\u4e8c\u7ea7\u955c\u50cf\u662f\u6307\u90a3\u4e9b\u901a\u8fc7\u8bbf\u95ee\u4e00\u7ea7\u955c\u50cf\u7f51\u7ad9\u6765\u66f4\u65b0\u5185\u5bb9\u7684\u7f51\u7ad9\uff0c\u901a\u5e38\u66f4\u65b0\u901f\u5ea6\u4e0d\u5982\u4e00\u7ea7\u955c\u50cf\uff0c\u4f46\u4e0d\u4f1a\u7ed9\u4e3b\u7f51\u7ad9\u589e\u52a0\u8d1f\u62c5\u3002\u00a0\u00a0\u955c\u50cf\u7ad9\u70b9\u66f4\u65b0\u955c\u50cf\u6700\u5e38\u7528\u7684\u8f6f\u4ef6\u662frsync\uff0c\u6709\u65f6\u5019\u4e5f\u76f4\u63a5\u7528http\u955c\u50cf\u5de5\u5177\u3002"}


--------------------------------------------------------------------------------
/douban/movie_actors.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'huafeng'
  2 | 
  3 | import os
  4 | import re
  5 | import time
  6 | import codecs
  7 | import logging
  8 | import urllib2
  9 | import random
 10 | import gevent
 11 | import gevent.monkey
 12 | from math import ceil
 13 | from bs4 import BeautifulSoup
# Apply gevent's monkey patches before any crawling starts (see the gevent
# documentation for exactly which standard-library modules are patched).
gevent.monkey.patch_all()

# Directory containing this script; the 'sys', 'out' and 'log' paths used by
# MovieActor are built relative to it.
PATH = os.path.dirname(os.path.abspath(__file__))
# Pause, in seconds, between batches in MovieActor.main. NOTE: the value is
# drawn once at import time, so every pause within one run has the same length.
SLEEP_INTERVAL = random.randint(2,5)
 18 | 
 19 | class MovieActor:
 20 |     def __init__(self):
 21 |         self.proxy_list = []
 22 |         self._gen_proxy()
 23 |         self.actor_content_text_list = []
 24 |         self.timeout_url_list = []
 25 |     
 26 |     def _gen_proxy(self):
 27 |         filename = os.path.join(PATH, 'sys', 'xici_proxy')
 28 |         with codecs.open(filename, encoding='utf-8') as f:
 29 |             self.proxy_list.extend([item.strip() for item in f.readlines()])
 30 | 
 31 |     def parse_actor_content_url(self, url, ip_port):
 32 |         # http_proxy = 'http://%s'%ip_port
 33 |         # proxy_hanlder = urllib2.HTTPHandler({'http':http_proxy})
 34 |         # opener = urllib2.build_opener(proxy_hanlder)
 35 |         # urllib2.install_opener(opener)
 36 | 
 37 |         try:
 38 |             html = urllib2.urlopen(url, timeout=15).read()
 39 |         except:
 40 |             try:
 41 |                 html = urllib2.urlopen(url, timeout=15).read()
 42 |             except:
 43 |                 self.timeout_url_list.append(url)
 44 |                 return
 45 |         soup = BeautifulSoup(html)
 46 |         div_level_str = soup.find('div', id='content')
 47 |         if not div_level_str:
 48 |             return
 49 |         actor_summary = div_level_str.find('div', class_='bd') if not div_level_str.find('span', class_='all hidden') else div_level_str.find('span', class_='all hidden')
 50 |         actor_content_text = actor_summary.text.strip()
 51 |         if actor_content_text:
 52 |             self.actor_content_text_list.append(actor_content_text)
 53 | 
 54 |     def write_content_into_file(self):
 55 |         timestamp = time.strftime('%Y_%m_%d_%H%M%S.txt')
 56 |         filename = os.path.join(PATH, 'out', 'actor_con_out', timestamp)
 57 |         actor_content_text_list = ["".join((item, '\n')) for item in self.actor_content_text_list]
 58 |         with codecs.open(filename, mode='a', encoding='utf-8') as wf:
 59 |             wf.writelines(actor_content_text_list)
 60 | 
 61 |     def write_timeout_url(self):
 62 |         filename = os.path.join(PATH, 'log', 'actor_info_timeout_url')
 63 |         timeout_url_list = [item+'\n' for item in self.timeout_url_list]
 64 |         with codecs.open(filename, mode='a', encoding='utf-8') as f:
 65 |             f.writelines(timeout_url_list)
 66 | 
 67 |     def main(self):
 68 |         proxy_count = len(self.proxy_list)
 69 |         thread_count = 60
 70 |         range_from = 1000000
 71 |         range_to = 1339959
 72 |         url_range_start = range_from/thread_count
 73 |         url_range_stop = range_to/thread_count
 74 | 
 75 |         threads_per_proxy = int(ceil(thread_count/float(proxy_count)))
 76 |         threads = []
 77 |         url_pattern = 'http://movie.douban.com/celebrity/%s/'
 78 |         for post_time_count in range(url_range_start, url_range_stop):
 79 |             for url_point in range(post_time_count*thread_count, (post_time_count+1)*thread_count):
 80 |                 url = url_pattern%str(url_point)
 81 |                 proxy_point = (url_point - post_time_count*thread_count)/threads_per_proxy
 82 |                 ip_port = self.proxy_list[proxy_point]
 83 |                 threads.append(gevent.spawn(self.parse_actor_content_url, url, ip_port))
 84 |             gevent.joinall(threads)
 85 |             self.write_content_into_file()
 86 |             self.actor_content_text_list[:] = []
 87 |             if self.timeout_url_list:
 88 |                 self.write_timeout_url()
 89 |                 self.timeout_url_list[:] = []
 90 |             time.sleep(SLEEP_INTERVAL)
 91 |         left_threads = []
 92 |         url_left_count = range_to % thread_count
 93 |         for url_point in range(url_left_count):
 94 |             url = url_pattern%str(range_to-url_point)
 95 |             proxy_point = url_point/threads_per_proxy
 96 |             ip_port = self.proxy_list[proxy_point]
 97 |             left_threads.append(gevent.spawn(self.parse_actor_content_url, url, ip_port))
 98 |         gevent.joinall(left_threads)
 99 |         self.write_content_into_file()
100 |         self.actor_content_text_list[:] = []
if __name__ == '__main__':
    # Script entry point: build the crawler and run the full crawl.
    MovieActor().main()


--------------------------------------------------------------------------------
/dangdang/mobile_digital/mobile/sys/mobile_item_id:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/1049524121.html
 2 | http://product.dangdang.com/1091210612.html
 3 | http://product.dangdang.com/1214373005.html
 4 | http://product.dangdang.com/1055559322.html
 5 | http://product.dangdang.com/1055308007.html
 6 | http://product.dangdang.com/1004961108.html
 7 | http://product.dangdang.com/1270509408.html
 8 | http://product.dangdang.com/1070416207.html
 9 | http://product.dangdang.com/1011294508.html
10 | http://product.dangdang.com/1099246612.html
11 | http://product.dangdang.com/1222979005.html
12 | http://product.dangdang.com/1005093405.html
13 | http://product.dangdang.com/1006944708.html
14 | http://product.dangdang.com/400724832.html
15 | http://product.dangdang.com/1118130212.html
16 | http://product.dangdang.com/60321917.html
17 | http://product.dangdang.com/1379982106.html
18 | http://product.dangdang.com/400939011.html
19 | http://product.dangdang.com/400630754.html
20 | http://product.dangdang.com/1280568908.html
21 | http://product.dangdang.com/1146304608.html
22 | http://product.dangdang.com/1095921111.html
23 | http://product.dangdang.com/1055569222.html
24 | http://product.dangdang.com/1395168506.html
25 | http://product.dangdang.com/1009467807.html
26 | http://product.dangdang.com/1022694312.html
27 | http://product.dangdang.com/1115351422.html
28 | http://product.dangdang.com/1436105208.html
29 | http://product.dangdang.com/1023120306.html
30 | http://product.dangdang.com/1270633008.html
31 | http://product.dangdang.com/1121937507.html
32 | http://product.dangdang.com/60320918.html
33 | http://product.dangdang.com/1299690605.html
34 | http://product.dangdang.com/1034395805.html
35 | http://product.dangdang.com/1224226022.html
36 | http://product.dangdang.com/60556058.html
37 | http://product.dangdang.com/1057377505.html
38 | http://product.dangdang.com/400635387.html
39 | http://product.dangdang.com/1036615308.html
40 | http://product.dangdang.com/60557518.html
41 | http://product.dangdang.com/1005324107.html
42 | http://product.dangdang.com/1019822907.html
43 | http://product.dangdang.com/1264778205.html
44 | http://product.dangdang.com/1135040012.html
45 | http://product.dangdang.com/60548424.html
46 | http://product.dangdang.com/400639260.html
47 | http://product.dangdang.com/1226466722.html
48 | http://product.dangdang.com/1047734008.html
49 | http://product.dangdang.com/1143364612.html
50 | http://product.dangdang.com/1023101006.html
51 | http://product.dangdang.com/1141474912.html
52 | http://product.dangdang.com/1102204721.html
53 | http://product.dangdang.com/1124284321.html
54 | http://product.dangdang.com/1444889401.html
55 | http://product.dangdang.com/1224105622.html
56 | http://product.dangdang.com/1216049505.html
57 | http://product.dangdang.com/1224198302.html
58 | http://product.dangdang.com/1229077905.html
59 | http://product.dangdang.com/1003481108.html
60 | http://product.dangdang.com/1227067722.html
61 | http://product.dangdang.com/1227051822.html
62 | http://product.dangdang.com/60327476.html
63 | http://product.dangdang.com/1009859702.html
64 | http://product.dangdang.com/1014556112.html
65 | http://product.dangdang.com/1270563208.html
66 | http://product.dangdang.com/1010554222.html
67 | http://product.dangdang.com/1073193122.html
68 | http://product.dangdang.com/1474689501.html
69 | http://product.dangdang.com/1227084022.html
70 | http://product.dangdang.com/1020516307.html
71 | http://product.dangdang.com/60556072.html
72 | http://product.dangdang.com/1008639908.html
73 | http://product.dangdang.com/1244660206.html
74 | http://product.dangdang.com/1039487908.html
75 | http://product.dangdang.com/1084207222.html
76 | http://product.dangdang.com/1167984407.html
77 | http://product.dangdang.com/1134409012.html
78 | http://product.dangdang.com/1044489722.html
79 | http://product.dangdang.com/1211221011.html
80 | http://product.dangdang.com/1065724312.html
81 | http://product.dangdang.com/1140880905.html
82 | http://product.dangdang.com/1024874906.html
83 | http://product.dangdang.com/1024955006.html
84 | http://product.dangdang.com/1089685721.html
85 | http://product.dangdang.com/1025820307.html
86 | http://product.dangdang.com/1009458508.html
87 | http://product.dangdang.com/1009074807.html
88 | http://product.dangdang.com/1101452422.html
89 | http://product.dangdang.com/1384422106.html
90 | http://product.dangdang.com/1187447108.html
91 | http://product.dangdang.com/1059197007.html
92 | http://product.dangdang.com/60320920.html
93 | http://product.dangdang.com/1238420705.html
94 | http://product.dangdang.com/1185192311.html
95 | http://product.dangdang.com/1212772205.html
96 | http://product.dangdang.com/1208068222.html
97 | 


--------------------------------------------------------------------------------
/dangdang/appliance/sys/appliance_item_id:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/1280649708.html
 2 | http://product.dangdang.com/1105676205.html
 3 | http://product.dangdang.com/1203684808.html
 4 | http://product.dangdang.com/1446526808.html
 5 | http://product.dangdang.com/1254945508.html
 6 | http://product.dangdang.com/1226355406.html
 7 | http://product.dangdang.com/1203655708.html
 8 | http://product.dangdang.com/1211710508.html
 9 | http://product.dangdang.com/1214771306.html
10 | http://product.dangdang.com/1203673808.html
11 | http://product.dangdang.com/1274290606.html
12 | http://product.dangdang.com/1261458801.html
13 | http://product.dangdang.com/1214794906.html
14 | http://product.dangdang.com/1215095806.html
15 | http://product.dangdang.com/1317165807.html
16 | http://product.dangdang.com/1211700908.html
17 | http://product.dangdang.com/1286296907.html
18 | http://product.dangdang.com/1203325207.html
19 | http://product.dangdang.com/1058890906.html
20 | http://product.dangdang.com/1292496705.html
21 | http://product.dangdang.com/1197298806.html
22 | http://product.dangdang.com/1198256106.html
23 | http://product.dangdang.com/1203295907.html
24 | http://product.dangdang.com/1301703507.html
25 | http://product.dangdang.com/1120774705.html
26 | http://product.dangdang.com/1221631706.html
27 | http://product.dangdang.com/400966486.html
28 | http://product.dangdang.com/1126679905.html
29 | http://product.dangdang.com/1211674908.html
30 | http://product.dangdang.com/1211712308.html
31 | http://product.dangdang.com/1215095606.html
32 | http://product.dangdang.com/1027167006.html
33 | http://product.dangdang.com/1283333507.html
34 | http://product.dangdang.com/1032008522.html
35 | http://product.dangdang.com/1283283707.html
36 | http://product.dangdang.com/400967439.html
37 | http://product.dangdang.com/1216709307.html
38 | http://product.dangdang.com/1058922906.html
39 | http://product.dangdang.com/1274303306.html
40 | http://product.dangdang.com/1414981308.html
41 | http://product.dangdang.com/400966577.html
42 | http://product.dangdang.com/1058912006.html
43 | http://product.dangdang.com/1283283607.html
44 | http://product.dangdang.com/400921188.html
45 | http://product.dangdang.com/1027136606.html
46 | http://product.dangdang.com/1283309707.html
47 | http://product.dangdang.com/1064502405.html
48 | http://product.dangdang.com/1428125408.html
49 | http://product.dangdang.com/1203734508.html
50 | http://product.dangdang.com/1175929607.html
51 | http://product.dangdang.com/1282144307.html
52 | http://product.dangdang.com/1286273207.html
53 | http://product.dangdang.com/1287529107.html
54 | http://product.dangdang.com/1203702708.html
55 | http://product.dangdang.com/1203669408.html
56 | http://product.dangdang.com/1233154808.html
57 | http://product.dangdang.com/1203692108.html
58 | http://product.dangdang.com/1211666708.html
59 | http://product.dangdang.com/1211662008.html
60 | http://product.dangdang.com/1105958005.html
61 | http://product.dangdang.com/1292501105.html
62 | http://product.dangdang.com/1332343708.html
63 | http://product.dangdang.com/1215814106.html
64 | http://product.dangdang.com/1272541506.html
65 | http://product.dangdang.com/1115126205.html
66 | http://product.dangdang.com/1475195801.html
67 | http://product.dangdang.com/1261422601.html
68 | http://product.dangdang.com/1272537406.html
69 | http://product.dangdang.com/1163241806.html
70 | http://product.dangdang.com/1215813606.html
71 | http://product.dangdang.com/1042311408.html
72 | http://product.dangdang.com/1047781108.html
73 | http://product.dangdang.com/1283284307.html
74 | http://product.dangdang.com/1172570305.html
75 | http://product.dangdang.com/400963930.html
76 | http://product.dangdang.com/400966534.html
77 | http://product.dangdang.com/1105739305.html
78 | http://product.dangdang.com/1274315306.html
79 | http://product.dangdang.com/1180661806.html
80 | http://product.dangdang.com/1226359806.html
81 | http://product.dangdang.com/1064498305.html
82 | http://product.dangdang.com/1226332006.html
83 | http://product.dangdang.com/1254952608.html
84 | http://product.dangdang.com/1047362508.html
85 | http://product.dangdang.com/1198418106.html
86 | http://product.dangdang.com/1320565507.html
87 | http://product.dangdang.com/1230938505.html
88 | http://product.dangdang.com/1172568905.html
89 | http://product.dangdang.com/1317176607.html
90 | http://product.dangdang.com/1283288207.html
91 | http://product.dangdang.com/400921475.html
92 | http://product.dangdang.com/1047787708.html
93 | http://product.dangdang.com/400921293.html
94 | http://product.dangdang.com/1274483206.html
95 | http://product.dangdang.com/400921326.html
96 | http://product.dangdang.com/1203732107.html
97 | 


--------------------------------------------------------------------------------
/dangdang/decoration_health/health/sys/health_item_id:
--------------------------------------------------------------------------------
 1 | http://product.dangdang.com/1300766708.html
 2 | http://product.dangdang.com/1262439205.html
 3 | http://product.dangdang.com/1300811708.html
 4 | http://product.dangdang.com/1217809806.html
 5 | http://product.dangdang.com/1164369811.html
 6 | http://product.dangdang.com/1300774508.html
 7 | http://product.dangdang.com/1300729908.html
 8 | http://product.dangdang.com/1043063508.html
 9 | http://product.dangdang.com/1134177621.html
10 | http://product.dangdang.com/1471728301.html
11 | http://product.dangdang.com/1147670301.html
12 | http://product.dangdang.com/1043091008.html
13 | http://product.dangdang.com/1035360906.html
14 | http://product.dangdang.com/1131230621.html
15 | http://product.dangdang.com/1088187406.html
16 | http://product.dangdang.com/1332967606.html
17 | http://product.dangdang.com/1043154012.html
18 | http://product.dangdang.com/1165649811.html
19 | http://product.dangdang.com/1164370111.html
20 | http://product.dangdang.com/1300784108.html
21 | http://product.dangdang.com/1066033806.html
22 | http://product.dangdang.com/1389907102.html
23 | http://product.dangdang.com/1241614202.html
24 | http://product.dangdang.com/1132865612.html
25 | http://product.dangdang.com/1049550412.html
26 | http://product.dangdang.com/1066034506.html
27 | http://product.dangdang.com/1070724512.html
28 | http://product.dangdang.com/1262439005.html
29 | http://product.dangdang.com/1122839921.html
30 | http://product.dangdang.com/1053568112.html
31 | http://product.dangdang.com/1283924001.html
32 | http://product.dangdang.com/1090847112.html
33 | http://product.dangdang.com/1300798308.html
34 | http://product.dangdang.com/1066034406.html
35 | http://product.dangdang.com/1148181605.html
36 | http://product.dangdang.com/1132865412.html
37 | http://product.dangdang.com/1164371811.html
38 | http://product.dangdang.com/1046218812.html
39 | http://product.dangdang.com/1053104112.html
40 | http://product.dangdang.com/1016472512.html
41 | http://product.dangdang.com/1300816108.html
42 | http://product.dangdang.com/1300797008.html
43 | http://product.dangdang.com/1262439105.html
44 | http://product.dangdang.com/1245770008.html
45 | http://product.dangdang.com/1168229201.html
46 | http://product.dangdang.com/1061937522.html
47 | http://product.dangdang.com/1283974301.html
48 | http://product.dangdang.com/1142358807.html
49 | http://product.dangdang.com/1153334406.html
50 | http://product.dangdang.com/1051734212.html
51 | http://product.dangdang.com/1010583612.html
52 | http://product.dangdang.com/1164418711.html
53 | http://product.dangdang.com/1240227008.html
54 | http://product.dangdang.com/1240131308.html
55 | http://product.dangdang.com/1296490308.html
56 | http://product.dangdang.com/1296432008.html
57 | http://product.dangdang.com/1011428512.html
58 | http://product.dangdang.com/1066407808.html
59 | http://product.dangdang.com/1334221301.html
60 | http://product.dangdang.com/1363996301.html
61 | http://product.dangdang.com/1130211521.html
62 | http://product.dangdang.com/1025738621.html
63 | http://product.dangdang.com/1282620501.html
64 | http://product.dangdang.com/1091273121.html
65 | http://product.dangdang.com/1039520021.html
66 | http://product.dangdang.com/1240176408.html
67 | http://product.dangdang.com/1299184101.html
68 | http://product.dangdang.com/1299180601.html
69 | http://product.dangdang.com/1299163201.html
70 | http://product.dangdang.com/1283958201.html
71 | http://product.dangdang.com/1283940001.html
72 | http://product.dangdang.com/1283936201.html
73 | http://product.dangdang.com/1122840421.html
74 | http://product.dangdang.com/1100714521.html
75 | http://product.dangdang.com/1100714421.html
76 | http://product.dangdang.com/1094173521.html
77 | http://product.dangdang.com/1090059321.html
78 | http://product.dangdang.com/1039473621.html
79 | http://product.dangdang.com/1039459821.html
80 | http://product.dangdang.com/1240301308.html
81 | http://product.dangdang.com/1240279408.html
82 | http://product.dangdang.com/1240239108.html
83 | http://product.dangdang.com/1240210408.html
84 | http://product.dangdang.com/1039473721.html
85 | http://product.dangdang.com/1389951602.html
86 | http://product.dangdang.com/1090599812.html
87 | http://product.dangdang.com/1241580302.html
88 | http://product.dangdang.com/1240087308.html
89 | http://product.dangdang.com/1016472612.html
90 | http://product.dangdang.com/1296476408.html
91 | http://product.dangdang.com/1051661812.html
92 | http://product.dangdang.com/1381633102.html
93 | http://product.dangdang.com/1059486712.html
94 | http://product.dangdang.com/1012202912.html
95 | http://product.dangdang.com/1053065408.html
96 | http://product.dangdang.com/1053056008.html
97 | 


--------------------------------------------------------------------------------
/jd/book_jd/book_jd.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import os
 4 | import re
 5 | import time
 6 | import codecs
 7 | import urllib2
 8 | import xici_proxy
 9 | from bs4 import BeautifulSoup
10 | 
# Directory containing this script; the 'sys' and 'log' file paths used below
# are built relative to it.
PATH = os.path.dirname(os.path.abspath(__file__))
12 | 
def read_proxy_file():
    """Return the proxy entries stored in ``sys/xici_proxy``.

    Each non-blank line of the file becomes one entry, stripped of surrounding
    whitespace. Blank lines are skipped so that callers never end up with an
    empty proxy string (which would be turned into the proxy URL ``http://``).

    :return: list of proxy strings, possibly empty.
    """
    filename = os.path.join(PATH, 'sys', 'xici_proxy')
    with codecs.open(filename, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]
def _next_proxy(proxy_list, log_wf):
    """Pop the next proxy, refilling the list first when it has run dry.

    Refill order: re-validate the proxies already on disk, and only if none of
    them is usable ask ``xici_proxy`` to generate a fresh proxy file.

    :param proxy_list: remaining proxy strings (may be empty).
    :param log_wf: writable log file; a note is written when new proxies are
        generated.
    :return: ``(ip_port, proxy_list)`` - the chosen proxy and the list to keep
        using afterwards.
    :raises IndexError: when no proxy is available even after refilling.
    """
    if not proxy_list:
        proxy_list = xici_proxy.get_valid_proxy(read_proxy_file())
        if not proxy_list:
            xici_proxy.gen_proxy()
            log_wf.write('get new proxy in xici network!\n')
            proxy_list = read_proxy_file()
    return proxy_list.pop(), proxy_list


def _fetch_page(page_url, ip_port):
    """Download ``page_url`` through the HTTP proxy ``ip_port``.

    :return: the page decoded as GBK; if the bytes are not valid GBK the raw
        bytes are returned so the HTML parser can detect the encoding itself.
    :raises urllib2.HTTPError: for HTTP error statuses.
    """
    proxy_handler = urllib2.ProxyHandler({'http': 'http://%s' % ip_port})
    opener = urllib2.build_opener(proxy_handler)
    response = opener.open(page_url, timeout=15)
    try:
        # Read inside the caller's try block so a stalled download is logged
        # as a timeout instead of surfacing later inside the parser.
        raw = response.read()
    finally:
        response.close()
    try:
        return raw.decode('gbk')
    except UnicodeDecodeError:
        return raw


def gen_whole_item_id():
    """Re-crawl previously failed listing pages and collect their item ids.

    Reads ``log/0518_timeout_page_url``, keeps the lines that start with
    ``no_`` (pages where the expected markup was missing), fetches each page
    through a proxy and appends every item's ``sku`` to ``sys/book_item_ids``.
    Pages that were handled are written to ``log/<mmdd>_crawled_page_url``;
    failures go to ``log/<mmdd>_timeout_page_url``.

    The proxy is replaced after 2000 requests and immediately after an
    HTTP 403 response.
    """
    rotate_after = 2000  # requests served by one proxy before switching
    timeout_timestamp = time.strftime('%m%d_timeout_page_url')
    crawled_timestamp = time.strftime('%m%d_crawled_page_url')

    proxy_list = read_proxy_file()
    if not proxy_list:
        xici_proxy.gen_proxy()
        proxy_list = read_proxy_file()
    ip_port = proxy_list.pop()
    requests_on_proxy = 0

    handle_no_div_pattern = re.compile('no_(item|plist)_div:')
    whole_page_url_filename = os.path.join(PATH, 'log', '0518_timeout_page_url')
    timeout_page_url_filename = os.path.join(PATH, 'log', timeout_timestamp)
    item_id_filename = os.path.join(PATH, 'sys', 'book_item_ids')
    page_url_crawled_filename = os.path.join(PATH, 'log', crawled_timestamp)

    # Load the work list completely BEFORE opening the output files: when the
    # script runs on 05-18 the timeout log has the same name as the input and
    # opening it for writing truncates it.
    with codecs.open(whole_page_url_filename, encoding='utf-8') as whole_page_url_f:
        page_urls = [handle_no_div_pattern.sub('', line.strip())
                     for line in whole_page_url_f if line.startswith('no_')]

    with codecs.open(item_id_filename, mode='a', encoding='utf-8') as item_id_wf,\
    codecs.open(timeout_page_url_filename, mode='wb', encoding='utf-8') as timeout_url_wf,\
    codecs.open(page_url_crawled_filename, mode='wb', encoding='utf-8') as crawled_url_wf:
        for page_url in page_urls:
            requests_on_proxy += 1
            try:
                if requests_on_proxy > rotate_after:
                    ip_port, proxy_list = _next_proxy(proxy_list, timeout_url_wf)
                    requests_on_proxy = 0
                try:
                    html = _fetch_page(page_url, ip_port)
                except urllib2.HTTPError as e:
                    if e.getcode() != 403:
                        # Record the failure instead of dropping the URL silently.
                        timeout_url_wf.write('http_error_%d:%s\n' % (e.getcode(), page_url))
                        continue
                    timeout_url_wf.write('403 error:request forbiddon!!!\n')
                    # The current proxy is blocked: always switch before retrying.
                    ip_port, proxy_list = _next_proxy(proxy_list, timeout_url_wf)
                    requests_on_proxy = 0
                    html = _fetch_page(page_url, ip_port)
            except Exception:
                # Covers timeouts, connection errors, a failed 403 retry and
                # running out of proxies; the crawl moves on to the next URL.
                timeout_info = ''.join(('request_timeout:', page_url, '\n'))
                timeout_url_wf.write(timeout_info)
                continue

            soup = BeautifulSoup(html)
            div_level_str = soup.find('div', id='plist')
            if not div_level_str:
                error_match_info = ''.join(('no_plist_div:', page_url, '\n'))
                timeout_url_wf.write(error_match_info)
                continue
            div_item_list = div_level_str.find_all('div', class_='item')
            if not div_item_list:
                error_match_info = ''.join(('no_item_div:', page_url, '\n'))
                timeout_url_wf.write(error_match_info)
                continue
            # Items without a 'sku' attribute are skipped rather than aborting
            # the whole run with a KeyError.
            item_id_list = [item['sku'] + '\n' for item in div_item_list if item.get('sku')]
            item_id_wf.writelines(item_id_list)
            crawled_url_wf.write(page_url + '\n')
            # time.sleep(3)
88 | gen_whole_item_id()
89 | 


--------------------------------------------------------------------------------
/douban/movie_actors_single_thread.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'huafeng'
  2 | 
  3 | import os
  4 | import re
  5 | import time
  6 | import codecs
  7 | import logging
  8 | import urllib2
  9 | import random
 10 | from math import ceil
 11 | from bs4 import BeautifulSoup
 12 | 
# Directory containing this script; the 'sys', 'out' and 'log' paths used by
# MovieActor are built relative to it.
PATH = os.path.dirname(os.path.abspath(__file__))
# Pause, in seconds, after each request in MovieActor.main. NOTE: the value is
# drawn once at import time, so every pause within one run has the same length.
SLEEP_INTERVAL = random.randint(2, 5)
 15 | 
 16 | class MovieActor:
 17 |     def __init__(self):
 18 |         self.proxy_list = []
 19 |         self._gen_proxy()
 20 |         self.actor_content_text_list = []
 21 |         self.timeout_url_list = []
 22 | 
 23 |     def _gen_log(self):
 24 |         logfile = os.path.join(PATH, 'log', 'douban_actor_cralwer.log')
 25 |         self.logger = logging.getLogger(__name__)
 26 |         self.logger.setLevel(logging.DEBUG)
 27 |         log_file = logging.FileHandler(logfile)
 28 |         log_file.setLevel(logging.DEBUG)
 29 |         formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 30 |         log_file.setFormatter(formatter)
 31 |         self.logger.addHandler(log_file)
 32 | 
 33 |     def _gen_proxy(self):
 34 |         filename = os.path.join(PATH, 'sys', 'xici_proxy')
 35 |         with codecs.open(filename, encoding='utf-8') as f:
 36 |             self.proxy_list.extend([item.strip() for item in f.readlines()])
 37 | 
 38 |     def parse_actor_content_url(self, url):
 39 |         try:
 40 |             html = urllib2.urlopen(url, timeout=15).read()
 41 |         except:
 42 |             try:
 43 |                 html = urllib2.urlopen(url, timeout=15).read()
 44 |             except:
 45 |                 self.timeout_url_list.append(url)
 46 |                 self.logger.debug('request timeout in item_url:%s'%url)
 47 |                 return
 48 |         soup = BeautifulSoup(html)
 49 |         div_level_str = soup.find('div', id='content')
 50 |         if not div_level_str:
 51 |             self.logger.error('div_level do not match regular expression in url:%s'%url)
 52 |             return
 53 |         actor_summary = div_level_str.find('div', class_='bd') if not div_level_str.find('span', class_='all hidden') else div_level_str.find('span', class_='all hidden')
 54 |         if not actor_summary:
 55 |             self.logger.debug('actor_summary do not match re in item_url:%s'%url)
 56 |         actor_content_text = actor_summary.text.strip()
 57 |         if actor_content_text:
 58 |             self.actor_content_text_list.append(actor_content_text)
 59 | 
 60 |     def write_content_into_file(self):
 61 |         timestamp = time.strftime('%Y_%m_%d_%H%M%S.txt')
 62 |         filename = os.path.join(PATH, 'out', 'actor_con_out', timestamp)
 63 |         actor_content_text_list = ["".join((item, '\n')) for item in self.actor_content_text_list]
 64 |         with codecs.open(filename, mode='a', encoding='utf-8') as wf:
 65 |             wf.writelines(actor_content_text_list)
 66 | 
 67 |     def write_timeout_url(self):
 68 |         filename = os.path.join(PATH, 'log', 'actor_info_timeout_url')
 69 |         timeout_url_list = [item+'\n' for item in self.timeout_url_list]
 70 |         with codecs.open(filename, mode='a', encoding='utf-8') as f:
 71 |             f.writelines(timeout_url_list)
 72 | 
 73 |     def main(self):
 74 |         range_from = 1000000
 75 |         range_to = 1339959
 76 |         url_pattern = 'http://movie.douban.com/celebrity/%d/'
 77 |         url_list = [url_pattern%item for item in range(range_from, range_to+1)]
 78 |         # print len(url_list), url_list[-1]
 79 |         url_count = range_to - range_from
 80 |         con_count_write_in_file = 2000
 81 |         file_count = url_count/2000#338000
 82 |         for file_point in range(file_count):
 83 |             for url_point in range(file_point*con_count_write_in_file, (file_point+1)*con_count_write_in_file):
 84 |                 url = url_list[url_point]
 85 |                 self.parse_actor_content_url(url)
 86 |                 time.sleep(SLEEP_INTERVAL)
 87 |             self.write_content_into_file()
 88 |             self.actor_content_text_list[:] = []
 89 |             if self.timeout_url_list:
 90 |                 self.write_timeout_url()
 91 | 
 92 |         url_left = url_count - file_count*con_count_write_in_file#339959-338000
 93 |         for i in range(1,url_left+1):
 94 |             url = url_list[-i]
 95 |             self.parse_actor_content_url(url)
 96 |             time.sleep(SLEEP_INTERVAL)
 97 |         self.write_content_into_file()
 98 |         self.actor_content_text_list[:] = []
 99 |         if self.timeout_url_list:
100 |             self.write_timeout_url()
101 | 
102 | if __name__ == '__main__':
103 |     actor = MovieActor()
104 |     actor.main()


--------------------------------------------------------------------------------
/jd/electronic_jd/electronic_name.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'huafeng'
  2 | #coding:utf-8
  3 | import os
  4 | import re
  5 | import time
  6 | import random
  7 | import codecs
  8 | import urllib2
  9 | from bs4 import BeautifulSoup
 10 | 
 11 | PATH = os.path.dirname(os.path.abspath(__file__))  # absolute path of the directory holding this script
 12 | 
 13 | def read_item_id_to_get_title():
 14 |     url_pattern = 'http://item.jd.com/%s.html'
 15 |     elec_filename = os.path.join(PATH, 'out', 'elec_name')
 16 |     failed_url = os.path.join(PATH, 'log', 'elec_failed_url')
 17 |     count = 0
 18 |     with codecs.open('./sys/electronic_item_id', encoding='utf-8') as item_id_f,\
 19 |     codecs.open(failed_url, mode='wb', encoding='utf-8') as failed_url_wf,\
 20 |     codecs.open(elec_filename, mode='a', encoding='utf-8') as con_to_write_wf:
 21 |         for item_id in [item.strip() for item in item_id_f.readlines()]:
 22 |             count += 1
 23 |             item_url = url_pattern%item_id
 24 |             try:
 25 |                 html = urllib2.urlopen(item_url).read()
 26 |             except:
 27 |                 try:
 28 |                     html = urllib2.urlopen(item_url).read()
 29 |                 except:
 30 |                     try:
 31 |                         html = urllib2.urlopen(item_url).read()
 32 |                     except:
 33 |                         print 'timed out in url;%s'%item_url
 34 |                         failed_url_wf.write('timed out in url;%s\n'%item_url)
 35 |                         continue
 36 |             soup = BeautifulSoup(html)
 37 |             try:
 38 |                 div_level_str = soup.find('div', id='name')
 39 |                 elec_title = div_level_str.text.strip()
 40 |             except:
 41 |                 print 'div do not match pattern in url;%s'%item_url
 42 |                 failed_url_wf.write('div do not match pattern in url;%s\n'%item_url)
 43 |                 continue
 44 |             con_to_write_wf.write(elec_title+'\n')
 45 |             print count
 46 | 
 47 | def read_failed_item_url():
 48 |     failed_url_filename = os.path.join(PATH, 'log', 'elec_failed_url')
 49 |     content_to_write_filename = os.path.join(PATH, 'out', 'elec_name')
 50 |     with codecs.open(failed_url_filename, encoding='utf-8') as item_id_f,\
 51 |     codecs.open(content_to_write_filename, mode='a', encoding='utf-8') as content_to_write_af:
 52 |         count = 0
 53 |         for url in [item.split(';')[1].strip() for item in item_id_f.readlines()]:
 54 |             count += 1
 55 |             try:
 56 |                 html = urllib2.urlopen(url).read()
 57 |             except:
 58 |                 try:
 59 |                     html = urllib2.urlopen(url).read()
 60 |                 except:
 61 |                     print 'timed out in url:%s'%url
 62 |                     continue
 63 | 
 64 |             soup = BeautifulSoup(html)
 65 |             try:
 66 |                 div_level_str = soup.find('div', id='name')
 67 |                 elec_title = div_level_str.text.strip()
 68 |             except:
 69 |                 print 'div do not match pattern in url;%s'%url
 70 |                 continue
 71 |             content_to_write_af.write(elec_title+'\n')
 72 |             print count
 73 | # read_failed_item_url()
 74 | 
 75 | def extract_goods_name():
 76 |     electronic_name_str_filename = os.path.join(PATH, 'out', 'elec_name')
 77 |     content_for_write_filename =os.path.join(PATH, 'out', 'electronic_name.txt')
 78 |     pattern = re.compile(ur"([\u4E00-\u9FA5]+)", re.U)
 79 |     count = 0
 80 |     with codecs.open(electronic_name_str_filename, encoding='utf-8') as f,\
 81 |     codecs.open(content_for_write_filename, mode='a', encoding='utf-8') as electronic_name_af:
 82 |         for line in f.readlines():
 83 |             count += 1
 84 |             temp_list_for_write = []
 85 |             splited_lien = pattern.split(line)
 86 |             for param in splited_lien:
 87 |                 if len(param) <= 1:
 88 |                     continue
 89 |                 if pattern.match(param):
 90 |                     temp_list_for_write.append(param+'\n')
 91 |             print count
 92 |             electronic_name_af.writelines(temp_list_for_write)
 93 | 
 94 | def chose_len_between_1_8_param():
 95 |     content_for_write_filename =os.path.join(PATH, 'out', 'electronic_name.txt')
 96 |     with codecs.open(content_for_write_filename, encoding='utf-8') as f:
 97 |         electronic_name_list = f.readlines()
 98 |         print len(electronic_name_list)
 99 |         remove_length_one_param_list = [item for item in electronic_name_list if 1<len(item.strip())<=8]
100 |         print len(remove_length_one_param_list)
101 |     with codecs.open(content_for_write_filename, mode='wb', encoding='utf-8') as wf:
102 |         wf.writelines(remove_length_one_param_list)
103 |         print len(remove_length_one_param_list)


--------------------------------------------------------------------------------
/music/baidu_music.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'huafeng'
  2 | #coding:utf-8
  3 | import os
  4 | import re
  5 | import urllib2
  6 | import codecs
  7 | import time
  8 | import random
  9 | import logging
 10 | from bs4 import BeautifulSoup
 11 | 
 12 | PATH = os.path.dirname(os.path.abspath(__file__))  # absolute path of the directory holding this script
 13 | SLEEP_INTERVAL = random.randint(2, 5)  # seconds slept between requests; drawn once at import, so fixed for the whole run
 14 | class BaiduMusic:
 15 |     def __init__(self):
 16 |         self.valid_item_url_list = []
 17 |         self.music_content_list = []
 18 |         self.crawled_url_list = []
 19 |         self._gen_log()
 20 |         self._load_crawled_file()
 21 | 
 22 |     def _load_crawled_file(self):
 23 |         filename = os.path.join(PATH, 'sys', 'crawled_item_url')
 24 |         with codecs.open(filename, encoding='utf-8') as f:
 25 |             self.crawled_url_list.extend([item.strip() for item in f.readlines()])
 26 | 
 27 |     def _gen_log(self):
 28 |         logfile = os.path.join(PATH, 'log', 'baidu_music_cralwer.log')
 29 |         self.logger = logging.getLogger(__name__)
 30 |         self.logger.setLevel(logging.DEBUG)
 31 |         log_file = logging.FileHandler(logfile)
 32 |         log_file.setLevel(logging.DEBUG)
 33 |         formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 34 |         log_file.setFormatter(formatter)
 35 |         self.logger.addHandler(log_file)
 36 | 
 37 |     def gen_item_url(self):
 38 |         url_root = 'http://music.baidu.com/'
 39 |         url_compile_filter = re.compile(r'/song/[\d]+')
 40 | 
 41 |         page_url_list = ["http://music.baidu.com/top/dayhot","http://music.baidu.com/top/new"]
 42 |         for page_url in page_url_list:
 43 |             try:
 44 |                 html = urllib2.urlopen(page_url, timeout=15).read()
 45 |             except:
 46 |                 time.sleep(40)
 47 |                 try:
 48 |                     html = urllib2.urlopen(page_url, timeout=15).read()
 49 |                 except:
 50 |                     self.logger.debug('request timed item_id in page_url:%s'%url)
 51 |                     continue
 52 |             time.sleep(SLEEP_INTERVAL)
 53 |             soup = BeautifulSoup(html)
 54 |             span_level_list = soup.find_all('span', class_='song-title')
 55 |             if not span_level_list:
 56 |                 self.logger.error('span_level do not match regular expression in page_url:%s'%url)
 57 |             whole_url_list = [item.a['href'] for item in span_level_list]
 58 |             item_url_list = ["".join((url_root, url)) for url in whole_url_list if url_compile_filter.match(url)]
 59 |             self.valid_item_url_list.extend(item_url_list)
 60 | 
 61 |     def parse_item_url(self):
 62 |         valid_url_list = [url for url in self.valid_item_url_list if not url in self.crawled_url_list]
 63 |         for url in valid_url_list:
 64 |             try:
 65 |                 html = urllib2.urlopen(url, timeout=15).read()
 66 |             except:
 67 |                 time.sleep(20)
 68 |                 try:
 69 |                     html = urllib2.urlopen(url, timeout=15)
 70 |                 except:
 71 |                     self.logger.debug('request timed item_id in item_url:%s'%url)
 72 |                     continue
 73 |             self.crawled_url_list.append(url)
 74 |             soup = BeautifulSoup(html)
 75 |             div_level_str = soup.find('div', id='lyricCont')
 76 |             if not div_level_str:
 77 |                 self.logger.error('div_level_str do not match re in item_url:%s'%url)
 78 |                 continue
 79 |             # print div_level_str.text
 80 |             self.music_content_list.append(div_level_str.text)
 81 |             time.sleep(SLEEP_INTERVAL)
 82 | 
 83 |     def write_music_content_into_file(self):
 84 |         timestamp = time.strftime('%Y_%m_%d_%H%M%S_baidu_music.txt')
 85 |         filename = os.path.join(PATH, 'out', timestamp)
 86 |         with codecs.open(filename, mode='wb', encoding='utf-8') as wf:
 87 |             music_conten_list = ["".join((item, '\n')) for item in self.music_content_list]
 88 |             wf.writelines(music_conten_list)
 89 | 
 90 |     def re_write_crawled_url_file(self):
 91 |         filename = os.path.join(PATH, 'sys', 'crawled_item_url')
 92 |         crawled_url_list = ["".join((url, '\n')) for url in self.crawled_url_list]
 93 |         with codecs.open(filename, 'wb', encoding='utf-8') as wf:
 94 |             wf.writelines(crawled_url_list)
 95 | 
 96 |     def main(self):
 97 |         self.gen_item_url()
 98 |         self.parse_item_url()
 99 |         self.write_music_content_into_file()
100 |         self.music_content_list[:] = []
101 |         if len(self.crawled_url_list) > 1000:
102 |             del self.crawled_url_list[:200]
103 |         self.re_write_crawled_url_file()
104 | 
105 | if __name__ == "__main__":
106 |     musicer = BaiduMusic()
107 |     musicer.main()
108 | 
109 | 
110 | 
111 | 


--------------------------------------------------------------------------------
/douban/sys/douban_crawled_urls:
--------------------------------------------------------------------------------
 1 | http://movie.douban.com/subject/10543682/
 2 | http://movie.douban.com/subject/24736526/
 3 | http://movie.douban.com/subject/7003297/
 4 | http://movie.douban.com/subject/10487568/
 5 | http://movie.douban.com/subject/21776863/
 6 | http://movie.douban.com/subject/19997896/
 7 | http://movie.douban.com/subject/25853104/
 8 | http://movie.douban.com/subject/21352814/
 9 | http://movie.douban.com/subject/19962587/
10 | http://movie.douban.com/subject/20284939/
11 | http://movie.douban.com/subject/25828563/
12 | http://movie.douban.com/subject/25713540/
13 | http://movie.douban.com/subject/25841341/
14 | http://movie.douban.com/subject/6890751/
15 | http://movie.douban.com/subject/25851768/
16 | http://movie.douban.com/subject/10485647/
17 | http://movie.douban.com/subject/24695967/
18 | http://movie.douban.com/subject/24707368/
19 | http://movie.douban.com/subject/10726941/
20 | http://movie.douban.com/subject/6878457/
21 | http://movie.douban.com/subject/25844367/
22 | http://movie.douban.com/subject/25804446/
23 | http://movie.douban.com/subject/25820625/
24 | http://movie.douban.com/subject/25869685/
25 | http://movie.douban.com/subject/25844581/
26 | http://movie.douban.com/subject/25863020/
27 | http://movie.douban.com/subject/24873473/
28 | http://movie.douban.com/subject/4746257/
29 | http://movie.douban.com/subject/4922789/
30 | http://movie.douban.com/subject/5421797/
31 | http://movie.douban.com/subject/25798808/
32 | http://movie.douban.com/subject/25758654/
33 | http://movie.douban.com/subject/2063914/
34 | http://movie.douban.com/subject/25862407/
35 | http://movie.douban.com/subject/20451334/
36 | http://movie.douban.com/subject/6873819/
37 | http://movie.douban.com/subject/7054604/
38 | http://movie.douban.com/subject/24879858/
39 | http://movie.douban.com/subject/6529847/
40 | http://movie.douban.com/subject/25755645/
41 | http://movie.douban.com/subject/25792690/
42 | http://movie.douban.com/subject/24851526/
43 | http://movie.douban.com/subject/20513060/
44 | http://movie.douban.com/subject/25792684/
45 | http://movie.douban.com/subject/25845586/
46 | http://movie.douban.com/subject/24847340/
47 | http://movie.douban.com/subject/24859034/
48 | http://movie.douban.com/subject/20513061/
49 | http://movie.douban.com/subject/25777330/
50 | http://movie.douban.com/subject/11443316/
51 | http://movie.douban.com/subject/4881607/
52 | http://movie.douban.com/subject/10807916/
53 | http://movie.douban.com/subject/6721670/
54 | http://movie.douban.com/subject/11610281/
55 | http://movie.douban.com/subject/7564989/
56 | http://movie.douban.com/subject/10604893/
57 | http://movie.douban.com/subject/20270795/
58 | http://movie.douban.com/subject/24695277/
59 | http://movie.douban.com/subject/24743712/
60 | http://movie.douban.com/subject/25717233/
61 | http://movie.douban.com/subject/24404677/
62 | http://movie.douban.com/subject/23048775/
63 | http://movie.douban.com/subject/25778491/
64 | http://movie.douban.com/subject/10545939/
65 | http://movie.douban.com/subject/25300674/
66 | http://movie.douban.com/subject/25798222/
67 | http://movie.douban.com/subject/25778488/
68 | http://movie.douban.com/subject/25746414/
69 | http://movie.douban.com/subject/6126442/
70 | http://movie.douban.com/subject/3078390/
71 | http://movie.douban.com/subject/3993588/
72 | http://movie.douban.com/subject/24879839/
73 | http://movie.douban.com/subject/10807909/
74 | http://movie.douban.com/subject/24298770/
75 | http://movie.douban.com/subject/25713420/
76 | http://movie.douban.com/subject/25823833/
77 | http://movie.douban.com/subject/25827963/
78 | http://movie.douban.com/subject/6082518/?from=playing_poster
79 | http://movie.douban.com/subject/24743711/?from=playing_poster
80 | http://movie.douban.com/subject/24843198/?from=playing_poster
81 | http://movie.douban.com/subject/11443314/?from=playing_poster
82 | http://movie.douban.com/subject/6973460/?from=playing_poster
83 | http://movie.douban.com/subject/7057975/?from=playing_poster
84 | http://movie.douban.com/subject/10810745/?from=playing_poster
85 | http://movie.douban.com/subject/20515977/?from=playing_poster
86 | http://movie.douban.com/subject/24163542/?from=playing_poster
87 | http://movie.douban.com/subject/3927791/?from=playing_poster
88 | http://movie.douban.com/subject/25804697/?from=playing_poster
89 | http://movie.douban.com/subject/6390823/?from=playing_poster
90 | http://movie.douban.com/subject/3731581/?from=playing_poster
91 | http://movie.douban.com/subject/22300822/?from=playing_poster
92 | http://movie.douban.com/subject/4919019/?from=playing_poster
93 | http://movie.douban.com/subject/21941804/?from=playing_poster
94 | http://movie.douban.com/subject/1437313/?from=playing_poster
95 | http://movie.douban.com/subject/2129132/?from=playing_poster
96 | http://movie.douban.com/subject/3273700/?from=playing_poster
97 | http://movie.douban.com/subject/21327518/?from=playing_poster
98 | http://movie.douban.com/subject/1300689/?from=playing_poster
99 | 


--------------------------------------------------------------------------------
/sina/sina_news.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'huafeng'
 2 | #coding:utf-8
 3 | import re
 4 | import os
 5 | import time
 6 | import codecs
 7 | import requests
 8 | import datetime
 9 | from bs4 import BeautifulSoup
10 | 
11 | PATH = os.path.dirname(os.path.abspath(__file__))  # absolute path of the directory holding this script
12 | TIMESTAMP = time.strftime('%Y%m%d')  # local date at import time; names the url queue file and is compared with article dates
13 | # yesterday = (datetime.datetime.now().date()+datetime.timedelta(days=-1)).strftime('%Y_%m_%d')
14 | 
15 | def read_item_url_file():
16 |     whole_item_url_filename = os.path.join(PATH, 'sys', 'whole_item_url_%s'%TIMESTAMP)
17 |     timestamp_filename = time.strftime('%Y_%m_%d_%H%M00_sina_news')
18 |     data_directory = os.path.join(PATH, 'html', time.strftime('%Y_%m_%d'))
19 |     if not os.path.exists(data_directory):
20 |         os.system('mkdir %s'%data_directory)
21 |     output_filename = os.path.join(PATH, 'out', timestamp_filename)
22 |     failed_url_filename = os.path.join(PATH, 'log','sina_news_log')
23 |     with codecs.open(whole_item_url_filename, encoding='utf-8') as f,\
24 |     codecs.open(output_filename, mode='a', encoding='utf-8') as wf,\
25 |     codecs.open(failed_url_filename, mode='a', encoding='utf-8')as log_f:
26 |         url_list = [item.strip() for item in f.readlines()]
27 |         for url in url_list:
28 |             item_url_info_list = []
29 |             try:
30 |                 html = requests.get(url).text.encode('ISO-8859-1')
31 |                 splited_url = url.split('/')
32 |                 html_filename = '+'.join(splited_url[-3:])
33 |                 filename = os.path.join(PATH, data_directory, html_filename)
34 |                 with open(filename, mode='wb') as htmlwf:
35 |                     htmlwf.write(html)
36 |             except BaseException:
37 |                 log_f.write('timed out in item_url;%s\n'%url)
38 |                 continue
39 |             soup = BeautifulSoup(html, 'html5lib')
40 |             try:
41 |                 title = soup.find('h1', id='artibodyTitle').text.strip()
42 |                 item_url_info_list.append(title+'\n')
43 |                 div_lelvel_str = soup.find('div', id='artibody')
44 |                 p_level_list = div_lelvel_str.find_all('p')
45 |                 content_list = [item.text.strip()+'\n' for item in p_level_list]
46 |                 item_url_info_list.extend(content_list)
47 |             except BaseException:
48 |                 log_f.write('div do not match pattern in item_url;%s\n'%url)
49 |                 continue
50 |             if item_url_info_list:
51 |                 wf.writelines(item_url_info_list)
52 | # read_item_url_file()
53 | def write_item_url_into_file(item_url_list):
54 |     item_url_filename = os.path.join(PATH, 'sys', 'whole_item_url_%s'%TIMESTAMP)
55 |     with codecs.open(item_url_filename, mode='a', encoding='utf-8') as af:
56 |         temp_url_list_for_write = [item+'\n' for item in item_url_list]
57 |         af.writelines(temp_url_list_for_write)
58 | def get_realtime_news():
59 |     url_pattern = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=89&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=80&asc=&page=%s&r=0.30903104213777677'
60 |     start_page_num = 1
61 |     valid_timestamp_url_list = []
62 |     failed_url_filename = os.path.join(PATH, 'log','sina_news_log')
63 |     with codecs.open(failed_url_filename, mode='a', encoding='utf-8')as af:
64 |         for page_num in range(start_page_num, start_page_num+100):
65 |             url = url_pattern%page_num
66 |             try:
67 |                 res = requests.get(url)
68 |                 html = res.text.encode('ISO-8859-1')
69 |             except BaseException:
70 |                 try:
71 |                     res = requests.get(url)
72 |                     html = res.text.encode('ISO-8859-1')
73 |                 except BaseException:
74 |                     try:
75 |                         res = requests.get(url)
76 |                         html = res.text.encode('ISO-8859-1')
77 |                     except BaseException:
78 |                         af.write('timed out in page_url;%s\n'%url)
79 |                         continue
80 |             try:
81 |                 page_url_list = re.findall(r'url : "(http://.*\.shtml)"', html)
82 |                 url_list_timestamp = page_url_list[-1].split('/')[-2].replace('-', '')
83 |             except BaseException:
84 |                 af.write('div not pattern in page_url;%s\n'%url)
85 |                 continue
86 |             if url_list_timestamp != TIMESTAMP:
87 |                 for url in page_url_list:
88 |                     url_list_timestamp = url.split('/')[-2].replace('-','')
89 |                     if url_list_timestamp ==  TIMESTAMP:
90 |                         valid_timestamp_url_list.append(url)
91 |                 write_item_url_into_file(valid_timestamp_url_list)
92 |                 read_item_url_file()
93 |                 break
94 |             else:
95 |                 write_item_url_into_file(page_url_list)
96 | 
97 | if __name__ == '__main__':  # run one crawl of today's news when executed as a script
98 |     get_realtime_news()
99 | 


--------------------------------------------------------------------------------
/dangdang/appliance/sys/whole_page_url:
--------------------------------------------------------------------------------
 1 | http://category.dangdang.com/cid4009643-pg1.html
 2 | http://category.dangdang.com/cid4009643-pg2.html
 3 | http://category.dangdang.com/cid4009643-pg3.html
 4 | http://category.dangdang.com/cid4009643-pg4.html
 5 | http://category.dangdang.com/cid4009643-pg5.html
 6 | http://category.dangdang.com/cid4009643-pg6.html
 7 | http://category.dangdang.com/cid4009643-pg7.html
 8 | http://category.dangdang.com/cid4009643-pg8.html
 9 | http://category.dangdang.com/cid4009643-pg9.html
10 | http://category.dangdang.com/cid4009643-pg10.html
11 | http://category.dangdang.com/cid4009642-pg1.html
12 | http://category.dangdang.com/cid4009642-pg2.html
13 | http://category.dangdang.com/cid4009642-pg3.html
14 | http://category.dangdang.com/cid4009642-pg4.html
15 | http://category.dangdang.com/cid4009642-pg5.html
16 | http://category.dangdang.com/cid4009642-pg6.html
17 | http://category.dangdang.com/cid4009642-pg7.html
18 | http://category.dangdang.com/cid4009642-pg8.html
19 | http://category.dangdang.com/cid4009642-pg9.html
20 | http://category.dangdang.com/cid4009642-pg10.html
21 | http://category.dangdang.com/cid4009642-pg11.html
22 | http://category.dangdang.com/cid4009635-pg1.html
23 | http://category.dangdang.com/cid4009635-pg2.html
24 | http://category.dangdang.com/cid4009635-pg3.html
25 | http://category.dangdang.com/cid4009635-pg4.html
26 | http://category.dangdang.com/cid4009635-pg5.html
27 | http://category.dangdang.com/cid4009635-pg6.html
28 | http://category.dangdang.com/cid4009635-pg7.html
29 | http://category.dangdang.com/cid4009635-pg8.html
30 | http://category.dangdang.com/cid4009635-pg9.html
31 | http://category.dangdang.com/cid4009635-pg10.html
32 | http://category.dangdang.com/cid4009635-pg11.html
33 | http://category.dangdang.com/cid4009635-pg12.html
34 | http://category.dangdang.com/cid4009635-pg13.html
35 | http://category.dangdang.com/cid4009635-pg14.html
36 | http://category.dangdang.com/cid4009635-pg15.html
37 | http://category.dangdang.com/cid4009635-pg16.html
38 | http://category.dangdang.com/cid4009635-pg17.html
39 | http://category.dangdang.com/cid4009636-pg1.html
40 | http://category.dangdang.com/cid4009636-pg2.html
41 | http://category.dangdang.com/cid4009636-pg3.html
42 | http://category.dangdang.com/cid4009636-pg4.html
43 | http://category.dangdang.com/cid4009636-pg5.html
44 | http://category.dangdang.com/cid4009636-pg6.html
45 | http://category.dangdang.com/cid4009636-pg7.html
46 | http://category.dangdang.com/cid4009636-pg8.html
47 | http://category.dangdang.com/cid4009636-pg9.html
48 | http://category.dangdang.com/cid4009636-pg10.html
49 | http://category.dangdang.com/cid4009636-pg11.html
50 | http://category.dangdang.com/cid4009636-pg12.html
51 | http://category.dangdang.com/cid4009636-pg13.html
52 | http://category.dangdang.com/cid4009636-pg14.html
53 | http://category.dangdang.com/cid4009637-pg1.html
54 | http://category.dangdang.com/cid4009637-pg2.html
55 | http://category.dangdang.com/cid4009637-pg3.html
56 | http://category.dangdang.com/cid4009637-pg4.html
57 | http://category.dangdang.com/cid4009637-pg5.html
58 | http://category.dangdang.com/cid4009637-pg6.html
59 | http://category.dangdang.com/cid4009637-pg7.html
60 | http://category.dangdang.com/cid4009637-pg8.html
61 | http://category.dangdang.com/cid4009637-pg9.html
62 | http://category.dangdang.com/cid4009638-pg1.html
63 | http://category.dangdang.com/cid4009638-pg2.html
64 | http://category.dangdang.com/cid4009638-pg3.html
65 | http://category.dangdang.com/cid4009638-pg4.html
66 | http://category.dangdang.com/cid4009638-pg5.html
67 | http://category.dangdang.com/cid4009638-pg6.html
68 | http://category.dangdang.com/cid4009638-pg7.html
69 | http://category.dangdang.com/cid4009638-pg8.html
70 | http://category.dangdang.com/cid4003236-pg1.html
71 | http://category.dangdang.com/cid4003236-pg2.html
72 | http://category.dangdang.com/cid4003236-pg3.html
73 | http://category.dangdang.com/cid4003236-pg4.html
74 | http://category.dangdang.com/cid4009645-pg1.html
75 | http://category.dangdang.com/cid4009645-pg2.html
76 | http://category.dangdang.com/cid4009645-pg3.html
77 | http://category.dangdang.com/cid4009645-pg4.html
78 | http://category.dangdang.com/cid4009645-pg5.html
79 | http://category.dangdang.com/cid4009645-pg6.html
80 | http://category.dangdang.com/cid4009640-pg1.html
81 | http://category.dangdang.com/cid4009640-pg2.html
82 | http://category.dangdang.com/cid4009640-pg3.html
83 | http://category.dangdang.com/cid4009639-pg1.html
84 | http://category.dangdang.com/cid4009639-pg2.html
85 | http://category.dangdang.com/cid4009639-pg3.html
86 | http://category.dangdang.com/cid4009647-pg1.html
87 | http://category.dangdang.com/cid4009647-pg2.html
88 | http://category.dangdang.com/cid4009647-pg3.html
89 | http://category.dangdang.com/cid4009641-pg1.html
90 | http://category.dangdang.com/cid4009641-pg2.html
91 | http://category.dangdang.com/cid4009646-pg1.html
92 | http://category.dangdang.com/cid4009646-pg2.html
93 | http://category.dangdang.com/cid4009646-pg3.html
94 | http://category.dangdang.com/cid4010054-pg1.html
95 | 


--------------------------------------------------------------------------------
/baidu/out/52:
--------------------------------------------------------------------------------
1 | {"url": "http://baike.baidu.com/view/52.htm", "header": "\u673a\u5668\u7801", "content": "\u673a\u5668\u7801\u6307\u7684\u662f\u5c06\u786c\u4ef6\u5e8f\u5217\u53f7\u7ecf\u8fc7\u4e00\u7cfb\u5217\u52a0\u5bc6\u3001\u6563\u5217\u5f62\u6210\u7684\u4e00\u4e32\u5e8f\u5217\u53f7\u3002\u786c\u76d8\uff0cCPU\u90fd\u6709\u4e00\u4e2a\u65e0\u6cd5\u4fee\u6539\u7684\u8bc6\u522b\u7801\u3002\u7f51\u5361\u7684MAC\u5176\u5b9e\u4e5f\u7b97\u4e00\u79cd\uff0c\u4f46\u5b83\u662f\u53ef\u4ee5\u4eba\u4e3a\u4fee\u6539\u7684\u3002\u8f6f\u4ef6\u4e3a\u4e86\u9632\u6b62\u76d7\u7248\uff0c\u91c7\u53d6\u4e86\u4e00\u5b9a\u7684\u4fdd\u62a4\u63aa\u65bd\u3002\u5728\u7528\u6237\u6ce8\u518c\u7684\u65f6\u5019\u4f1a\u6839\u636e\u7528\u6237\u8f6f\u4ef6\u6240\u5b89\u88c5\u7684\u8ba1\u7b97\u673a\u8f6f\u786c\u4ef6\u4fe1\u606f\u751f\u6210\u552f\u4e00\u7684\u8bc6\u522b\u7801\uff0c\u4e00\u822c\u79f0\u4f5c\u673a\u5668\u7801\uff0c\u4e5f\u53eb\u5e8f\u5217\u53f7\u3001\u8ba4\u8bc1\u7801\u3001\u6ce8\u518c\u7533\u8bf7\u7801\u7b49\u3002\u673a\u5668\u7801\u4e00\u822c\u7528\u4f5c\u8f6f\u4ef6\u80fd\u591f\u552f\u4e00\u8bc6\u522b\u7684\u673a\u5668\uff0c\u6ce8\u518c\u8f6f\u4ef6\u65f6\u4f1a\u81ea\u52a8\u6839\u636e\u786c\u4ef6\u914d\u7f6e\u4ea7\u751f\u4e00\u4e32\u5e8f\u53f7\uff0c\u8fd9\u4e32\u5e8f\u53f7\u53eb\u673a\u5668\u7801\uff0c\u8f6f\u4ef6\u63d0\u4f9b\u5546\u4e00\u822c\u6839\u636e\u7528\u6237\u6240\u63d0\u4f9b\u7684\u673a\u5668\u7801\u6765\u4ea7\u751f\u552f\u4e00\u7684\u6ce8\u518c\u7801\uff0c\u8fd9\u6837\u6240\u4f7f\u7528\u7684\u8f6f\u4ef6\u5c31\u53ef\u4ee5\u6b63\u5e38\u5de5\u4f5c\u4e86\u3002\u4e0d\u8fc7\u6709\u4e9b\u9ed1\u5ba2\u4eec\u5229\u7528\u673a\u5668\u7801\u548c\u83b7\u5f97\u7684\u6ce8\u518c\u7801\u4e4b\u95f4\u7684\u5173\u7cfb\uff0c\u7814\u7a76\u51fa\u6ce8\u518c\u7801\u8ba1\u7b97\u5668\uff0c\u628a\u673a\u5668\u7801\u8f93\u5165\u8fdb\u53bb\uff0c\u7ecf\u8fc7\u76f8\u5e94\u7684\u7a0b\u5e8f\u8ba1\u7b97\u5c31\u80fd\u5f97\u5230\u6ce8\u518c\u7801\u3002\u673a\u5668\u7801\u662f\u7531\u60a8\u7684\u7535\u8111\u786c\u4ef6\u4fe1\u60
6f\u4ea7\u751f\uff0c\u4e0d\u4f1a\u6539\u53d8\uff0c\u5982\u679c\u60a8\u683c\u5f0f\u5316\u786c\u76d8\u6216\u91cd\u88c5\u7cfb\u7edf\u5219\u53ef\u80fd\u4f1a\u6539\u53d8\u3002\u5982\u679c\u4e2d\u4e86\u75c5\u6bd2\uff0c\u5219\u4e5f\u53ef\u80fd\u4f1a\u6539\u53d8\u3002\u00a0\u00a0\u7535\u8111\u4e0a\u7684\u673a\u5668\u7801\u4ec0\u4e48\u662f\u673a\u5668\u7801\uff1f\u673a\u5668\u7801\u662f\u8ba1\u7b97\u673a\u7684\u552f\u4e00\u7f16\u53f7\uff0c\u4e00\u822c\u53d6\u81ea\u8ba1\u7b97\u673a\u786c\u4ef6\u7684\u5e8f\u5217\u53f7\u3002\u6211\u4eec\u8f6f\u4ef6\u7684\u6ce8\u518c\u7cfb\u7edf\u4f9d\u8d56\u4e8e\u6b64\u673a\u5668\u7801\u533a\u5206\u4e0d\u540c\u7684\u8ba1\u7b97\u673a\u3002\u7531\u4e8e\u673a\u5668\u7801\u6e90\u81ea\u67d0\u4e9b\u786c\u4ef6\uff0c\u6240\u4ee5\u5f53\u60a8\u8ba1\u7b97\u673a\u7684\u786c\u4ef6\u6539\u53d8\u65f6\u53ef\u80fd\u5bfc\u81f4\u673a\u5668\u7801\u6539\u53d8\uff0c\u800c\u8fd9\u65f6\u6211\u4eec\u7684\u6ce8\u518c\u7cfb\u7edf\u4f1a\u8ba4\u4e3a\u662f\u4e00\u53f0\u65b0\u7684\u8ba1\u7b97\u673a\uff0c\u60a8\u53ea\u9700\u8981\u91cd\u65b0\u6ce8\u518c\u5b83\uff0c\u4f46\u91cd\u65b0\u6ce8\u518c\u53ef\u80fd\u51fa\u73b0\u201c\u5df2\u8d85\u8fc7\u6700\u5927\u6388\u6743\u6570\u201d\u7684\u9519\u8bef\uff0c\u8fd9\u662f\u56e0\u4e3a\u8001\u7684\u786c\u4ef6\u5df2\u88ab\u4f5c\u4e3a\u4e00\u53f0\u7535\u8111\u6ce8\u518c\u7684\u7f18\u6545\uff0c\u60a8\u53ea\u9700\u8981\u6309\u63d0\u793a\u7533\u8bf7\u64a4\u9500\u4e0a\u4e00\u53f0\u5c31\u53ef\u4ee5\u4e86\u3002\u8fd9\u662f\u5b8c\u5168\u6b63\u5e38\u7684\uff0c\u8bf7\u4e0d\u7528\u62c5\u5fc3\uff0c\u6ce8\u518c\u7801\u4e0d\u4f1a\u5931\u6548\uff0c\u8fd9\u79cd\u60c5\u51b5\u6211\u4eec\u4f1a\u4fdd\u969c\u6ce8\u518c\u7801\u53ef\u7528\u3002\u6211\u4eec\u53d6\u7684\u673a\u5668\u7801\u662f\u6765\u81ea\u786c\u76d8\u3001\u4e3b\u677f\u6216CPU\uff0c\u5f53\u8fd9\u4e09\u79cd\u786c\u4ef6\u6539\u53d8\u65f6\u53ef\u80fd\u4f1a\u81f4\u4f7f\u673a\u5668\u7801\u6539\u53d8\u800c\u9700\u8981\u91cd\u65b0\u6ce8\u518c\uff0c\u5176\u5b83\u786c\u4ef6\u6539\u53d8\u5219\u6ca1\u6709\u5
f71\u54cd\u3002\u5176\u4e2d\u6700\u4e3b\u8981\u7684\u662f\u786c\u76d8\uff0c\u4e00\u822c\u60c5\u51b5\u4e0b\u786c\u76d8\u7684\u6539\u53d8\u90fd\u9700\u8981\u91cd\u65b0\u6ce8\u518c\u3002\u4e3b\u677f\u548cCPU\u53ea\u6709\u6781\u4e2a\u522b\u60c5\u51b5\u4e0b\u624d\u4f7f\u7528\u3002\u4ec0\u4e48\u65f6\u5019\u4f7f\u7528\u673a\u5668\u7801\uff1f\u4e00\u822c\u60c5\u51b5\u4e0b\uff0c\u60a8\u4e0d\u9700\u8981\u4f7f\u7528\u4e5f\u6ca1\u6709\u5fc5\u8981\u77e5\u9053\u6b64\u673a\u5668\u7801\uff0c\u56e0\u4e3a\u6240\u6709\u7684\u64cd\u4f5c\u90fd\u662f\u81ea\u52a8\u5b8c\u6210\u00a0\u00a0\u6ce8\u518c\u5e94\u7528\u7684\u673a\u5668\u7801\u7684\uff0c\u6ca1\u6709\u5fc5\u8981\u77e5\u9053\u5b83\u3002\u53ea\u6709\u5728\u6280\u672f\u652f\u6301\u65f6\uff0c\u5982\u679c\u6211\u4eec\u9700\u8981\u60a8\u63d0\u4f9b\u673a\u5668\u7801\uff0c\u8fd9\u65f6\u60a8\u624d\u80fd\u7528\u5230\u5b83\u3002\u5373\u4f7f\u5728\u5f53\u60a8\u65e0\u6cd5\u5b8c\u6210\u5728\u7ebf\u6ce8\u518c\u65f6\uff0c\u4e00\u822c\u4e5f\u4e0d\u9700\u8981\u624b\u5de5\u63d0\u4f9b\u673a\u5668\u7801\uff0c\u7a0b\u5e8f\u81ea\u52a8\u751f\u6210\u7684\u90ae\u4ef6\u4e2d\u5c31\u5305\u62ec\u4e86\u8fd9\u4e9b\u4fe1\u606f\uff0c\u4e07\u4e00\u60a8\u9700\u8981\u624b\u5de5\u4e66\u5199\u7533\u8bf7\u6ce8\u518c\u3001\u7533\u8bf7\u64a4\u9500\u3001\u7533\u8bf7\u7eed\u8ba2\u3001\u540c\u6b65\u7eed\u8ba2\u671f\u9650\u7b49\u90ae\u4ef6\u65f6\u624d\u9700\u8981\u624b\u5de5\u63d0\u4f9b\u8fd9\u4e9b\u673a\u5668\u7801\u4fe1\u606f\u3002"}


--------------------------------------------------------------------------------