├── .gitignore ├── README.md ├── cookies.txt ├── images ├── pixiv_0.png ├── pixiv_1.png ├── pixiv_2.png ├── pixiv_3.png ├── pixiv_4.png ├── pixiv_5.png ├── pixiv_6.jpg └── pixiv_7.jpg ├── pixiv.py └── pixiv_.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | /data_/ 9 | .idea 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # IPython Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # P 站非会员查看人气作品 2 | 3 | 
这个项目来源于我的一篇文章 [缺手机壁纸?来看看:也来看看](https://zhuanlan.zhihu.com/p/27466844) 下面的一条评论 4 | 5 | ![](https://github.com/chenjiandongx/pixiv/blob/master/images/pixiv_5.png) 6 | 7 | 其实一开始我是不知道 P 站是什么 8 | 9 | ![](https://github.com/chenjiandongx/pixiv/blob/master/images/pixiv_6.jpg) 10 | 11 | 某度了一下,原来是日本一个插画网站,但是这个网站看人气作品是要会员的。 12 | 13 | ![](https://github.com/chenjiandongx/pixiv/blob/master/images/pixiv_2.png) 14 | 15 | 然后我就又有个大胆的想法了,把插图的链接和 star 数爬取下来,然后进行排序,这样就可以看到人气高的作品了。 16 | 17 | ![](https://github.com/chenjiandongx/pixiv/blob/master/images/pixiv_7.jpg) 18 | 19 | 第一次爬取网站内容,发现没有插图内容,应该是要保持登录状态才行。为此我注册了个账号,目的是为了获取 cookies。F12 获取 cookies 20 | 21 | ![](https://github.com/chenjiandongx/pixiv/blob/master/images/pixiv_1.png) 22 | 23 | 将 cookies 保存在项目下的 cookies.txt 文件里。在代码中组装 cookies 内容 24 | ```python 25 | def cookies(self): 26 | with open("cookies.txt", 'r') as f: 27 | _cookies = {} 28 | for row in f.read().split(';'): 29 | k, v = row.strip().split('=', 1) 30 | _cookies[k] = v 31 | return _cookies 32 | ``` 33 | 34 | 测试一下,爬取 'summer' 关键词前 500 页信息。 35 | 36 | ```python 37 | urls = get_urls("summer", 500) 38 | ``` 39 | 40 | 效果如下 41 | 42 | ![](https://github.com/chenjiandongx/pixiv/blob/master/images/pixiv_3.png) 43 | 44 | 点击排第一的链接 45 | 46 | ![](https://github.com/chenjiandongx/pixiv/blob/master/images/pixiv_4.png) 47 | 48 | 诚不欺我,确实高人气 49 | 不过这个单线程版本(pixiv.py)爬取太多页的话速度有点慢,所以又写了个多线程版本的(pixiv_.py),速度蹭蹭就上去了。 50 | 51 | > 技术本身是无罪的。 —— 原快播CEO王欣 52 | 53 | 当然还是希望有能力的同学支持下这个网站,充个会员。但是,我!就!不! 
54 | -------------------------------------------------------------------------------- /cookies.txt: -------------------------------------------------------------------------------- 1 | p_ab_id=5; p_ab_id_2=9; login_ever=yes; device_token=c8c37fdf24b917b4e7fb191fe11c5ca5; search_tools_toggle=1; _ga=GA1.2.887334537.1498628532; PHPSESSID=25745470_75a76e86ff3145b53e21b440183b4822; a_type=0; is_sensei_service_user=1; module_orders_mypage=%5B%7B%22name%22%3A%22recommended_illusts%22%2C%22visible%22%3Atrue%7D%2C%7B%22name%22%3A%22everyone_new_illusts%22%2C%22visible%22%3Atrue%7D%2C%7B%22name%22%3A%22following_new_illusts%22%2C%22visible%22%3Atrue%7D%2C%7B%22name%22%3A%22mypixiv_new_illusts%22%2C%22visible%22%3Atrue%7D%2C%7B%22name%22%3A%22fanbox%22%2C%22visible%22%3Atrue%7D%2C%7B%22name%22%3A%22featured_tags%22%2C%22visible%22%3Atrue%7D%2C%7B%22name%22%3A%22contests%22%2C%22visible%22%3Atrue%7D%2C%7B%22name%22%3A%22sensei_courses%22%2C%22visible%22%3Atrue%7D%2C%7B%22name%22%3A%22spotlight%22%2C%22visible%22%3Atrue%7D%2C%7B%22name%22%3A%22booth_follow_items%22%2C%22visible%22%3Atrue%7D%5D; __utmt=1; __utma=235335808.887334537.1498628532.1498720739.1498819600.5; __utmb=235335808.1.10.1498819600; __utmc=235335808; __utmz=235335808.1498713152.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=235335808.|2=login%20ever=yes=1^3=plan=normal=1^5=gender=male=1^6=user_id=25745470=1^9=p_ab_id=5=1^10=p_ab_id_2=9=1^11=lang=zh_tw=1 -------------------------------------------------------------------------------- /images/pixiv_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenjiandongx/pixiv/21bb2df2eca873bc38593ffa681f10e7c3a9dec1/images/pixiv_0.png -------------------------------------------------------------------------------- /images/pixiv_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chenjiandongx/pixiv/21bb2df2eca873bc38593ffa681f10e7c3a9dec1/images/pixiv_1.png -------------------------------------------------------------------------------- /images/pixiv_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenjiandongx/pixiv/21bb2df2eca873bc38593ffa681f10e7c3a9dec1/images/pixiv_2.png -------------------------------------------------------------------------------- /images/pixiv_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenjiandongx/pixiv/21bb2df2eca873bc38593ffa681f10e7c3a9dec1/images/pixiv_3.png -------------------------------------------------------------------------------- /images/pixiv_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenjiandongx/pixiv/21bb2df2eca873bc38593ffa681f10e7c3a9dec1/images/pixiv_4.png -------------------------------------------------------------------------------- /images/pixiv_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenjiandongx/pixiv/21bb2df2eca873bc38593ffa681f10e7c3a9dec1/images/pixiv_5.png -------------------------------------------------------------------------------- /images/pixiv_6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenjiandongx/pixiv/21bb2df2eca873bc38593ffa681f10e7c3a9dec1/images/pixiv_6.jpg -------------------------------------------------------------------------------- /images/pixiv_7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenjiandongx/pixiv/21bb2df2eca873bc38593ffa681f10e7c3a9dec1/images/pixiv_7.jpg -------------------------------------------------------------------------------- /pixiv.py: 
-------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | from pprint import pprint 4 | 5 | class Pixiv(): 6 | 7 | def __init__(self, search, page): 8 | self.search = search 9 | self.page = page 10 | self.result = set() 11 | self.headers = { 12 | 'X-Requested-With': 'XMLHttpRequest', 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ' 14 | 'Chrome/56.0.2924.87 Safari/537.36'} 15 | 16 | @property 17 | def cookies(self): 18 | with open("cookies.txt", 'r') as f: 19 | _cookies = {} 20 | for row in f.read().split(';'): 21 | k, v = row.strip().split('=', 1) 22 | _cookies[k] = v 23 | return _cookies 24 | 25 | def run(self): 26 | fmt = 'https://www.pixiv.net/search.php?word={}&order=date_d&p={}' 27 | urls = [fmt.format(self.search, p) for p in range(1, self.page)] 28 | total = 1 29 | for url in urls: 30 | req = requests.get(url, headers=self.headers, cookies=self.cookies).text 31 | bs = BeautifulSoup(req, 'lxml').find('ul', class_="_image-items autopagerize_page_element") 32 | for b in bs.find_all('li', class_="image-item"): 33 | try: 34 | href = b.find('a', class_="work _work ")['href'] 35 | star = b.find('ul', class_="count-list").find('li').find('a').text 36 | self.result.add(("https://www.pixiv.net{}".format(href), int(star))) 37 | print(total) 38 | total += 1 39 | except: 40 | pass 41 | pprint(sorted(self.result, key=lambda v: v[1], reverse=True)) # 按star数降序排序 42 | 43 | if __name__ == "__main__": 44 | spider = Pixiv("winter", 100) 45 | spider.run() 46 | -------------------------------------------------------------------------------- /pixiv_.py: -------------------------------------------------------------------------------- 1 | from concurrent import futures 2 | import threading 3 | 4 | from pprint import pprint 5 | import requests 6 | from bs4 import BeautifulSoup 7 | 8 | headers = { 9 | 'X-Requested-With': 'XMLHttpRequest', 10 | 
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ' 11 | 'Chrome/56.0.2924.87 Safari/537.36' 12 | } 13 | 14 | 15 | def get_cookies(): 16 | with open("cookies.txt", 'r') as f: 17 | _cookies = {} 18 | for row in f.read().split(';'): 19 | k, v = row.strip().split('=', 1) 20 | _cookies[k] = v 21 | return _cookies 22 | 23 | cookies = get_cookies() 24 | result = set() 25 | lock = threading.Lock() # 多线程全局资源锁 26 | total = 1 27 | 28 | def crawl(url): 29 | global total 30 | req = requests.get(url, headers=headers, cookies=cookies).text 31 | bs = BeautifulSoup(req, 'lxml').find('ul', class_="_image-items autopagerize_page_element") 32 | for b in bs.find_all('li', class_="image-item"): 33 | try: 34 | with lock: 35 | href = b.find('a', class_="work _work ")['href'] 36 | star = b.find('ul', class_="count-list").find('li').find('a').text 37 | result.add(("https://www.pixiv.net" + href, int(star))) 38 | print(total) 39 | total += 1 40 | except: 41 | pass 42 | 43 | 44 | def get_urls(search, page): 45 | fmt = 'https://www.pixiv.net/search.php?word={}&order=date_d&p={}' 46 | return [fmt.format(search, p) for p in range(1, page)] 47 | 48 | 49 | if __name__ == "__main__": 50 | urls = get_urls("summer", 500) 51 | with futures.ThreadPoolExecutor(32) as executor: 52 | executor.map(crawl, urls) 53 | pprint(sorted(result, key=lambda v: v[1], reverse=True)) # 按star数降序排序 54 | --------------------------------------------------------------------------------