# Repository layout (from the original listing):
#   AppSpider/
#       action.py
#       mongodb_client.py
#       www_zhiping_com_spider.py
#   README.md
#   show_images/
#       show1.png
#       show2.png
#       show3.png

# --------------------------------------------------------------------------
# /AppSpider/action.py
# --------------------------------------------------------------------------
import time


class Action:
    """Gestures used to drive the BOSS Zhipin Android app on one device.

    ``device`` is the handle the spider obtains from ``uiautomator2.connect()``.
    All tap/swipe positions are expressed as fractions of the display size
    read from ``device.info``, so they scale with the screen resolution.
    """

    def __init__(self, device):
        self.d = device
        self.Width = device.info.get("displayWidth")
        self.Height = device.info.get("displayHeight")

    def getDevice(self):
        """Return the wrapped device handle."""
        return self.d

    def ToUp(self):
        """Scroll the first scrollable view one step forward (page up)."""
        self.d(scrollable=True).scroll.vert.forward()

    def ToRight(self):
        """Swipe from the right edge to the left edge to turn the page.

        ``swipe`` takes ``(fx, fy, tx, ty)``.  The original call passed
        ``Width * 0.93`` as the start *y* and ``Height * 0.05`` as the end
        *x*, mixing the two axes; the gesture is now a level horizontal
        swipe at 56% of the screen height.
        """
        row = self.Height * 0.56
        self.d.swipe(self.Width * 0.93, row, self.Width * 0.05, row)

    def ToClick(self):
        """Tap a fixed point in the upper part of the screen.

        NOTE(review): the spider calls this to expand a truncated job
        description; the exact target depends on the app layout - confirm.
        """
        self.d.click(self.Width * 0.45, self.Height * 0.18)

    def whileToUp(self, resourceId, max_scrolls=30):
        """Scroll up until the TextView with ``resourceId`` is on screen.

        Args:
            resourceId: Android resource id of the TextView to wait for.
            max_scrolls: Upper bound on scroll attempts.  ``None`` restores
                the old unbounded behaviour.

        Raises:
            RuntimeError: the element did not appear within ``max_scrolls``
                scrolls (previously this looped forever).
        """
        scrolled = 0
        # Each probe waits up to 2 seconds for the element before scrolling.
        while not self.d(className="android.widget.TextView",
                         resourceId=resourceId).exists(2):
            if max_scrolls is not None and scrolled >= max_scrolls:
                raise RuntimeError(
                    "element {!r} not found after {} scrolls".format(
                        resourceId, scrolled))
            self.ToUp()
            scrolled += 1

    def init(self, keyword):
        """Open the app and run a search for ``keyword``.

        Returns:
            ``None`` on success, or an error message string when the search
            box cannot be found within 5 seconds.  Callers should check it.
        """
        self.d(text="BOSS直聘").click()
        time.sleep(5)  # give the app time to launch
        # presumably the search icon in the top-right corner - TODO confirm
        self.d.click(self.Width * 0.9, self.Height * 0.08)
        et_search = self.d(resourceId="com.hpbr.bosszhipin:id/et_search")
        if not et_search.exists(5):
            return "爬虫搜索出现错误"
        et_search.set_text(keyword)
        # presumably the keyboard's search key (bottom-right) - TODO confirm
        self.d.click(self.Width * 0.93, self.Height * 0.95)
        time.sleep(2)
        # presumably the first entry of the result list - TODO confirm
        self.d.click(self.Width * 0.5, self.Height * 0.28)


# --------------------------------------------------------------------------
# /AppSpider/mongodb_client.py
# --------------------------------------------------------------------------
class Mongo:
    """Minimal wrapper around a single MongoDB collection."""

    def __init__(self, client, db, col):
        """Connect to ``client`` (a MongoDB URI) and select ``db[col]``."""
        # Imported here so the gesture helpers above stay importable on
        # machines that do not have pymongo installed.
        import pymongo

        self.myclient = pymongo.MongoClient(client)
        self.mydb = self.myclient[db]
        self.mycol = self.mydb[col]

    def insert_one(self, dict):
        """Insert one document and return its ``inserted_id``.

        The parameter keeps its original name (it shadows the builtin) so
        existing keyword callers continue to work.
        """
        return self.mycol.insert_one(dict).inserted_id
# --------------------------------------------------------------------------
# /AppSpider/www_zhiping_com_spider.py
# --------------------------------------------------------------------------
import time

# Fields visible at the top of a job page: (result key, TextView resource id).
_HEADER_FIELDS = (
    ("salary", "com.hpbr.bosszhipin:id/tv_job_salary"),
    ("location", "com.hpbr.bosszhipin:id/tv_required_location"),
    ("work_exp", "com.hpbr.bosszhipin:id/tv_required_work_exp"),
    ("degree", "com.hpbr.bosszhipin:id/tv_required_degree"),
    ("job_name", "com.hpbr.bosszhipin:id/tv_job_name"),
)

# Company fields further down the page; each is scrolled into view first.
_COMPANY_FIELDS = (
    ("com_name", "com.hpbr.bosszhipin:id/tv_com_name"),
    ("com_info", "com.hpbr.bosszhipin:id/tv_com_info"),
    ("tv_location", "com.hpbr.bosszhipin:id/tv_location"),
)

_DESCRIPTION_ID = "com.hpbr.bosszhipin:id/tv_description"
_TAG_CONTAINER_ID = "com.hpbr.bosszhipin:id/flexboxLayout"


def SplitString(resourceId):
    """Build the XPath that selects the TextView with ``resourceId``."""
    return "//android.widget.TextView[@resource-id ='{}']".format(resourceId)


def _read_text(d, resource_id):
    """Return the text of the TextView identified by ``resource_id``."""
    return d.xpath(SplitString(resource_id)).get_text()


def _read_tags(d):
    """Collect the job tags by probing child indexes until one is missing."""
    container = d(className="android.view.ViewGroup",
                  resourceId=_TAG_CONTAINER_ID)
    tags = []
    index = 0
    while True:
        try:
            tag = container.child(index=str(index),
                                  className="android.widget.TextView").get_text(2)
        except Exception:
            # A missing child marks the end of the tag list.  Only
            # ``Exception`` is caught so Ctrl-C is no longer swallowed here.
            break
        tags.append(tag)
        index += 1
    return tags


def spider(action):
    """Scrape the job page currently on screen.

    Args:
        action: object providing ``getDevice``, ``whileToUp``, ``ToUp`` and
            ``ToClick`` (an ``Action``).

    Returns:
        dict with keys ``salary``, ``location``, ``work_exp``, ``degree``,
        ``job_name``, ``tag_list``, ``description``, ``com_name``,
        ``com_info`` and ``tv_location``.

    Any exception raised by the device calls propagates to the caller.
    """
    d = action.getDevice()
    result = {}

    for key, resource_id in _HEADER_FIELDS:
        result[key] = _read_text(d, resource_id)

    result['tag_list'] = _read_tags(d)

    action.whileToUp(_DESCRIPTION_ID)
    description = _read_text(d, _DESCRIPTION_ID)
    if "查看全部" in description:
        # The description is truncated: scroll once more, tap to expand it,
        # then read the full text.
        action.ToUp()
        time.sleep(1)
        action.ToClick()
        description = _read_text(d, _DESCRIPTION_ID)
    result['description'] = description

    for key, resource_id in _COMPANY_FIELDS:
        action.whileToUp(resource_id)
        result[key] = _read_text(d, resource_id)

    return result


def run_crawl(action, mongo, pause_before_swipe=2, pause_after_swipe=1):
    """Scrape page after page until one fails; return the number stored.

    Args:
        action: gesture helper passed to ``spider`` and used to turn pages.
        mongo: object with ``insert_one(document)``.
        pause_before_swipe: seconds to wait after storing a record.
        pause_after_swipe: seconds to wait after turning the page.

    Returns:
        Count of records actually inserted.
    """
    count = 0
    try:
        while True:
            try:
                result = spider(action)
            except Exception as exc:
                # Report why the crawl stopped instead of hiding it.
                print("爬取中断:{}".format(exc))
                break
            mongo.insert_one(result)
            count += 1
            print("已爬取到:{}条数据".format(str(count)))
            time.sleep(pause_before_swipe)
            action.ToRight()
            time.sleep(pause_after_swipe)
    except KeyboardInterrupt:
        # Ctrl-C at any point ends the crawl but still reports the total.
        pass
    return count


def start():
    """Prompt for a keyword, crawl matching jobs and store them in MongoDB.

    Records go to database ``spider_data`` in a collection named after the
    keyword.  The final summary now reports the number of records actually
    stored; the original counter started at 1 and over-reported by one.
    """
    # Device and database dependencies are imported here so that ``spider``
    # and ``SplitString`` can be imported without them.
    import uiautomator2 as us2
    from AppSpider.action import Action
    from AppSpider.mongodb_client import Mongo

    keyword = input("请输入爬取的关键词:")
    client = "mongodb://localhost:27017/"
    db = "spider_data"
    col = keyword
    mongo = Mongo(client, db, col)
    device = us2.connect()
    action = Action(device)

    count = 0
    # ``init`` returns an error message on failure and ``None`` on success.
    error = action.init(keyword)
    if error:
        print(error)
    else:
        count = run_crawl(action, mongo)
    print("爬虫终止,共爬取" + str(count) + "条数据!!!")


if __name__ == "__main__":
    start()


# --------------------------------------------------------------------------
# /README.md
# --------------------------------------------------------------------------
# # BOSS Zhipin job-posting crawler
#
# > ### Requirement
# Scrape BOSS job details, salary, company location and similar information.
#
# > ### Analysis
# The selenium module could simulate a user's actions in a browser to scrape
# the data, but the official site's anti-crawler measures keep getting
# stronger and the IP address is easily banned.
# ![error](./show_images/show1.png)
#
# Instead, the uiautomator2 module can automate the BOSS Zhipin app. This
# needs an Android phone connected to the computer with a USB cable: enable
# USB debugging on the phone, put the BOSS Zhipin app on the home screen and
# run the Python script. The scraped results are stored in MongoDB.
#
# This approach is not very fast, but it is fairly stable.
#
# > ### Result
# ![show](./show_images/show3.png)
# ![show](./show_images/show2.png)
#
# --------------------------------------------------------------------------
# /show_images/show1.png:
# https://raw.githubusercontent.com/Brant-lzh/zhipin_com_spider/5aa518175c9d2956042473474b44d418ae9f1684/show_images/show1.png
-------------------------------------------------------------------------------- /show_images/show2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Brant-lzh/zhipin_com_spider/5aa518175c9d2956042473474b44d418ae9f1684/show_images/show2.png -------------------------------------------------------------------------------- /show_images/show3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Brant-lzh/zhipin_com_spider/5aa518175c9d2956042473474b44d418ae9f1684/show_images/show3.png --------------------------------------------------------------------------------