├── .gitignore
├── LICENSE
├── README.md
├── joble
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── scraper
│   │   └── google.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── glassdoor.py
│       ├── monsterindia.py
│       └── naukri.py
├── requirements.txt
└── scrapy.cfg
/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
.vscode/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Ravishankar Chavare

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Joble
Joble searches thousands of job boards for positions in different technologies from all over the world.


## Installation

Step 1: Clone Joble.
```
git clone https://github.com/chavarera/Joble.git
```
Step 2: Change the working directory to Joble.
```
cd Joble
```
Step 3: Create a virtual environment.
```
virtualenv -p python3 venv
```

Step 4: Activate the virtual environment.
```
source venv/bin/activate
```

Step 5: Install the required packages.
```
pip install -r requirements.txt
```

Step 6: Run the spiders described below.


## List of Spiders Available
1. Naukri
2. MonsterIndia


### 1. Naukri
Gets 20 jobs per category by default.
```
scrapy crawl Naukri
```

Available options:
1. city
2. count
3. keyword

For example:
```
scrapy crawl Naukri -a keyword=python -a count=20 -a city=pune
```

Export the output as CSV or JSON:
```
scrapy crawl Naukri -a keyword=python -o python.csv
```

### 2. MonsterIndia
Available options:
1. location
2. count
3. keyword

For example:
```
scrapy crawl MonsterIndia -a keyword=python -a count=20
```
--------------------------------------------------------------------------------
/joble/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Python-World/Joble/b44ea73ff798b52df036810526ecfe2775b5d8e1/joble/__init__.py
--------------------------------------------------------------------------------
/joble/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class JobleItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
/joble/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class JobleSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class JobleDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
--------------------------------------------------------------------------------
/joble/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class JoblePipeline:
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/joble/scraper/google.py:
--------------------------------------------------------------------------------
import argparse

import requests
from bs4 import BeautifulSoup


# remove stray query parameters from the end of the url if they exist
def fix_url(url_list):
    if len(url_list[-1]) > 8:
        url_list[-1] = url_list[-1].split("&")[0]
    return url_list


# return the link to the careers page from a google search
def get_career_page(name):
    query = name.replace(" ", "+")
    URL = f"https://google.com/search?q={query}+careers"

    resp = requests.get(URL, timeout=10)
    if "." in name:
        name = name.split(".")[0]

    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")

        for a in soup.find_all("a", href=True):
            if "url" in a["href"]:
                # removes '/url?q=' from the start of the url
                url_list = a["href"][7:].split("/")
                if len(url_list) > 2 and name in url_list[2]:
                    return "/".join(fix_url(url_list))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("name", help="name of the company", type=str)
    args = parser.parse_args()

    url = get_career_page(args.name)
    print(url)
--------------------------------------------------------------------------------
/joble/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for joble project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "joble"

SPIDER_MODULES = ["joble.spiders"]
NEWSPIDER_MODULE = "joble.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'joble (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'joble.middlewares.JobleSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'joble.middlewares.JobleDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'joble.pipelines.JoblePipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/joble/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/joble/spiders/glassdoor.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy


# scrapy crawl Glassdoor
class GlassdoorSpider(scrapy.Spider):
    name = "Glassdoor"
    allowed_domains = ["glassdoor.com"]
    url = "https://www.glassdoor.com"

    def __init__(self, keyword=None, count=20):
        self.keyword = keyword
        self.count = int(count)

    def start_requests(self):
        url = "{}/Job/jobs.htm?sc.keyword={}".format(self.url, self.keyword)
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        elements = response.css("ul.jlGrid li.react-job-listing")

        for element in elements[: self.count]:
            job = {
                "title": element.attrib["data-normalize-job-title"],
                "location": element.attrib["data-job-loc"],
                "employer": element.css(
                    "div div.jobHeader a span::text"
                ).get(),
                # hrefs on the listing page are relative, so prefix the site URL
                "job-link": self.url
                + element.css("div div.jobHeader a::attr(href)").get(),
            }
            yield job
--------------------------------------------------------------------------------
/joble/spiders/monsterindia.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import json
from urllib.parse import urlencode, urljoin

import scrapy
from scrapy import Request


# Execute : scrapy crawl MonsterIndia -a keyword=python
class MonsterindiaSpider(scrapy.Spider):
    name = "MonsterIndia"
    allowed_domains = ["monsterindia.com"]
    start_urls = ["http://monsterindia.com/"]

    def __init__(self, keyword, count=20, location=None):
        self.URL = "http://monsterindia.com/"
        self.count = count
        self.keyword = keyword
        self.location = location

    def get_url(self):
        base_url = "https://www.monsterindia.com/middleware/jobsearch?"
        params = {
            "sort": "2",
            "limit": self.count,
            "query": self.keyword,
            "locations": self.location,
        }
        if self.location is None:
            params.pop("locations")

        return {"url": base_url, "body": urlencode(params)}

    def parse(self, response):
        url = self.get_url()
        yield Request(
            url["url"] + url["body"], meta={"url": url}, callback=self.JobData
        )

    def JobData(self, response):
        # The search endpoint returns JSON, so parse it directly instead of
        # string-patching "true"/"false" and calling eval() on the payload.
        jobdata = json.loads(response.text)
        if jobdata.get("jobSearchResponse"):
            for record in jobdata["jobSearchResponse"]["data"]:
                headers = [
                    "jobId",
                    "title",
                    "locations",
                    "updatedAt",
                    "summary",
                    "skills",
                    "companyName",
                    "seoJdUrl",
                ]
                job_details = {}
                for head in headers:
                    try:
                        job_details[head] = record.get(head)
                    except Exception as ex:
                        print("error in head:", head, ex)
                job_details["seoJdUrl"] = urljoin(
                    self.URL, job_details["seoJdUrl"]
                )
                if job_details["jobId"]:
                    yield job_details
--------------------------------------------------------------------------------
/joble/spiders/naukri.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import json
import urllib.parse

import scrapy


# scrapy crawl Naukri
class NaukriSpider(scrapy.Spider):
    name = "Naukri"
    allowed_domains = ["naukri.com"]
    start_urls = ["https://www.naukri.com"]

    def __init__(self, keyword=None, count=20, city=None):
        self.count = count
        self.keyword = keyword
        self.city = city

    def get_url(self):
        base_url = "https://www.naukri.com/jobapi/v3/search?"
        params = {
            "noOfResults": self.count,
            "urlType": "search_by_key_loc",
            "searchType": "adv",
            "keyword": self.keyword,
            "location": self.city,
            "sort": "r",
            "k": self.keyword,
            "l": self.city,
            "seoKey": "{}-jobs-in-{}".format(self.keyword, self.city)
            if self.city
            else "{}-jobs".format(self.keyword),
            "src": "jobsearchDesk",
            "latLong": "",
        }
        default = ["keyword", "sort", "l", "k", "location"]
        if self.city is None:
            for key in default:
                params.pop(key)
        return {"url": base_url, "body": params}

    def parse(self, response):
        if self.keyword:
            record = self.get_url()
            yield scrapy.Request(
                url=record["url"] + urllib.parse.urlencode(record["body"]),
                headers={
                    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0",
                    "appid": "109",
                    "systemid": "109",
                },
                meta={"keyword": self.keyword},
                callback=self.jobData,
            )
        else:
            yield scrapy.Request(
                "https://www.naukri.com/jobs-by-category",
                callback=self.get_by_category,
            )

    def get_by_category(self, response):
        for j in response.xpath('//div[@class="lmrWrap wrap"]/div/div/div/a'):
            title = j.xpath("text()").get().strip()
            url = j.xpath("@href").get().strip()
            yield scrapy.Request(
                url,
                callback=self.job_list,
                meta={"keyword": title, "count": 0, "plink": url},
            )

    def job_list(self, response):
        plink = response.meta["plink"].split("/")[-1]
        keyword = plink.split("-jobs")[0]
        seokeys = keyword + "-jobs"
        ids = plink.split("=")[-1]
        joburl = "https://www.naukri.com/jobapi/v3/search?noOfResults=20&urlType=search_by_keyword&searchType=adv&keyword={}&xt=catsrch&functionAreaId={}&seoKey={}&src=jobsearchDesk&latLong=".format(
            keyword, ids, seokeys
        )
        yield scrapy.Request(
            joburl,
            headers={
                "Referer": response.meta["plink"],
                "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0",
                "appid": "109",
                "systemid": "109",
            },
            meta={
                "url": response.meta["plink"],
                "keyword": keyword,
                "ids": ids,
                "seokeys": seokeys,
            },
            callback=self.jobData,
        )

    def jobData(self, response):
        # The job-search API returns JSON, so parse it directly instead of
        # string-patching "true"/"false" and calling eval() on the payload.
        jobdata = json.loads(response.text)
        if jobdata.get("jobDetails"):
            for job in jobdata["jobDetails"]:
                place = job["placeholders"]
                detail = {}
                for p in place:
                    key, value = p.values()
                    detail[key] = value
                details = {
                    "category": response.meta["keyword"],
                    "title": job["title"],
                    "jobId": job["jobId"],
                    "companyName": job["companyName"],
                    "skills": job.get("tagsAndSkills"),
                    "joburl": job["jdURL"],
                    "postedon": job["footerPlaceholderLabel"],
                    "description": job.get("jobDescription"),
                }
                final_result = {**detail, **details}
                yield final_result
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
attrs==19.3.0
Automat==20.2.0
beautifulsoup4==4.9.2
cffi==1.14.0
constantly==15.1.0
cryptography==41.0.4
cssselect==1.1.0
hyperlink==19.0.0
idna==2.10
incremental==17.5.0
itemadapter==0.1.0
lxml==4.9.1
parsel==1.6.0
Protego==0.1.16
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.20
PyDispatcher==2.0.5
PyHamcrest==2.0.2
pyOpenSSL==19.1.0
queuelib==1.5.0
requests==2.31.0
Scrapy==2.6.2
service-identity==18.1.0
six==1.15.0
Twisted==22.10.0
w3lib==1.22.0
zope.interface==5.1.0
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = joble.settings

[deploy]
#url = http://localhost:6800/
project = joble
--------------------------------------------------------------------------------
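The README drives the spiders through the `scrapy crawl` CLI. As a rough illustration only — this script is not part of the repository, and the output file name and the `run_naukri.py` name are assumptions — the same crawl can also be launched from Python with Scrapy's `CrawlerProcess`, which can be useful for scheduling or embedding:

```
# run_naukri.py - hypothetical helper; run it from the project root so that
# scrapy.cfg and joble/settings.py can be discovered.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
# Write the scraped items to a JSON file; the file name is an arbitrary choice.
settings.set("FEEDS", {"python_jobs.json": {"format": "json"}})

process = CrawlerProcess(settings)
# Spider name and arguments mirror the CLI call:
# scrapy crawl Naukri -a keyword=python -a count=20 -a city=pune
process.crawl("Naukri", keyword="python", count=20, city="pune")
process.start()  # blocks until the crawl finishes
```

For one-off runs, exporting with `-o python.csv` or `-o python.json` on the command line remains the simpler option.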