├── README.md
├── Requirements.txt
├── data.xlsx
├── samplecode.txt
└── web scraping
    ├── g1
        ├── __init__.pyc
        ├── items.py
        ├── middlewares.py
        ├── pipelines.py
        ├── settings.py
        ├── settings.pyc
        └── spiders
        │   ├── __init__.py
        │   ├── __init__.pyc
        │   ├── example.py
        │   └── example.pyc
    ├── glassdoor data
        ├── __init__.pyc
        ├── items.py
        ├── middlewares.py
        ├── pipelines.py
        ├── settings.py
        ├── settings.pyc
        └── spiders
        │   ├── __init__.py
        │   ├── __init__.pyc
        │   ├── example.py
        │   └── example.pyc
    ├── out.csv
    └── scrapy.cfg


/README.md:
--------------------------------------------------------------------------------
1 | # Web-scraping-job-portal-sites
2 | The framework can be used to scrape data from job portals such as glassdoor.com and monster.com.
3 | Data extracted: company name, industry, skills, education level, career level, salary details, job type, etc.
4 | 
5 | How to debug the project:
6 | To debug the Python code, use the pdb debugger. Tutorial: <a href="https://www.getechready.com/python-debugger-pdb/">Python pdb debugger</a>
7 | 


--------------------------------------------------------------------------------
/Requirements.txt:
--------------------------------------------------------------------------------
1 | ----------REQUIREMENTS----------
2 | *Python2.7
3 | *Scrapy Framework
4 | *Docker
5 | *Splash
6 | 


--------------------------------------------------------------------------------
/data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ashishkapil/Web-scraping-job-portal-sites/d9ea377a9f191dda9a461b2d55f8aa13faa4ca0e/data.xlsx


--------------------------------------------------------------------------------
/samplecode.txt:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | class MonsterComSpider(scrapy.Spider):
 4 |     name = 'monsterca'
 5 |     #allowed_domains = ['www.monster.ca']
 6 |     start_urls = ['https://www.monster.ca/jobs/search/?q=data-analyst&page=1']
 7 |     def parse(self, response):
 8 |         urls = response.css('div.jobTitle > h2 > a::attr(href)').extract()
 9 |        
10 |         for url in urls:
11 |             yield scrapy.Request(url = url, callback = self.parse_details)
12 | 
13 |     #crawling all the pages
14 |         next_page_url = response.xpath('//head/link[@rel="next"]/@href').extract_first()
15 |         
16 |         if next_page_url:
17 |            next_page_url = response.urljoin(next_page_url) 
18 |            yield scrapy.Request(url = next_page_url, callback = self.parse)            
19 | 
20 | 
21 |     def parse_details(self,response):
22 |          if response.css('div[id = JobDescription] > span[id = TrackingJobBody] > ul'):
23 |               yield {         
24 |                       'Job Post' : response.css('div.opening.col-sm-12 > h1::text').extract_first(),
25 |                       'Location' : response.css('div.opening.col-sm-12 > h2::text').extract_first(),
26 |                       'Description' : "\n".join(response.css('div[id = JobDescription] > span[id = TrackingJobBody] > ul > li::text').extract())
27 |                      }
28 |          elif response.css('div[id = JobDescription] > span[id = TrackingJobBody]'):
29 |             yield {         
30 |                       'Job Post' : response.css('div.opening.col-sm-12 > h1::text').extract_first(),
31 |                       'Location' : response.css('div.opening.col-sm-12 > h2::text').extract_first(),
32 |                       'Description' : "\n".join(response.css('div[id = JobDescription] > span[id = TrackingJobBody]').xpath(".//text()").extract()),
33 |                       'Description2' : "\n".join(response.css('div[id = JobDescription] > span[id = TrackingJobBody]::text').extract())
34 |                      }
35 |              
36 |  
37 |        
38 |         #'Description' : response.css('div[id = JobDescription]').extract()
39 |                   
40 |             
41 |         


--------------------------------------------------------------------------------
/web scraping/g1/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ashishkapil/Web-scraping-job-portal-sites/d9ea377a9f191dda9a461b2d55f8aa13faa4ca0e/web scraping/g1/__init__.pyc


--------------------------------------------------------------------------------
/web scraping/g1/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class G1Item(scrapy.Item):
12 |     """Scrapy item model; no fields are declared yet."""
13 |     # Declare fields here, e.g.: name = scrapy.Field()
14 |     pass
15 | 


--------------------------------------------------------------------------------
/web scraping/g1/middlewares.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your spider middleware
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
 7 | 
 8 | from scrapy import signals
 9 | 
10 | 
11 | class G1SpiderMiddleware(object):
12 |     # No-op spider middleware: the hooks below pass results and start
13 |     # requests through unchanged. Not all methods need to be defined; if one
14 |     # is missing, Scrapy acts as if it does not modify the passed objects.
15 | 
16 |     @classmethod
17 |     def from_crawler(cls, crawler):
18 |         # Build the middleware instance and subscribe it to spider_opened.
19 |         s = cls()
20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 |         return s
22 | 
23 |     def process_spider_input(self, response, spider):
24 |         # Called for each response that goes through the spider
25 |         # middleware and into the spider.
26 | 
27 |         # Should return None or raise an exception.
28 |         return None
29 | 
30 |     def process_spider_output(self, response, result, spider):
31 |         # Called with the results returned from the Spider, after
32 |         # it has processed the response.
33 | 
34 |         # Must return an iterable of Request, dict or Item objects.
35 |         for i in result:
36 |             yield i
37 | 
38 |     def process_spider_exception(self, response, exception, spider):
39 |         # Called when a spider or process_spider_input() method
40 |         # (from other spider middleware) raises an exception.
41 | 
42 |         # Should return either None or an iterable of Request, dict
43 |         # or Item objects. Here nothing is done, so None is returned.
44 |         pass
45 | 
46 |     def process_start_requests(self, start_requests, spider):
47 |         # Called with the start requests of the spider, and works
48 |         # similarly to the process_spider_output() method, except
49 |         # that it doesn't have a response associated.
50 | 
51 |         # Must return only requests (not items).
52 |         for r in start_requests:
53 |             yield r
54 | 
55 |     def spider_opened(self, spider):
56 |         spider.logger.info('Spider opened: %s' % spider.name)
57 | 


--------------------------------------------------------------------------------
/web scraping/g1/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | # No-op item pipeline: process_item returns each item unchanged.
 9 | class G1Pipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 | 


--------------------------------------------------------------------------------
/web scraping/g1/settings.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Scrapy settings for g1 project
  4 | #
  5 | # For simplicity, this file contains only settings considered important or
  6 | # commonly used. You can find more settings consulting the documentation:
  7 | #
  8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
  9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 11 | 
 12 | BOT_NAME = 'g1'
 13 | 
 14 | SPIDER_MODULES = ['g1.spiders']
 15 | NEWSPIDER_MODULE = 'g1.spiders'
 16 | 
 17 | # scrapy-splash integration. NOTE(review): SPLASH_URL is a hard-coded address; confirm it points at your Splash instance.
 18 | SPLASH_URL = 'http://192.168.99.100:8050'
 19 | DOWNLOADER_MIDDLEWARES = {
 20 |         'scrapy_splash.SplashCookiesMiddleware': 723,
 21 |         'scrapy_splash.SplashMiddleware': 725,
 22 |         'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
 23 |         }
 24 | SPIDER_MIDDLEWARES = {
 25 |         'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 
 26 |         }
 27 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
 28 | #HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
 29 | 
 30 | 
 31 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
 32 | #USER_AGENT = 'g1 (+http://www.yourdomain.com)'
 33 | 
 34 | # robots.txt rules are NOT obeyed here (set to True to respect them)
 35 | ROBOTSTXT_OBEY = False
 36 | 
 37 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
 38 | #CONCURRENT_REQUESTS = 32
 39 | 
 40 | # Configure a delay for requests for the same website (default: 0)
 41 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 42 | # See also autothrottle settings and docs
 43 | #DOWNLOAD_DELAY = 3
 44 | # The download delay setting will honor only one of:
 45 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 46 | #CONCURRENT_REQUESTS_PER_IP = 16
 47 | 
 48 | # Disable cookies (enabled by default)
 49 | #COOKIES_ENABLED = False
 50 | 
 51 | # Disable Telnet Console (enabled by default)
 52 | #TELNETCONSOLE_ENABLED = False
 53 | 
 54 | # Override the default request headers:
 55 | #DEFAULT_REQUEST_HEADERS = {
 56 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 57 | #   'Accept-Language': 'en',
 58 | #}
 59 | 
 60 | # Enable or disable spider middlewares
 61 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 62 | #SPIDER_MIDDLEWARES = {
 63 | #    'g1.middlewares.G1SpiderMiddleware': 543,
 64 | #}
 65 | 
 66 | # Enable or disable downloader middlewares
 67 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 68 | #DOWNLOADER_MIDDLEWARES = {
 69 | #    'g1.middlewares.MyCustomDownloaderMiddleware': 543,
 70 | #}
 71 | 
 72 | # Enable or disable extensions
 73 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
 74 | #EXTENSIONS = {
 75 | #    'scrapy.extensions.telnet.TelnetConsole': None,
 76 | #}
 77 | 
 78 | # Configure item pipelines
 79 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 80 | #ITEM_PIPELINES = {
 81 | #    'g1.pipelines.G1Pipeline': 300,
 82 | #}
 83 | 
 84 | # Enable and configure the AutoThrottle extension (disabled by default)
 85 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
 86 | #AUTOTHROTTLE_ENABLED = True
 87 | # The initial download delay
 88 | #AUTOTHROTTLE_START_DELAY = 5
 89 | # The maximum download delay to be set in case of high latencies
 90 | #AUTOTHROTTLE_MAX_DELAY = 60
 91 | # The average number of requests Scrapy should be sending in parallel to
 92 | # each remote server
 93 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 94 | # Enable showing throttling stats for every response received:
 95 | #AUTOTHROTTLE_DEBUG = False
 96 | 
 97 | # Enable and configure HTTP caching (disabled by default)
 98 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 99 | #HTTPCACHE_ENABLED = True
100 | #HTTPCACHE_EXPIRATION_SECS = 0
101 | #HTTPCACHE_DIR = 'httpcache'
102 | #HTTPCACHE_IGNORE_HTTP_CODES = []
103 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
104 | 


--------------------------------------------------------------------------------
/web scraping/g1/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ashishkapil/Web-scraping-job-portal-sites/d9ea377a9f191dda9a461b2d55f8aa13faa4ca0e/web scraping/g1/settings.pyc


--------------------------------------------------------------------------------
/web scraping/g1/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/web scraping/g1/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ashishkapil/Web-scraping-job-portal-sites/d9ea377a9f191dda9a461b2d55f8aa13faa4ca0e/web scraping/g1/spiders/__init__.pyc


--------------------------------------------------------------------------------
/web scraping/g1/spiders/example.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | from scrapy_splash import SplashRequest
 4 | main_url = "https://www.glassdoor.ca"
 5 | class ExampleSpider(scrapy.Spider):
 6 |     name = 'example'
 7 |     #allowed_domains = ['https://www.glassdoor.co.in/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11.htm']
 8 |     start_urls = ['https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11.htm']
 9 |     main_url = "https://www.glassdoor.ca"
10 |     #handling javascript pages
11 |     def start_requests(self):
12 |         for url in self.start_urls:
13 |             yield SplashRequest(url, self.parse,
14 |                endpoint='render.html',
15 |                 args={'wait': 0.5},
16 |             )
17 |     
18 |     def parse(self, response):
19 |         urls = response.css('li.jl > div > div.flexbox > div > a::attr(href)').extract_first()
20 |         urls = main_url + urls
21 |         self.log(urls)
22 |         for url in urls:
23 |             yield scrapy.Request(url = url, callback = self.parse_details)
24 |             
25 | 
26 |     def parse_details(self, response):
27 |        
28 |             if response.css('div[id = JobDescription] > span[id = TrackingJobBody] > ul'):
29 |               yield {         
30 |                       'Job Post' : response.css('div.opening.col-sm-12 > h1::text').extract_first(),
31 |                       'Location' : response.css('div.opening.col-sm-12 > h2::text').extract_first(),
32 |                       'Description' : "\n".join(response.css('div[id = JobDescription] > span[id = TrackingJobBody] > ul > li::text').extract())
33 |                      }
34 |             elif response.css('div[id = JobDescription] > span[id = TrackingJobBody]'):
35 |              yield {         
36 |                       'Job Post' : response.css('div.opening.col-sm-12 > h1::text').extract_first(),
37 |                       'Location' : response.css('div.opening.col-sm-12 > h2::text').extract_first(),
38 |                       'Description' : "\n".join(response.css('div[id = JobDescription] > span[id = TrackingJobBody]').xpath(".//text()").extract()),
39 |                       'Description2' : "\n".join(response.css('div[id = JobDescription] > span[id = TrackingJobBody]::text').extract())
40 |                      }


--------------------------------------------------------------------------------
/web scraping/g1/spiders/example.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ashishkapil/Web-scraping-job-portal-sites/d9ea377a9f191dda9a461b2d55f8aa13faa4ca0e/web scraping/g1/spiders/example.pyc


--------------------------------------------------------------------------------
/web scraping/glassdoor data/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ashishkapil/Web-scraping-job-portal-sites/d9ea377a9f191dda9a461b2d55f8aa13faa4ca0e/web scraping/glassdoor data/__init__.pyc


--------------------------------------------------------------------------------
/web scraping/glassdoor data/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class G1Item(scrapy.Item):
12 |     """Scrapy item model; no fields are declared yet."""
13 |     # Declare fields here, e.g.: name = scrapy.Field()
14 |     pass
15 | 


--------------------------------------------------------------------------------
/web scraping/glassdoor data/middlewares.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your spider middleware
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
 7 | 
 8 | from scrapy import signals
 9 | 
10 | 
11 | class G1SpiderMiddleware(object):
12 |     # No-op spider middleware: the hooks below pass results and start
13 |     # requests through unchanged. Not all methods need to be defined; if one
14 |     # is missing, Scrapy acts as if it does not modify the passed objects.
15 | 
16 |     @classmethod
17 |     def from_crawler(cls, crawler):
18 |         # Build the middleware instance and subscribe it to spider_opened.
19 |         s = cls()
20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 |         return s
22 | 
23 |     def process_spider_input(self, response, spider):
24 |         # Called for each response that goes through the spider
25 |         # middleware and into the spider.
26 | 
27 |         # Should return None or raise an exception.
28 |         return None
29 | 
30 |     def process_spider_output(self, response, result, spider):
31 |         # Called with the results returned from the Spider, after
32 |         # it has processed the response.
33 | 
34 |         # Must return an iterable of Request, dict or Item objects.
35 |         for i in result:
36 |             yield i
37 | 
38 |     def process_spider_exception(self, response, exception, spider):
39 |         # Called when a spider or process_spider_input() method
40 |         # (from other spider middleware) raises an exception.
41 | 
42 |         # Should return either None or an iterable of Request, dict
43 |         # or Item objects. Here nothing is done, so None is returned.
44 |         pass
45 | 
46 |     def process_start_requests(self, start_requests, spider):
47 |         # Called with the start requests of the spider, and works
48 |         # similarly to the process_spider_output() method, except
49 |         # that it doesn't have a response associated.
50 | 
51 |         # Must return only requests (not items).
52 |         for r in start_requests:
53 |             yield r
54 | 
55 |     def spider_opened(self, spider):
56 |         spider.logger.info('Spider opened: %s' % spider.name)
57 | 


--------------------------------------------------------------------------------
/web scraping/glassdoor data/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | # No-op item pipeline: process_item returns each item unchanged.
 9 | class G1Pipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 | 


--------------------------------------------------------------------------------
/web scraping/glassdoor data/settings.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Scrapy settings for g1 project
  4 | #
  5 | # For simplicity, this file contains only settings considered important or
  6 | # commonly used. You can find more settings consulting the documentation:
  7 | #
  8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
  9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 11 | 
 12 | BOT_NAME = 'g1'
 13 | 
 14 | SPIDER_MODULES = ['g1.spiders']
 15 | NEWSPIDER_MODULE = 'g1.spiders'
 16 | 
 17 | # scrapy-splash integration. NOTE(review): SPLASH_URL is a hard-coded address; confirm it points at your Splash instance.
 18 | SPLASH_URL = 'http://192.168.99.100:8050'
 19 | DOWNLOADER_MIDDLEWARES = {
 20 |         'scrapy_splash.SplashCookiesMiddleware': 723,
 21 |         'scrapy_splash.SplashMiddleware': 725,
 22 |         'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
 23 |         }
 24 | SPIDER_MIDDLEWARES = {
 25 |         'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 
 26 |         }
 27 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
 28 | #HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
 29 | 
 30 | 
 31 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
 32 | #USER_AGENT = 'g1 (+http://www.yourdomain.com)'
 33 | 
 34 | # robots.txt rules are NOT obeyed here (set to True to respect them)
 35 | ROBOTSTXT_OBEY = False
 36 | 
 37 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
 38 | #CONCURRENT_REQUESTS = 32
 39 | 
 40 | # Configure a delay for requests for the same website (default: 0)
 41 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 42 | # See also autothrottle settings and docs
 43 | #DOWNLOAD_DELAY = 3
 44 | # The download delay setting will honor only one of:
 45 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 46 | #CONCURRENT_REQUESTS_PER_IP = 16
 47 | 
 48 | # Disable cookies (enabled by default)
 49 | #COOKIES_ENABLED = False
 50 | 
 51 | # Disable Telnet Console (enabled by default)
 52 | #TELNETCONSOLE_ENABLED = False
 53 | 
 54 | # Override the default request headers:
 55 | #DEFAULT_REQUEST_HEADERS = {
 56 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 57 | #   'Accept-Language': 'en',
 58 | #}
 59 | 
 60 | # Enable or disable spider middlewares
 61 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 62 | #SPIDER_MIDDLEWARES = {
 63 | #    'g1.middlewares.G1SpiderMiddleware': 543,
 64 | #}
 65 | 
 66 | # Enable or disable downloader middlewares
 67 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 68 | #DOWNLOADER_MIDDLEWARES = {
 69 | #    'g1.middlewares.MyCustomDownloaderMiddleware': 543,
 70 | #}
 71 | 
 72 | # Enable or disable extensions
 73 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
 74 | #EXTENSIONS = {
 75 | #    'scrapy.extensions.telnet.TelnetConsole': None,
 76 | #}
 77 | 
 78 | # Configure item pipelines
 79 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 80 | #ITEM_PIPELINES = {
 81 | #    'g1.pipelines.G1Pipeline': 300,
 82 | #}
 83 | 
 84 | # Enable and configure the AutoThrottle extension (disabled by default)
 85 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
 86 | #AUTOTHROTTLE_ENABLED = True
 87 | # The initial download delay
 88 | #AUTOTHROTTLE_START_DELAY = 5
 89 | # The maximum download delay to be set in case of high latencies
 90 | #AUTOTHROTTLE_MAX_DELAY = 60
 91 | # The average number of requests Scrapy should be sending in parallel to
 92 | # each remote server
 93 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 94 | # Enable showing throttling stats for every response received:
 95 | #AUTOTHROTTLE_DEBUG = False
 96 | 
 97 | # Enable and configure HTTP caching (disabled by default)
 98 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 99 | #HTTPCACHE_ENABLED = True
100 | #HTTPCACHE_EXPIRATION_SECS = 0
101 | #HTTPCACHE_DIR = 'httpcache'
102 | #HTTPCACHE_IGNORE_HTTP_CODES = []
103 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
104 | 


--------------------------------------------------------------------------------
/web scraping/glassdoor data/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ashishkapil/Web-scraping-job-portal-sites/d9ea377a9f191dda9a461b2d55f8aa13faa4ca0e/web scraping/glassdoor data/settings.pyc


--------------------------------------------------------------------------------
/web scraping/glassdoor data/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/web scraping/glassdoor data/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ashishkapil/Web-scraping-job-portal-sites/d9ea377a9f191dda9a461b2d55f8aa13faa4ca0e/web scraping/glassdoor data/spiders/__init__.pyc


--------------------------------------------------------------------------------
/web scraping/glassdoor data/spiders/example.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | from scrapy_splash import SplashRequest
 4 | main_url = "https://www.glassdoor.ca"
 5 | class ExampleSpider(scrapy.Spider):
 6 |     name = 'example'
 7 |     #allowed_domains = ['https://www.glassdoor.co.in/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11.htm']
 8 |     start_urls = ['https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11.htm']
 9 |     main_url = "https://www.glassdoor.ca"
10 |     #handling javascript pages
11 |     def start_requests(self):
12 |         for url in self.start_urls:
13 |             yield SplashRequest(url, self.parse,
14 |                endpoint='render.html',
15 |                 args={'wait': 0.5},
16 |             )
17 |     
18 |     def parse(self, response):
19 |         urls = response.css('li.jl > div > div.flexbox > div > a::attr(href)').extract_first()
20 |         urls = main_url + urls
21 |         self.log(urls)
22 |         for url in urls:
23 |             yield scrapy.Request(url = url, callback = self.parse_details)
24 |             
25 | 
26 |     def parse_details(self, response):
27 |        
28 |             if response.css('div[id = JobDescription] > span[id = TrackingJobBody] > ul'):
29 |               yield {         
30 |                       'Job Post' : response.css('div.opening.col-sm-12 > h1::text').extract_first(),
31 |                       'Location' : response.css('div.opening.col-sm-12 > h2::text').extract_first(),
32 |                       'Description' : "\n".join(response.css('div[id = JobDescription] > span[id = TrackingJobBody] > ul > li::text').extract())
33 |                      }
34 |             elif response.css('div[id = JobDescription] > span[id = TrackingJobBody]'):
35 |              yield {         
36 |                       'Job Post' : response.css('div.opening.col-sm-12 > h1::text').extract_first(),
37 |                       'Location' : response.css('div.opening.col-sm-12 > h2::text').extract_first(),
38 |                       'Description' : "\n".join(response.css('div[id = JobDescription] > span[id = TrackingJobBody]').xpath(".//text()").extract()),
39 |                       'Description2' : "\n".join(response.css('div[id = JobDescription] > span[id = TrackingJobBody]::text').extract())
40 |                      }


--------------------------------------------------------------------------------
/web scraping/glassdoor data/spiders/example.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ashishkapil/Web-scraping-job-portal-sites/d9ea377a9f191dda9a461b2d55f8aa13faa4ca0e/web scraping/glassdoor data/spiders/example.pyc


--------------------------------------------------------------------------------
/web scraping/out.csv:
--------------------------------------------------------------------------------
 1 | demo
 2 | ""
 3 | ""
 4 | demo
 5 | ""
 6 | ""
 7 | demo
 8 | ""
 9 | ""
10 | demo
11 | ""
12 | ""
13 | 


--------------------------------------------------------------------------------
/web scraping/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = g1.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = g1
12 | 


--------------------------------------------------------------------------------