├── .github
│   └── ISSUE_TEMPLATE
│       ├── bug_report.md
│       ├── custom.md
│       └── feature_request.md
├── .gitignore
├── README.md
├── app.py
├── config.py
├── db_config.py
├── fbmarket
│   ├── __init__.py
│   ├── chromedriver.exe
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── fmarket.py
├── fbwebpage1.html
├── helper
│   ├── __init__.py
│   └── functions.py
├── scrapy.cfg
├── static
│   └── css
│       └── main.css
└── templates
    ├── 404.html
    ├── base.html
    ├── data_list.html
    ├── item_detail.html
    ├── item_list.html
    └── main.html

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**
 - OS: [e.g. iOS]
 - Browser [e.g. chrome, safari]
 - Version [e.g. 22]

**Smartphone (please complete the following information):**
 - Device: [e.g. iPhone6]
 - OS: [e.g. iOS8.1]
 - Browser [e.g. stock browser, safari]
 - Version [e.g. 22]

**Additional context**
Add any other context about the problem here.

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/custom.md:
--------------------------------------------------------------------------------
---
name: Custom issue template
about: Describe this issue template's purpose here.
title: ''
labels: ''
assignees: ''

---


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
venv/*
*.csv
*.xlsx
*.json
.idea/*
*.jpg
*.png
*.yml
*.pyc
scrapy/project.egg-info/*
scrapy/build/*

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Facebook Marketplace Scraper
Scrapes Facebook Marketplace listings for a given search term, location, category and search radius, stores them in a MySQL database, and serves the results through a small Flask UI.
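
## How it works
- `app.py` is the Flask front end. Posting the search form to `/item_list` calls `helper/functions.run_spider()`, which shells out to `scrapy crawl fmarket`; the page then re-reads the `fb_item` table and renders the results.
- `fbmarket/spiders/fmarket.py` builds a mobile Marketplace URL from the location, category, search term and radius.
- `fbmarket/middlewares.py` fetches each page with headless Chrome (Selenium) and scrolls it a couple of times so more listings load.
- `fbmarket/pipelines.py` inserts every scraped item into the `fb_item` table of the `fbmarketdb` database.

Assumed dependencies (the repo ships no requirements file): Flask, flask-mysqldb, Scrapy, scrapy-rotating-proxies, Selenium, PyMySQL, a local MySQL server, and a `chromedriver.exe` matching your Chrome version.

No database schema ships with the repo either. A minimal bootstrap consistent with the pipeline's INSERT might look like this (table and column names come from the code; the column types and the `id` column are assumptions):

```python
import pymysql

# Create the database and the fb_item table the Scrapy pipeline writes to.
conn = pymysql.connect(host='localhost', user='root', password='')
with conn.cursor() as cur:
    cur.execute("CREATE DATABASE IF NOT EXISTS fbmarketdb")
    cur.execute("""
        CREATE TABLE IF NOT EXISTS fbmarketdb.fb_item (
            id INT AUTO_INCREMENT PRIMARY KEY,
            item_name VARCHAR(255),
            item_price VARCHAR(64),
            item_category VARCHAR(128),
            item_location VARCHAR(128),
            item_search_term VARCHAR(255),
            item_img TEXT,
            item_url TEXT
        )
    """)
conn.commit()
conn.close()
```

Start the UI with `python app.py` and open http://127.0.0.1:5000/.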

--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
from flask import Flask, render_template, redirect, request, url_for
from flask_mysqldb import MySQL
from helper import functions

app = Flask(__name__)

app.config['MYSQL_HOST'] = 'localhost'
app.config['MYSQL_USER'] = 'root'
app.config['MYSQL_PASSWORD'] = ''
app.config['MYSQL_DB'] = 'fbmarketdb'

mysql = MySQL(app)


@app.route('/', methods=['GET', 'POST'])
def index():
    return render_template('main.html')


@app.route('/list', methods=['GET', 'POST'])
def archive_list():
    return render_template('data_list.html')


@app.route('/item_list', methods=['GET', 'POST'])
def item_list():
    # form_data = functions.get_form_data()
    fb_items = get_all_fb_items()
    # functions.run_spider(form_data)
    if request.method == 'POST':
        form_data = functions.get_form_data()
        functions.run_spider(form_data)
        return redirect(url_for('item_list'))
    else:
        return render_template('item_list.html', data=fb_items)


@app.errorhandler(404)
def page_not_found(e):
    return render_template('404.html'), 404


def get_all_fb_items():
    cur = mysql.connection.cursor()
    cur.execute("SELECT * FROM fb_item")
    fb_data = cur.fetchall()
    cur.close()
    return fb_data


if __name__ == '__main__':
    app.run(debug=True)

--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
import os


def project_dir():
    filepath = os.path.abspath(__file__)
    main_dir = os.path.dirname(filepath)
    return main_dir

--------------------------------------------------------------------------------
/db_config.py:
--------------------------------------------------------------------------------
from flask import Flask
from flask_mysqldb import MySQL

app = Flask(__name__)

app.config['MYSQL_HOST'] = 'localhost'
app.config['MYSQL_USER'] = 'root'
app.config['MYSQL_PASSWORD'] = ''
app.config['MYSQL_DB'] = 'fbmarketdb'

mysql = MySQL(app)


def get_all_fb_items():
    cur = mysql.connection.cursor()
    cur.execute("SELECT * FROM fb_item")
    fb_data = cur.fetchall()
    cur.close()
    return fb_data
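
# Note (an assumption about flask_mysqldb, not documented in this repo):
# mysql.connection is only available inside a Flask application context, so
# calling get_all_fb_items() from a standalone script would need something like:
#
#   from db_config import app, get_all_fb_items
#   with app.app_context():
#       rows = get_all_fb_items()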

--------------------------------------------------------------------------------
/fbmarket/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mdubey07/FacebookMarketplaceScraper/ab6a7f3a2f2d29f1a8ba449fc5834450f66df1fe/fbmarket/__init__.py

--------------------------------------------------------------------------------
/fbmarket/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mdubey07/FacebookMarketplaceScraper/ab6a7f3a2f2d29f1a8ba449fc5834450f66df1fe/fbmarket/chromedriver.exe

--------------------------------------------------------------------------------
/fbmarket/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class FbmarketItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    price = scrapy.Field()
    category = scrapy.Field()
    location = scrapy.Field()
    search_term = scrapy.Field()
    img_url = scrapy.Field()
    item_url = scrapy.Field()
    # current_date = scrapy.Field()
    # slot_number = scrapy.Field()
    # pass

--------------------------------------------------------------------------------
/fbmarket/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=720x600')
options.add_argument("--log-level=3")
driver = webdriver.Chrome("G:/Projects/MyPython/flaskapp/fbmarket/chromedriver.exe", chrome_options=options)


# for mobile emulation
# mobile_emulation = {"deviceName": "Nexus 5"}
# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('headless')
# chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)

# driver = webdriver.Chrome("G:/Projects/MyPython/mfbmarketplace/mfbmarketplace/fbmarket/chromedriver.exe", desired_capabilities=chrome_options.to_capabilities())
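
# A more portable alternative (a sketch, not the original setup): since
# chromedriver.exe ships inside the fbmarket/ package, the path could be
# derived from the project root via the top-level config.py instead of being
# hard-coded to one machine:
#
# import os
# import config
# driver = webdriver.Chrome(
#     os.path.join(config.project_dir(), 'fbmarket', 'chromedriver.exe'),
#     chrome_options=options)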


class FbmarketSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class FbmarketDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        driver.get(request.url)
        SCROLL_PAUSE_TIME = 25
        scroll_times = 0

        # Get scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")

        while True:
            # Scroll down to bottom
            scroll_times = scroll_times + 1
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Wait to load page
            time.sleep(SCROLL_PAUSE_TIME)

            # Calculate new scroll height and compare with last scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height or scroll_times >= 2:
                break
            last_height = new_height

        body = driver.page_source
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

--------------------------------------------------------------------------------
/fbmarket/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class FbmarketPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', user='root', password='', database='fbmarketdb')
        self.cursor = self.conn.cursor()
        # self.conn.autocommit(True)

    def process_item(self, item, spider):
        self.insert_item(item)
        return item

    def insert_item(self, item):
        sql = "INSERT INTO fb_item (item_name, item_price, " \
              "item_category, item_location, item_search_term, item_img, item_url) " \
              "VALUES (%s, %s, %s, %s, %s, %s, %s)"
        self.cursor.execute(sql, (
            item['name'],
            item['price'],
            item['category'],
            item['location'],
            item['search_term'],
            item['img_url'],
            item['item_url'],
        ))
        self.conn.commit()
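
    # The connection opened in __init__ is never released. A close_spider()
    # hook would tidy that up when the crawl ends — a sketch, not part of the
    # original pipeline:
    #
    # def close_spider(self, spider):
    #     self.cursor.close()
    #     self.conn.close()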

--------------------------------------------------------------------------------
/fbmarket/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for fbmarket project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'fbmarket'

SPIDER_MODULES = ['fbmarket.spiders']
NEWSPIDER_MODULE = 'fbmarket.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 8

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 8

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'fbmarket.middlewares.FbmarketSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'fbmarket.middlewares.FbmarketDownloaderMiddleware': 543,
    'rotating_proxies.middlewares.RotatingProxyMiddleware': 610,
    'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'fbmarket.pipelines.FbmarketPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
ROTATING_PROXY_LIST = [
    # 'http://user797:scrp1ng@107.175.247.7:80',
    # 'http://user797:scrp1ng@107.175.90.224:80',
    'http://user797:scrp1ng@23.95.239.51:80',
    # 'http://user797:scrp1ng@23.95.224.231:80',
    # 'http://user797:scrp1ng@23.95.219.228:80',

    # 'http://user797:scrp1ng@45.13.230.67:80',
    # 'http://user797:scrp1ng@45.13.230.17:80',
    # 'http://user797:scrp1ng@176.119.24.133:80',
    # 'http://user797:scrp1ng@176.119.24.50:80',
    # 'http://user797:scrp1ng@176.119.24.191:80',
    # 'http://username:pass@IP:port'
]

--------------------------------------------------------------------------------
/fbmarket/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/fbmarket/spiders/fmarket.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from ..items import FbmarketItem
from urllib.parse import urlencode
from urllib.parse import quote
from helper import functions


class FmarketSpider(scrapy.Spider):
    name = 'fmarket'
    allowed_domains = ['facebook.com']
    start_urls = ['https://m.facebook.com/marketplace']

    location = 'delhi'
    search_query = 'new girls jacket'
    radius_km = 1
    category = ''

    form_data = ''

    # query_parameter = {'query': search_query, 'radius_in_km': radius_km}

    def start_requests(self):
        # f_data is passed by helper.functions.run_spider() as a single
        # comma-separated string: location,category,search term,radius_in_km.
        # Fall back to the class-level defaults when it is not supplied.
        if getattr(self, 'f_data', None):
            self.form_data = self.f_data.split(',')
            self.location = self.form_data[0]
            self.category = self.form_data[1]
            self.search_query = self.form_data[2]
            self.radius_km = self.form_data[3]

        query_parameter = {'query': self.search_query, 'radius_in_km': self.radius_km}
        # ab_url = self.absolute_url(self.location, self.category, self.queryParameter)
        ab_url = self.absolute_url(self.location, self.category, query_parameter)
        print(self.form_data)

        yield scrapy.Request(url=ab_url, callback=self.parse)

    def parse(self, response):
        # print(response.body)
        # filename = response.url.split("/")[-1] + '.html'
        filename = 'fbwebpage1' + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)

        for product in response.css('div._a5o'):
            # build a fresh item for every listing
            items = FbmarketItem()

            product_url = product.css("a::attr(href)").get()
            product_name = product.css('div:last-child::text').extract()
            product_price = product.css('a div:last-child span::text').extract()

            if product_price:
                product_price = self.rm_whitespace(product_price)

            if product_name:
                product_name = self.rm_whitespace(product_name)

            if product_url:
                product_url = response.urljoin(product_url)
                product_url = product_url.split('?')[0]
                product_url = product_url.replace('m.', '')

            if self.category:
                search_category = self.category.strip()
            else:
                search_category = 'NA'

            if self.location:
                search_loc = self.location.strip()
            else:
                search_loc = 'NA'

            if self.search_query:
                search_term = self.search_query.strip()
            else:
                search_term = 'NA'
            # skip incomplete listings and anything that is not an /item/ link
            if not product_price or not product_url or not product_name:
                continue

            if '/item/' not in product_url:
                continue

            # placeholder thumbnail; real listing images are not extracted yet
            item_tem_img = 'https://5.imimg.com/data5/PJ/DI/MY-3877854/round-neck-plain-tshirt-with-multi-color-design-500x500.png'

            items['name'] = product_name
            items['price'] = product_price
            items['category'] = search_category
            items['location'] = search_loc
            items['search_term'] = search_term
            items['img_url'] = item_tem_img
            items['item_url'] = product_url
            # items['current_date'] = product_name
            # items['slot_number'] = product_name

            yield items

    @staticmethod
    def rm_whitespace(query_term):
        if query_term:
            cleaned = [part.replace('\n', '') for part in query_term]
            cleaned = [part.strip() for part in cleaned]
            cleaned = filter(None, cleaned)
            return ' '.join(cleaned)
            # query_term = query_term.encode('ascii', 'xmlcharrefreplace').decode('utf8')
        return query_term

    def absolute_url(self, location, category, query_parameter):
        url = self.start_urls[0]
        fb_query = urlencode(query_parameter)
        if location:
            url = url + '/' + quote(location)
        if category:
            url = url + '/' + quote(category)
        url = url + '/?' + fb_query
        return url
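
# Example invocation (values are illustrative): helper.functions.run_spider()
# launches this spider with a command equivalent to
#
#   scrapy crawl fmarket -a f_data=delhi,clothing,new girls jacket,5
#
# which makes absolute_url() build a request like
#   https://m.facebook.com/marketplace/delhi/clothing/?query=new+girls+jacket&radius_in_km=5
#
# Note that a comma inside the search term would break the split() above.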

--------------------------------------------------------------------------------
/helper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mdubey07/FacebookMarketplaceScraper/ab6a7f3a2f2d29f1a8ba449fc5834450f66df1fe/helper/__init__.py

--------------------------------------------------------------------------------
/helper/functions.py:
--------------------------------------------------------------------------------
from flask import Flask, request
import csv
import json
import os
import subprocess
import config

app = Flask(__name__)


def get_table_data_by_csv():
    file_path = 'static/output/data2.csv'
    table_data = []
    with open(file_path, 'rt', encoding="utf8") as f:
        data = csv.reader(f)
        headers = next(data, None)
        garbage = next(data, None)
        for row in data:
            table_data.append(row)
    return table_data


def test():
    filepath = os.path.join(config.project_dir(), 'static\\output\\' 'data2.csv')


def run_spider(form_data):
    spider_name = "fmarket"
    cat = form_data['cat']
    location = form_data['location']
    print(location + cat)
    # filepath = os.path.join(config.project_dir(), 'static\\output\\' 'data2.json')
    subprocess.call(
        ['scrapy', 'crawl', spider_name, '-a', 'f_data='+form_data['location'] + ',' + form_data['cat'] + ',' +
         form_data['search_term']+','+form_data['radius']])

    # subprocess.check_output(['scrapy', 'crawl', spider_name, "-o", 'xyz1.csv'])


def get_table_data_by_json():
    # file_path = 'static/output/abc.json'
    filepath = os.path.join(config.project_dir(), 'static\\output\\' 'data2.json')
    table_data = []
    with open(filepath, 'rt', encoding="utf8") as f:
        data = json.load(f)  # the Scrapy JSON export is a list of items
        for row in data:
            table_data.append(row)
    return table_data


def get_form_data():
    cat = request.form['category']
    location = request.form['loc']
    search_term = request.form['skey']
    radius_in_km = request.form['radius']

    form_data = {
        'cat': cat,
        'location': location,
        'search_term': search_term,
        'radius': radius_in_km,
        'result': 3
    }
    return form_data

# test()
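
# The CSV/JSON helpers above read exports from static/output/, a location that
# .gitignore excludes (*.csv, *.json), so those files have to be produced
# locally first, e.g. with Scrapy's feed export:
#
#   scrapy crawl fmarket -o static/output/data2.csv
#
# get_form_data() expects the search form to post the fields
# 'category', 'loc', 'skey' and 'radius'.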

--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = fbmarket.settings

[deploy]
#url = http://localhost:6800/
project = fbmarket

--------------------------------------------------------------------------------
/static/css/main.css:
--------------------------------------------------------------------------------
.small-img{
    width: 100px;
    height: auto;
}

--------------------------------------------------------------------------------
/templates/404.html:
--------------------------------------------------------------------------------
{% extends 'base.html' %}
{% block title %}
404 Page Not Found
{% endblock %}
{% block content %}

[The remaining template markup — the rest of 404.html, plus base.html, data_list.html, item_detail.html and main.html — was not captured in this dump. From item_list.html only the rendered text of the results table survives:]

{# Showing result for: {{ form_data }} #}

| S.No | Product Name | Price | Category | Location | Search Term | Image | Url | Date | View Detail |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| {{ row[0] }} | {{ row[1] }} | {{ row[2] }} | {{ row[3] }} | {{ row[4] }} | {{ row[5] }} | (item image) | Item Url | 07/03/2020 | View more |