├── data
│   ├── README.md
│   ├── Pie_authors.png
│   ├── Pie_comments.png
│   ├── Posts_Years.png
│   └── wc_post_tags.png
├── Schneier on Security
│   ├── archived.zip
│   ├── data
│   │   ├── README.md
│   │   └── sample_articles.csv
│   └── schneier.py
├── README.md
├── util.py
├── BoingBoing
│   ├── README.md
│   ├── boingboing_comments.py
│   └── boingboing_scraping.py
└── LICENSE

/data/README.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/data/Pie_authors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAVIND46016/Web-Scraping/HEAD/data/Pie_authors.png
--------------------------------------------------------------------------------
/data/Pie_comments.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAVIND46016/Web-Scraping/HEAD/data/Pie_comments.png
--------------------------------------------------------------------------------
/data/Posts_Years.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAVIND46016/Web-Scraping/HEAD/data/Posts_Years.png
--------------------------------------------------------------------------------
/data/wc_post_tags.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAVIND46016/Web-Scraping/HEAD/data/wc_post_tags.png
--------------------------------------------------------------------------------
/Schneier on Security/archived.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAVIND46016/Web-Scraping/HEAD/Schneier on Security/archived.zip
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Web-Scraping

### BoingBoing
The fetched posts are filtered by the tags ['facebook', 'social media'] (parameterized). In total, there are 631 posts
and 10,299 comments associated with them.
--------------------------------------------------------------------------------
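The totals quoted above come from the PostgreSQL tables populated by the BoingBoing scraper. A minimal sketch for re-deriving them, assuming the `BoingBoing` database and the `posts`/`comments` tables used by `BoingBoing/boingboing_scraping.py` already exist and are populated (this helper script is not part of the repository):

```python
"""Hypothetical helper to re-derive the post/comment totals (not part of the repo)."""
from util import connect_to_database_server

conn_obj = connect_to_database_server("BoingBoing")
if conn_obj == -1:
    raise SystemExit("Could not connect to PostgreSQL.")
conn, cur = conn_obj

cur.execute("SELECT COUNT(*) FROM posts;")
print("posts:", cur.fetchone()[0])      # 631 for the ['facebook', 'social media'] run

cur.execute("SELECT COUNT(*) FROM comments;")
print("comments:", cur.fetchone()[0])   # 10,299 for the same run

cur.close()
conn.close()
```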
/Schneier on Security/data/README.md:
--------------------------------------------------------------------------------
For the complete set of articles and comments up to 15 March 2020, refer to this link for the CSV files:

https://drive.google.com/drive/folders/1CCeREAh7C7Htjpa-iHoit2cSbYrS-rSA?usp=sharing
--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
"""
Utility function to connect to a PostgreSQL server.
"""
import psycopg2

HOST = "localhost"
USER = "postgres"
PASSWORD = "xxxxxxxxxxx"


def connect_to_database_server(dbname):
    """
    Connects to the PostgreSQL database 'dbname' and returns a
    [connection, cursor] pair, or -1 if the connection fails.
    """
    try:
        conn = psycopg2.connect(host=HOST, database=dbname, user=USER, password=PASSWORD)
        cur = conn.cursor()
        return [conn, cur]
    except psycopg2.OperationalError:
        return -1
--------------------------------------------------------------------------------
/BoingBoing/README.md:
--------------------------------------------------------------------------------

# BoingBoing - A Directory of Mostly Wonderful Things
### (https://boingboing.net/)

#### (i) Number of posts fetched per year, 2005-2018.
![alt text](https://github.com/CAVIND46016/Web-Scraping/blob/master/data/Posts_Years.png)

#### (ii) The top 5 authors based on the number of published posts.
The top 5 authors account for approximately 90% of all published posts.
![alt text](https://github.com/CAVIND46016/Web-Scraping/blob/master/data/Pie_authors.png)

#### (iii) The tags that appear most frequently in the filtered posts.
(Note: ['facebook', 'social media'] were excluded, as the posts were scraped exclusively for those tags.)
![alt text](https://github.com/CAVIND46016/Web-Scraping/blob/master/data/wc_post_tags.png)

#### (iv) The top 10 users based on the number of comments.
![alt text](https://github.com/CAVIND46016/Web-Scraping/blob/master/data/Pie_comments.png)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Cavin Dsouza

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
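The tag word cloud referenced in item (iii) of BoingBoing/README.md above (wc_post_tags.png) can be regenerated roughly as follows. This is a sketch under assumptions: the `wordcloud` and `matplotlib` packages are installed, the `posts` table has been populated by the scraper, and the styling of the published image is not reproduced exactly (this script is not part of the repository):

```python
"""Hypothetical sketch for regenerating the tag word cloud (not part of the repo)."""
from collections import Counter

import matplotlib.pyplot as plt
from wordcloud import WordCloud

from util import connect_to_database_server

conn_obj = connect_to_database_server("BoingBoing")
if conn_obj == -1:
    raise SystemExit("Could not connect to PostgreSQL.")
conn, cur = conn_obj

# Tags are stored as a comma-separated string per post (see boingboing_scraping.py).
cur.execute("SELECT tags FROM posts;")
counts = Counter()
for (tags,) in cur.fetchall():
    for tag in tags.split(","):
        tag = tag.strip()
        # Drop the tags the posts were filtered on, as noted in the README.
        if tag and tag not in ("facebook", "social media"):
            counts[tag] += 1

wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(counts)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.savefig("data/wc_post_tags.png", bbox_inches="tight")

cur.close()
conn.close()
```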
/Schneier on Security/data/sample_articles.csv:
--------------------------------------------------------------------------------
id,url,title,body,tags,posted_datetime
a000808,https://www.schneier.com/blog/archives/2006/04/announcing_movi.html,Announcing: Movie-Plot Threat Contest,"NOTE: If you have a blog, please spread the word. For a while now, I have been writing about our penchant for ""movie-plot threats"": terrorist fears based on very specific attack scenarios. Terrorists with crop dusters, terrorists exploding baby carriages in subways, terrorists filling school buses with explosives -- these are all movie-plot threats. They're good for scaring people, but it's just silly to build national security policy around them. But if we're going to worry about unlikely attacks, why can't they be exciting and innovative ones? If Americans are going to be scared, shouldn't they be scared of things that are really scary? ""Blowing up the Super Bowl"" is a movie plot to be sure, but it's not a very good movie. Let's kick this up a notch. It is in this spirit I announce the (possibly First) Movie-Plot Threat Contest. Entrants are invited to submit the most unlikely, yet still plausible, terrorist attack scenarios they can come up with. Your goal: cause terror. Make the American people notice. Inflict lasting damage on the U.S. economy. Change the political landscape, or the culture. The more grandiose the goal, the better. Assume an attacker profile on the order of 9/11: 20 to 30 unskilled people, and about $500,000 with which to buy skills, equipment, etc. Post your movie plots here on this blog. Judging will be by me, swayed by popular acclaim in the blog comments section. The prize will be an autographed copy of Beyond Fear. And if I can swing it, a phone call with a real live movie producer. Beyond Fear Entries close at the end of the month -- April 30 -- so Crypto-Gram readers can also play. Crypto-Gram This is not an April Fool's joke, although it's in the spirit of the season. The purpose of this contest is absurd humor, but I hope it also makes a point. Terrorism is a real threat, but we're not any safer through security measures that require us to correctly guess what the terrorists are going to do next. Good luck. EDITED TO ADD (4/4): There are hundreds of ideas here. EDITED TO ADD (4/22): Update here. Two clicks for more privacy: The Facebook Like button will be enabled once you click here. No data is loaded from Facebook until you enable the button. Click the [i] button for more information.not connected to FacebookTwo clicks for more privacy: The Tweet this button will be enabled once you click here. No data is loaded from Twitter until you enable the button. Click the [i] button for more information.not connected to TwitterTwo clicks for more privacy: The Google+ button will be enabled once you click here. No data is loaded from Google until you enable the button. Click the [i] button for more information.not connected to Google+If you click to activate the share buttons, data will be loaded from a third party (Facebook, Twitter, Google), allowing them to track your visit to schneier.com. For more details click the [i] button.SettingsPermanently enable share buttons:FacebookTwitterGoogle+ Permanently enable share buttons:FacebookTwitterGoogle+ Permanently enable share buttons:FacebookTwitterGoogle+ Permanently enable share buttons: Facebook Twitter Google+","contests, fear, movie-plot threat contests, movie-plot threats, terrorism",2006-04-01 09:35:00
--------------------------------------------------------------------------------
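A minimal sketch for loading this sample into a DataFrame, assuming `pandas` is installed; the column layout follows the header row above, and the script itself is not part of the repository:

```python
"""Hypothetical loader for the sample CSV (not part of the repo)."""
import pandas as pd

# The body field is quoted and may contain commas and doubled quotes,
# which pandas' default CSV dialect handles.
df = pd.read_csv(
    "Schneier on Security/data/sample_articles.csv",
    parse_dates=["posted_datetime"],
)

print(df[["id", "title", "tags", "posted_datetime"]].head())
```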
/BoingBoing/boingboing_comments.py:
--------------------------------------------------------------------------------
"""
Reference: https://stackoverflow.com/questions/41706274/beautifulsoup-returns-incomplete-html

The page is set up so that, by default, only approx. 19 tags on a BoingBoing comments
page are loaded at startup; the remaining tags are loaded once the page is scrolled down.
The Selenium webdriver is used to perform that scrolling programmatically.
This requires either geckodriver (https://github.com/mozilla/geckodriver/releases) for Firefox
or chromedriver (release 2.36 -->
https://chromedriver.storage.googleapis.com/index.html?path=2.36/).
Note: the driver executable must be in the same directory as this .py file.
"""
from datetime import datetime
import http
import re
import time
import math

from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By

SCREEN_HEIGHT_IN_PIXELS = 1080
COMMENTS_SCREEN_SIZE = 3
SCROLL_WAIT_TIME = 1

# Fixing the 'IncompleteRead' bug using http
# https://stackoverflow.com/questions/14149100/incompleteread-using-httplib
http.client.HTTPConnection._http_vsn = 10
http.client.HTTPConnection._http_vsn_str = "HTTP/1.0"


def fetch_comment_info(browser, url, postno, cur, delay=100):
    """
    Fetches user comments in 'url'.
    """

    # Keyed by 'postedby + date + text' to de-duplicate comments across scroll passes.
    comments = {}
    try:
        # Added timeout for the error: http.client.RemoteDisconnected:
        # Remote end closed connection without response
        browser.set_page_load_timeout(200)
        browser.get(url)
    except http.client.RemoteDisconnected:
        return comments

    WebDriverWait(browser, delay).until(
        ec.presence_of_element_located((By.CLASS_NAME, "container"))
    )

    soup = BeautifulSoup(browser.page_source, "html.parser")

    # Replies, Views, Users, Likes and Links
    topic_str = ["replies", "view", "user", "like", "link"]
    topic_map = [0] * len(topic_str)

    div_class_topicmap = soup.find("div", attrs={"class": "topic-map"})
    if div_class_topicmap:
        li_all = div_class_topicmap.find_all("li")
        for li_tag in li_all:
            li_text = li_tag.text.strip()
            span_class_number = li_tag.find("span")
            str_found = False
            for keyword in topic_str:
                if keyword in li_text:
                    str_found = True
                    break

            if str_found and span_class_number:
                # Counts such as "1.2k" are expanded to integers.
                if "k" in span_class_number.text:
                    if "." in span_class_number.text:
                        tmp = re.findall(r"\d+\.\d+", span_class_number.text)[0]
                    else:
                        tmp = re.findall(r"\d+", span_class_number.text)[0]

                    num = int(float(tmp) * 1000)
                else:
                    num = int(span_class_number.text)

                for i, _ in enumerate(topic_str):
                    if topic_str[i] in li_text:
                        topic_map[i] = num

    # Store Replies, Views, Users, Likes and Links against the post.
    tmp = 0
    query = (
        "UPDATE posts SET c_page_url = %s, replies = %s, views = %s, "
        "users = %s, likes = %s, links = %s WHERE postno = %s;"
    )
    if topic_map[0] >= 1:
        tmp = topic_map[0] - 1
    data = (url, tmp, topic_map[1], topic_map[2], topic_map[3], topic_map[4], postno)

    cur.execute(query, data)

    scrolls = math.ceil(topic_map[0] / COMMENTS_SCREEN_SIZE)

    for i in range(scrolls):
        soup = BeautifulSoup(browser.page_source, "html.parser")
        div_class_comment = soup.find_all(
            "div", attrs={"class": "topic-post clearfix regular"}
        ) + soup.find_all(
            "div", attrs={"class": "topic-post clearfix topic-owner group-editors regular"}
        )

        comm_no = 1
        for dc_comment in div_class_comment:
            div_class_user_card = dc_comment.find("div", attrs={"class": "names trigger-user-card"})
            if div_class_user_card:
                span_class_firstusername = dc_comment.find("span")
                if span_class_firstusername:
                    postedby = span_class_firstusername.find("a").text

                    post_date = dc_comment.find("div", attrs={"class": "post-info post-date"})
                    a_class_post_date = post_date.find("a", attrs={"class": "post-date"})
                    posteddate = a_class_post_date.find("span")["title"]
                    div_class_cooked = dc_comment.find("div", attrs={"class": "cooked"})
                    comm_text = div_class_cooked.text.strip().replace("\n", "").replace("\r", "")

                    dict_primary_key = postedby + " " + posteddate + " " + comm_text

                    if dict_primary_key not in comments:
                        comments[dict_primary_key] = {}
                        comments[dict_primary_key]["postedby"] = postedby
                        comments[dict_primary_key]["date"] = datetime.strptime(
                            posteddate, "%b %d, %Y %I:%M %p"
                        ).date()

                        comments[dict_primary_key]["comm_no"] = comm_no
                        comments[dict_primary_key]["comm_text"] = comm_text

                        div_class_actions = dc_comment.find("div", attrs={"class": "actions"})
                        comment_like_list = re.findall(r"\d+", div_class_actions.text.strip())

                        if comment_like_list:
                            comment_likes = int(comment_like_list[0])
                        else:
                            comment_likes = 0
                        comments[dict_primary_key]["likes"] = comment_likes

                    comm_no += 1

        browser.execute_script(
            "window.scrollTo({}, {});".format(
                i * SCREEN_HEIGHT_IN_PIXELS, (i + 1) * SCREEN_HEIGHT_IN_PIXELS
            )
        )
        time.sleep(SCROLL_WAIT_TIME)

    return comments
--------------------------------------------------------------------------------
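A minimal usage sketch for fetch_comment_info, under assumptions: a local PostgreSQL `BoingBoing` database whose `posts` table is already populated, geckodriver available for Firefox, and a placeholder topic URL standing in for the post's real "bbs" share link (this driver script is not part of the repository):

```python
"""Hypothetical standalone driver for fetch_comment_info (not part of the repo)."""
from selenium import webdriver

from util import connect_to_database_server
from boingboing_comments import fetch_comment_info

conn_obj = connect_to_database_server("BoingBoing")
if conn_obj == -1:
    raise SystemExit("Could not connect to PostgreSQL.")
conn, cur = conn_obj

browser = webdriver.Firefox()  # geckodriver must be next to the script or on PATH
try:
    # Placeholder topic URL; in the scraper this comes from the post's 'bbs' share link.
    comments = fetch_comment_info(browser, "https://bbs.boingboing.net/t/<topic>", postno=1, cur=cur)
    print("Fetched {} comments".format(len(comments)))
finally:
    browser.quit()
    conn.commit()
    cur.close()
    conn.close()
```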
/Schneier on Security/schneier.py:
--------------------------------------------------------------------------------
"""
Schneier on Security blog scraping.
"""
from datetime import datetime
from http.client import RemoteDisconnected
import re
import time

from bs4 import BeautifulSoup
import psycopg2
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait


HOST = "localhost"
DATABASE = "schneier"
USER = "postgres"
PASSWORD = "cavin"


def get_browser(headless=False, extensions=False, notifications=False, incognito=False):
    """
    Returns a Chrome webdriver configured with the given options.
    """
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")

    if not extensions:
        chrome_options.add_argument("--disable-extensions")

    if not notifications:
        chrome_options.add_argument("--disable-notifications")

    if incognito:
        chrome_options.add_argument("--incognito")

    driver = webdriver.Chrome(
        executable_path="C:\\Aptana Workspace\\chromedriver.exe", options=chrome_options
    )
    return driver


def main():
    """
    Walks the blog archives page by page and stores articles and comments.
    """
    conn = psycopg2.connect(host=HOST, database=DATABASE, user=USER, password=PASSWORD)
    cur = conn.cursor()
    driver = get_browser(headless=False, incognito=True)

    page_url = "https://www.schneier.com/"
    idx = 1

    while True:
        print(f"Processing page no. {idx}...")

        try:
            driver.set_page_load_timeout(200)
            driver.get(page_url)
        except TimeoutException:
            print(f"\t{page_url} - Timed out receiving message from renderer")
            continue
        except RemoteDisconnected:
            print(f"\tConnection closed by remote end for {page_url}.")
            continue

        WebDriverWait(driver, timeout=40).until(
            ec.presence_of_element_located((By.CLASS_NAME, "stepthrough"))
        )
        soup = BeautifulSoup(driver.page_source, "html.parser")

        earlier_entry = soup.find("div", attrs={"class": "stepthrough"}).find(
            "a", attrs={"class": "earlier"}
        )

        if not earlier_entry:
            break

        articles = soup.find("div", attrs={"id": "content"}).find_all("article")

        for article in articles:
            h2_tag = article.find("h2", attrs={"class": "entry"})
            id_ = h2_tag["id"]

            a_tag = h2_tag.find("a")
            url = a_tag["href"] if a_tag else None
            title = a_tag.text.strip() if a_tag else None

            # Match only <p>, <strong>, <i> and <ul> tags without class/id/type attributes.
            body_tags = article.find_all(
                re.compile(r"^(p|strong|i|ul)$"), attrs={"class": None, "id": None, "type": None}
            )
            body = " ".join([k.text.strip() for k in body_tags])

            entry_tag = article.find("p", attrs={"class": "entry-tags"})
            tag_arr = [k.text for k in entry_tag.find_all("a")] if entry_tag else [""]
            tags = ", ".join(tag_arr)

            posted_tag = article.find("p", attrs={"class": "posted"})
            date_obj = None
            if posted_tag:
                datetime_tag = posted_tag.find("a").text.strip()
                date_obj = datetime.strptime(datetime_tag, "Posted on %B %d, %Y at %I:%M %p")

            query = """
                INSERT INTO article(id, url, title, body, tags, posted_datetime)
                SELECT sub_query.* FROM
                (SELECT %s AS id, %s, %s, %s, %s, %s) sub_query
                LEFT JOIN article a ON sub_query.id = a.id
                WHERE a.id IS NULL;
            """

            data = (id_, url, title, body, tags, date_obj)
            cur.execute(query, data)

            comment_arr = [k["href"] for k in posted_tag.find_all("a")] if posted_tag else []
            if len(comment_arr) != 2:
                print(f"\tNo comments found for this article - {url}")
                continue

            print("\tProcessing comments...")
            comment_url = comment_arr[1]

            try:
                driver.set_page_load_timeout(200)
                driver.get(comment_url)
            except TimeoutException:
                print(f"\t{comment_url} - Timed out receiving message from renderer")
                continue
            except RemoteDisconnected:
                print(f"\tConnection closed by remote end for {comment_url}.")
                continue

            WebDriverWait(driver, timeout=40).until(
                ec.presence_of_element_located((By.CLASS_NAME, "subscribe-comments"))
            )
            soup = BeautifulSoup(driver.page_source, "html.parser")

            comment_tags = soup.find_all("article")[1:]

            for comment in comment_tags:
                cid = comment.find("div", attrs={"class": re.compile("comment by-")})["id"]

                comment_credit = comment.find("p", attrs={"class": "commentcredit"})
                commented_by = comment_credit.find("span").text.strip()

                comment_body_tags = comment.find_all(
                    re.compile(r"^(p|strong|i|ul)$"), attrs={"class": None, "id": None, "type": None}
                )
                comment_body = " ".join([k.text.strip() for k in comment_body_tags])

                posted_tag = comment_credit.find_all("a")[-1]
                date_obj = None
                if posted_tag:
                    datetime_tag = posted_tag.text.strip()
                    try:
                        date_obj = datetime.strptime(datetime_tag, "%B %d, %Y %I:%M %p")
                    except ValueError:
                        date_obj = None

                query = """
                    INSERT INTO comments(id, article_id, comment, commented_by, posted_datetime)
                    SELECT sub_query.* FROM
                    (SELECT %s AS id, %s, %s, %s, %s) sub_query
                    LEFT JOIN comments c ON sub_query.id = c.id
                    WHERE c.id IS NULL;
                """

                data = (cid, id_, comment_body, commented_by, date_obj)
                cur.execute(query, data)

        page_url = earlier_entry["href"]
        idx += 1
        time.sleep(3)

    driver.quit()
    conn.commit()
    cur.close()
    conn.close()

    print("DONE!!!")


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
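The scraper above assumes that `article` and `comments` tables already exist in the `schneier` database; the repository does not include the DDL. A plausible minimal schema, inferred from the INSERT statements (column types and keys are assumptions, not the author's actual schema):

```python
"""Hypothetical table setup for schneier.py (not part of the repo; types are guesses)."""
import psycopg2

conn = psycopg2.connect(host="localhost", database="schneier", user="postgres", password="cavin")
cur = conn.cursor()

cur.execute("""
    CREATE TABLE IF NOT EXISTS article (
        id              TEXT PRIMARY KEY,   -- e.g. 'a000808', taken from the <h2 class="entry"> id
        url             TEXT,
        title           TEXT,
        body            TEXT,
        tags            TEXT,               -- comma-separated tag list
        posted_datetime TIMESTAMP
    );
""")

cur.execute("""
    CREATE TABLE IF NOT EXISTS comments (
        id              TEXT PRIMARY KEY,   -- comment div id
        article_id      TEXT REFERENCES article(id),
        comment         TEXT,
        commented_by    TEXT,
        posted_datetime TIMESTAMP
    );
""")

conn.commit()
cur.close()
conn.close()
```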
/BoingBoing/boingboing_scraping.py:
--------------------------------------------------------------------------------
"""
BoingBoing web scraping.
"""
import urllib.request as urllib2
from datetime import datetime
import http
import sys
import time
import re

from bs4 import BeautifulSoup
from selenium import webdriver

from util import connect_to_database_server
from boingboing_comments import fetch_comment_info

# The default recursion limit is 1000; raise it so the recursive page crawl
# does not exceed the maximum recursion depth.
sys.setrecursionlimit(10000)

# BoingBoing - A directory of mostly wonderful things
BB_URL = "https://boingboing.net/grid/"

# PostgreSQL database name
DATABASE = "BoingBoing"

# Recursion breakpoint definition
START_CUTOFF_DATE = datetime.strptime("1/1/2004", "%m/%d/%Y").date()
END_CUTOFF_DATE = datetime.now().date()

if START_CUTOFF_DATE > END_CUTOFF_DATE:
    raise ValueError("Cutoff start date is greater than end date.")

if END_CUTOFF_DATE > datetime.now().date():
    raise ValueError("Cutoff end date is greater than current date.")

# posts filter
REQUIRED_TAGS = ["facebook", "social media"]

# Fixing the 'IncompleteRead' bug using http
# https://stackoverflow.com/questions/14149100/incompleteread-using-httplib
http.client.HTTPConnection._http_vsn = 10
http.client.HTTPConnection._http_vsn_str = "HTTP/1.0"

# Firefox browser object
BROWSER = webdriver.Firefox()


def extract_post_story(div_id_story):
    """
    Extracts the post text contents, stripping line breaks and whitespace.
    """

    before_keyword = "SHARE /"
    post_story = div_id_story.get_text().strip().replace("\n", " ").replace("\r", "")

    return post_story[: post_story.find(before_keyword)]


def scrape(web_url, conn, cur, i, pg_no):
    """
    Scrapes 'web_url' and inserts values into the PostgreSQL tables.
    """

    # Added timeout for the error: http.client.RemoteDisconnected:
    # Remote end closed connection without response.
    try:
        page = urllib2.urlopen(web_url, timeout=200)
    except http.client.RemoteDisconnected:
        print("Connection closed by remote end for {}.".format(web_url))
        return 0

    soup = BeautifulSoup(page, "html.parser")
    div_id_posts = soup.find("div", attrs={"id": "posts"})
    div_class_feature = div_id_posts.find_all("div", attrs={"class": "feature"})

    # If no features found on the page, return
    if len(div_class_feature) == 0:
        return 0

    # **************************************ARTICLES**************************************
    for feature in div_class_feature:
        a_class_headline = feature.find("a", attrs={"class": "headline"})
        try:
            post_page = urllib2.urlopen(a_class_headline["href"], timeout=200)
        except http.client.RemoteDisconnected:
            print("Connection closed by remote end for {}.".format(a_class_headline["href"]))
            continue

        post_soup = BeautifulSoup(post_page, "html.parser")

        div_class_share = post_soup.find("div", attrs={"class": "share"})
        # if no comments on the article, skip article
        if not div_class_share:
            continue

        try:
            date_str = re.findall(r"\d+/\d+/\d+", a_class_headline["href"])[0]
            posteddate = datetime.strptime(date_str, "%Y/%m/%d").date()
        except (IndexError, ValueError):
            # Skip posts whose URL does not contain a parsable date.
            print("Date format error for {}.".format(a_class_headline["href"]))
            continue

        # apply the date filter
        if posteddate < START_CUTOFF_DATE or posteddate > END_CUTOFF_DATE:
            return 0

        article_headline = a_class_headline.text.strip()

        div_id_story = post_soup.find("div", attrs={"id": "story"})
        if not div_id_story:
            div_id_story = post_soup.find("article", attrs={"id": "text"})
            if not div_id_story:
                div_id_story = post_soup.find("div", attrs={"id": "container"})

        post_txt = extract_post_story(div_id_story)

        h3_class_thetags = div_class_share.find("h3", attrs={"class": "thetags"})
        if not h3_class_thetags:
            post_tags = ""
        else:
            post_tags = ", ".join(
                [
                    x.lower()
                    for x in [tag.string.strip().replace("/", "") for tag in h3_class_thetags]
                    if x != ""
                ]
            )

        div_class_navbyline = post_soup.find("div", attrs={"class": "navbyline"})
        if not div_class_navbyline:
            div_class_navbyline = post_soup.find("header", attrs={"id": "bbheader"})

        span_class_author = div_class_navbyline.find("span", attrs={"class": "author"})

        # Apply the 'REQUIRED_TAGS' filter
        is_ok = False
        if REQUIRED_TAGS:
            for elem in REQUIRED_TAGS:
                if elem in post_tags or elem in article_headline.lower():
                    is_ok = True
                    break
        else:
            is_ok = True

        if not is_ok:
            continue

        query = (
            "INSERT INTO posts(postno, a_page_url, headline, text, tags, author, posteddate) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s);"
        )
        data = (
            i,
            a_class_headline["href"],
            article_headline,
            post_txt,
            post_tags,
            [x.string for x in span_class_author.find("a")][0],
            posteddate,
        )

        cur.execute(query, data)
        print("FOUND POST: {}, {}".format(i, article_headline))

        # **************************************COMMENTS**************************************
        a_class_bbs = div_class_share.find("a", attrs={"class": "bbs"})
        comments = fetch_comment_info(BROWSER, a_class_bbs["href"], i, cur)

        for _, value in comments.items():
            if value["comm_text"] != "":
                cquery = (
                    "INSERT INTO comments(commentno, postno, comments, postedby, likes, posteddate) "
                    "VALUES (%s, %s, %s, %s, %s, %s);"
                )
                cdata = (
                    value["comm_no"],
                    i,
                    value["comm_text"],
                    value["postedby"],
                    value["likes"],
                    value["date"],
                )
                cur.execute(cquery, cdata)
        i += 1

    # Construct next page url.
    print("Page no: {} - {}".format(pg_no, posteddate))
    pg_no += 1
    next_page_url = BB_URL + "page/{}/".format(pg_no)

    # recursive logic
    scrape(next_page_url, conn, cur, i, pg_no)


def main():
    """
    Entry point for the script.
    """
    start_time = time.time()
    conn_obj = connect_to_database_server(DATABASE)

    if conn_obj == -1:
        print("Connection to PostgreSQL Database: {} failed.".format(DATABASE))
        sys.exit(0)
    else:
        conn = conn_obj[0]
        cur = conn_obj[1]

        scrape(BB_URL, conn, cur, i=1, pg_no=1)

        conn.commit()
        cur.close()
        conn.close()

        print("Webdata scraped successfully in {} seconds.".format(time.time() - start_time))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
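As with the Schneier scraper, the BoingBoing scripts assume pre-existing `posts` and `comments` tables. A plausible minimal schema, inferred from the INSERT and UPDATE statements in boingboing_scraping.py and boingboing_comments.py (column types and keys are assumptions, not the author's actual schema):

```python
"""Hypothetical table setup for the BoingBoing scrapers (not part of the repo; types are guesses)."""
from util import connect_to_database_server

conn_obj = connect_to_database_server("BoingBoing")
if conn_obj == -1:
    raise SystemExit("Could not connect to PostgreSQL.")
conn, cur = conn_obj

cur.execute("""
    CREATE TABLE IF NOT EXISTS posts (
        postno      INTEGER PRIMARY KEY,  -- running counter assigned by scrape()
        a_page_url  TEXT,
        headline    TEXT,
        text        TEXT,
        tags        TEXT,                 -- comma-separated, lower-cased
        author      TEXT,
        posteddate  DATE,
        c_page_url  TEXT,                 -- filled in by fetch_comment_info()
        replies     INTEGER,
        views       INTEGER,
        users       INTEGER,
        likes       INTEGER,
        links       INTEGER
    );
""")

cur.execute("""
    CREATE TABLE IF NOT EXISTS comments (
        commentno   INTEGER,
        postno      INTEGER REFERENCES posts(postno),
        comments    TEXT,
        postedby    TEXT,
        likes       INTEGER,
        posteddate  DATE
    );
""")

conn.commit()
cur.close()
conn.close()
```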