├── data
│   ├── README.md
│   ├── Pie_authors.png
│   ├── Pie_comments.png
│   ├── Posts_Years.png
│   └── wc_post_tags.png
├── Schneier on Security
│   ├── archived.zip
│   ├── data
│   │   ├── README.md
│   │   └── sample_articles.csv
│   └── schneier.py
├── README.md
├── util.py
├── BoingBoing
│   ├── README.md
│   ├── boingboing_comments.py
│   └── boingboing_scraping.py
└── LICENSE

/data/README.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/data/Pie_authors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAVIND46016/Web-Scraping/HEAD/data/Pie_authors.png
--------------------------------------------------------------------------------
/data/Pie_comments.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAVIND46016/Web-Scraping/HEAD/data/Pie_comments.png
--------------------------------------------------------------------------------
/data/Posts_Years.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAVIND46016/Web-Scraping/HEAD/data/Posts_Years.png
--------------------------------------------------------------------------------
/data/wc_post_tags.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAVIND46016/Web-Scraping/HEAD/data/wc_post_tags.png
--------------------------------------------------------------------------------
/Schneier on Security/archived.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAVIND46016/Web-Scraping/HEAD/Schneier on Security/archived.zip
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Web-Scraping

### BoingBoing
The fetched posts are filtered by the tags ['facebook', 'social media'] (parameterized). In total, there are 631 posts
and 10,299 comments associated with them.
--------------------------------------------------------------------------------
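The totals quoted above come from the PostgreSQL tables populated by the BoingBoing scraper. A minimal sketch for re-deriving them, assuming the `BoingBoing` database and the `posts`/`comments` tables used by `BoingBoing/boingboing_scraping.py` already exist and are populated (this helper script is not part of the repository):

```python
"""Hypothetical helper to re-derive the post/comment totals (not part of the repo)."""
from util import connect_to_database_server

conn_obj = connect_to_database_server("BoingBoing")
if conn_obj == -1:
    raise SystemExit("Could not connect to PostgreSQL.")
conn, cur = conn_obj

cur.execute("SELECT COUNT(*) FROM posts;")
print("posts:", cur.fetchone()[0])      # 631 for the ['facebook', 'social media'] run

cur.execute("SELECT COUNT(*) FROM comments;")
print("comments:", cur.fetchone()[0])   # 10,299 for the same run

cur.close()
conn.close()
```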
/Schneier on Security/data/README.md:
--------------------------------------------------------------------------------
For the complete set of articles and comments up to 15 March 2020, refer to this link for the CSV files:

https://drive.google.com/drive/folders/1CCeREAh7C7Htjpa-iHoit2cSbYrS-rSA?usp=sharing
--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
"""
Utility function to connect to a PostgreSQL server.
"""
import psycopg2

HOST = "localhost"
USER = "postgres"
PASSWORD = "xxxxxxxxxxx"


def connect_to_database_server(dbname):
    """
    Connects to the PostgreSQL database 'dbname' and returns a
    [connection, cursor] pair, or -1 if the connection fails.
    """
    try:
        conn = psycopg2.connect(host=HOST, database=dbname, user=USER, password=PASSWORD)
        cur = conn.cursor()
        return [conn, cur]
    except psycopg2.OperationalError:
        return -1
--------------------------------------------------------------------------------
/BoingBoing/README.md:
--------------------------------------------------------------------------------

# BoingBoing - A Directory of Mostly Wonderful Things
### (https://boingboing.net/)

#### (i) Number of posts fetched per year, 2005-2018.
![alt text](https://github.com/CAVIND46016/Web-Scraping/blob/master/data/Posts_Years.png)

#### (ii) The top 5 authors based on the number of published posts.
The top 5 authors account for approximately 90% of all published posts.
![alt text](https://github.com/CAVIND46016/Web-Scraping/blob/master/data/Pie_authors.png)

#### (iii) The tags that appear most frequently in the filtered posts.
(Note: ['facebook', 'social media'] were excluded, as the posts were scraped exclusively for those tags.)
![alt text](https://github.com/CAVIND46016/Web-Scraping/blob/master/data/wc_post_tags.png)

#### (iv) The top 10 users based on the number of comments.
![alt text](https://github.com/CAVIND46016/Web-Scraping/blob/master/data/Pie_comments.png)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Cavin Dsouza

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
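The tag word cloud referenced in item (iii) of BoingBoing/README.md above (wc_post_tags.png) can be regenerated roughly as follows. This is a sketch under assumptions: the `wordcloud` and `matplotlib` packages are installed, the `posts` table has been populated by the scraper, and the styling of the published image is not reproduced exactly (this script is not part of the repository):

```python
"""Hypothetical sketch for regenerating the tag word cloud (not part of the repo)."""
from collections import Counter

import matplotlib.pyplot as plt
from wordcloud import WordCloud

from util import connect_to_database_server

conn_obj = connect_to_database_server("BoingBoing")
if conn_obj == -1:
    raise SystemExit("Could not connect to PostgreSQL.")
conn, cur = conn_obj

# Tags are stored as a comma-separated string per post (see boingboing_scraping.py).
cur.execute("SELECT tags FROM posts;")
counts = Counter()
for (tags,) in cur.fetchall():
    for tag in tags.split(","):
        tag = tag.strip()
        # Drop the tags the posts were filtered on, as noted in the README.
        if tag and tag not in ("facebook", "social media"):
            counts[tag] += 1

wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(counts)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.savefig("data/wc_post_tags.png", bbox_inches="tight")

cur.close()
conn.close()
```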
/Schneier on Security/data/sample_articles.csv:
--------------------------------------------------------------------------------
id,url,title,body,tags,posted_datetime
a000808,https://www.schneier.com/blog/archives/2006/04/announcing_movi.html,Announcing: Movie-Plot Threat Contest,"NOTE: If you have a blog, please spread the word. For a while now, I have been writing about our penchant for ""movie-plot threats"": terrorist fears based on very specific attack scenarios. Terrorists with crop dusters, terrorists exploding baby carriages in subways, terrorists filling school buses with explosives -- these are all movie-plot threats. They're good for scaring people, but it's just silly to build national security policy around them. But if we're going to worry about unlikely attacks, why can't they be exciting and innovative ones? If Americans are going to be scared, shouldn't they be scared of things that are really scary? ""Blowing up the Super Bowl"" is a movie plot to be sure, but it's not a very good movie. Let's kick this up a notch. It is in this spirit I announce the (possibly First) Movie-Plot Threat Contest. Entrants are invited to submit the most unlikely, yet still plausible, terrorist attack scenarios they can come up with. Your goal: cause terror. Make the American people notice. Inflict lasting damage on the U.S. economy. Change the political landscape, or the culture. The more grandiose the goal, the better. Assume an attacker profile on the order of 9/11: 20 to 30 unskilled people, and about $500,000 with which to buy skills, equipment, etc. Post your movie plots here on this blog. Judging will be by me, swayed by popular acclaim in the blog comments section. The prize will be an autographed copy of Beyond Fear. And if I can swing it, a phone call with a real live movie producer. Beyond Fear Entries close at the end of the month -- April 30 -- so Crypto-Gram readers can also play. Crypto-Gram This is not an April Fool's joke, although it's in the spirit of the season. The purpose of this contest is absurd humor, but I hope it also makes a point. Terrorism is a real threat, but we're not any safer through security measures that require us to correctly guess what the terrorists are going to do next. Good luck. EDITED TO ADD (4/4): There are hundreds of ideas here. EDITED TO ADD (4/22): Update here. Two clicks for more privacy: The Facebook Like button will be enabled once you click here. No data is loaded from Facebook until you enable the button. Click the [i] button for more information.not connected to FacebookTwo clicks for more privacy: The Tweet this button will be enabled once you click here. No data is loaded from Twitter until you enable the button. Click the [i] button for more information.not connected to TwitterTwo clicks for more privacy: The Google+ button will be enabled once you click here. No data is loaded from Google until you enable the button. Click the [i] button for more information.not connected to Google+If you click to activate the share buttons, data will be loaded from a third party (Facebook, Twitter, Google), allowing them to track your visit to schneier.com. For more details click the [i] button.SettingsPermanently enable share buttons:FacebookTwitterGoogle+ Permanently enable share buttons:FacebookTwitterGoogle+ Permanently enable share buttons:FacebookTwitterGoogle+ Permanently enable share buttons: Facebook Twitter Google+","contests, fear, movie-plot threat contests, movie-plot threats, terrorism",2006-04-01 09:35:00
--------------------------------------------------------------------------------
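A minimal sketch for loading this sample into a DataFrame, assuming `pandas` is installed; the column layout follows the header row above, and the script itself is not part of the repository:

```python
"""Hypothetical loader for the sample CSV (not part of the repo)."""
import pandas as pd

# The body field is quoted and may contain commas and doubled quotes,
# which pandas' default CSV dialect handles.
df = pd.read_csv(
    "Schneier on Security/data/sample_articles.csv",
    parse_dates=["posted_datetime"],
)

print(df[["id", "title", "tags", "posted_datetime"]].head())
```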
/BoingBoing/boingboing_comments.py:
--------------------------------------------------------------------------------
"""
Reference: https://stackoverflow.com/questions/41706274/beautifulsoup-returns-incomplete-html

The page is set up so that, by default, only approx. 19 tags on a BoingBoing comments
page are loaded at startup; the remaining tags are loaded once the page is scrolled down.
The Selenium webdriver is used to perform that scrolling programmatically.
This requires either geckodriver (https://github.com/mozilla/geckodriver/releases) for Firefox
or chromedriver (release 2.36 -->
https://chromedriver.storage.googleapis.com/index.html?path=2.36/).
Note: the driver executable must be in the same directory as this .py file.
"""
from datetime import datetime
import http
import re
import time
import math

from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By

SCREEN_HEIGHT_IN_PIXELS = 1080
COMMENTS_SCREEN_SIZE = 3
SCROLL_WAIT_TIME = 1

# Fixing the 'IncompleteRead' bug using http
# https://stackoverflow.com/questions/14149100/incompleteread-using-httplib
http.client.HTTPConnection._http_vsn = 10
http.client.HTTPConnection._http_vsn_str = "HTTP/1.0"


def fetch_comment_info(browser, url, postno, cur, delay=100):
    """
    Fetches user comments in 'url'.
    """

    # Keyed by 'postedby + date + text' to de-duplicate comments across scroll passes.
    comments = {}
    try:
        # Added timeout for the error: http.client.RemoteDisconnected:
        # Remote end closed connection without response
        browser.set_page_load_timeout(200)
        browser.get(url)
    except http.client.RemoteDisconnected:
        return comments

    WebDriverWait(browser, delay).until(
        ec.presence_of_element_located((By.CLASS_NAME, "container"))
    )

    soup = BeautifulSoup(browser.page_source, "html.parser")

    # Replies, Views, Users, Likes and Links
    topic_str = ["replies", "view", "user", "like", "link"]
    topic_map = [0] * len(topic_str)

    div_class_topicmap = soup.find("div", attrs={"class": "topic-map"})
    if div_class_topicmap:
        li_all = div_class_topicmap.find_all("li")
        for li_tag in li_all:
            li_text = li_tag.text.strip()
            span_class_number = li_tag.find("span")
            str_found = False
            for keyword in topic_str:
                if keyword in li_text:
                    str_found = True
                    break

            if str_found and span_class_number:
                # Counts such as "1.2k" are expanded to integers.
                if "k" in span_class_number.text:
                    if "." in span_class_number.text:
                        tmp = re.findall(r"\d+\.\d+", span_class_number.text)[0]
                    else:
                        tmp = re.findall(r"\d+", span_class_number.text)[0]

                    num = int(float(tmp) * 1000)
                else:
                    num = int(span_class_number.text)

                for i, _ in enumerate(topic_str):
                    if topic_str[i] in li_text:
                        topic_map[i] = num

    # Store Replies, Views, Users, Likes and Links against the post.
    tmp = 0
    query = (
        "UPDATE posts SET c_page_url = %s, replies = %s, views = %s, "
        "users = %s, likes = %s, links = %s WHERE postno = %s;"
    )
    if topic_map[0] >= 1:
        tmp = topic_map[0] - 1
    data = (url, tmp, topic_map[1], topic_map[2], topic_map[3], topic_map[4], postno)

    cur.execute(query, data)

    scrolls = math.ceil(topic_map[0] / COMMENTS_SCREEN_SIZE)

    for i in range(scrolls):
        soup = BeautifulSoup(browser.page_source, "html.parser")
        div_class_comment = soup.find_all(
            "div", attrs={"class": "topic-post clearfix regular"}
        ) + soup.find_all(
            "div", attrs={"class": "topic-post clearfix topic-owner group-editors regular"}
        )

        comm_no = 1
        for dc_comment in div_class_comment:
            div_class_user_card = dc_comment.find("div", attrs={"class": "names trigger-user-card"})
            if div_class_user_card:
                span_class_firstusername = dc_comment.find("span")
                if span_class_firstusername:
                    postedby = span_class_firstusername.find("a").text

                    post_date = dc_comment.find("div", attrs={"class": "post-info post-date"})
                    a_class_post_date = post_date.find("a", attrs={"class": "post-date"})
                    posteddate = a_class_post_date.find("span")["title"]
                    div_class_cooked = dc_comment.find("div", attrs={"class": "cooked"})
                    comm_text = div_class_cooked.text.strip().replace("\n", "").replace("\r", "")

                    dict_primary_key = postedby + " " + posteddate + " " + comm_text

                    if dict_primary_key not in comments:
                        comments[dict_primary_key] = {}
                        comments[dict_primary_key]["postedby"] = postedby
                        comments[dict_primary_key]["date"] = datetime.strptime(
                            posteddate, "%b %d, %Y %I:%M %p"
                        ).date()

                        comments[dict_primary_key]["comm_no"] = comm_no
                        comments[dict_primary_key]["comm_text"] = comm_text

                        div_class_actions = dc_comment.find("div", attrs={"class": "actions"})
                        comment_like_list = re.findall(r"\d+", div_class_actions.text.strip())

                        if comment_like_list:
                            comment_likes = int(comment_like_list[0])
                        else:
                            comment_likes = 0
                        comments[dict_primary_key]["likes"] = comment_likes

                    comm_no += 1

        browser.execute_script(
            "window.scrollTo({}, {});".format(
                i * SCREEN_HEIGHT_IN_PIXELS, (i + 1) * SCREEN_HEIGHT_IN_PIXELS
            )
        )
        time.sleep(SCROLL_WAIT_TIME)

    return comments
--------------------------------------------------------------------------------
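A minimal usage sketch for fetch_comment_info, under assumptions: a local PostgreSQL `BoingBoing` database whose `posts` table is already populated, geckodriver available for Firefox, and a placeholder topic URL standing in for the post's real "bbs" share link (this driver script is not part of the repository):

```python
"""Hypothetical standalone driver for fetch_comment_info (not part of the repo)."""
from selenium import webdriver

from util import connect_to_database_server
from boingboing_comments import fetch_comment_info

conn_obj = connect_to_database_server("BoingBoing")
if conn_obj == -1:
    raise SystemExit("Could not connect to PostgreSQL.")
conn, cur = conn_obj

browser = webdriver.Firefox()  # geckodriver must be next to the script or on PATH
try:
    # Placeholder topic URL; in the scraper this comes from the post's 'bbs' share link.
    comments = fetch_comment_info(browser, "https://bbs.boingboing.net/t/<topic>", postno=1, cur=cur)
    print("Fetched {} comments".format(len(comments)))
finally:
    browser.quit()
    conn.commit()
    cur.close()
    conn.close()
```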
/Schneier on Security/schneier.py:
--------------------------------------------------------------------------------
"""
Schneier on Security blog scraping.
"""
from datetime import datetime
from http.client import RemoteDisconnected
import re
import time

from bs4 import BeautifulSoup
import psycopg2
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait


HOST = "localhost"
DATABASE = "schneier"
USER = "postgres"
PASSWORD = "cavin"


def get_browser(headless=False, extensions=False, notifications=False, incognito=False):
    """
    Returns a Chrome webdriver configured with the given options.
    """
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")

    if not extensions:
        chrome_options.add_argument("--disable-extensions")

    if not notifications:
        chrome_options.add_argument("--disable-notifications")

    if incognito:
        chrome_options.add_argument("--incognito")

    driver = webdriver.Chrome(
        executable_path="C:\\Aptana Workspace\\chromedriver.exe", options=chrome_options
    )
    return driver


def main():
    """
    Walks the blog archives page by page and stores articles and comments.
    """
    conn = psycopg2.connect(host=HOST, database=DATABASE, user=USER, password=PASSWORD)
    cur = conn.cursor()
    driver = get_browser(headless=False, incognito=True)

    page_url = "https://www.schneier.com/"
    idx = 1

    while True:
        print(f"Processing page no. {idx}...")

        try:
            driver.set_page_load_timeout(200)
            driver.get(page_url)
        except TimeoutException:
            print(f"\t{page_url} - Timed out receiving message from renderer")
            continue
        except RemoteDisconnected:
            print(f"\tConnection closed by remote end for {page_url}.")
            continue

        WebDriverWait(driver, timeout=40).until(
            ec.presence_of_element_located((By.CLASS_NAME, "stepthrough"))
        )
        soup = BeautifulSoup(driver.page_source, "html.parser")

        earlier_entry = soup.find("div", attrs={"class": "stepthrough"}).find(
            "a", attrs={"class": "earlier"}
        )

        if not earlier_entry:
            break

        articles = soup.find("div", attrs={"id": "content"}).find_all("article")

        for article in articles:
            h2_tag = article.find("h2", attrs={"class": "entry"})
            id_ = h2_tag["id"]

            a_tag = h2_tag.find("a")
            url = a_tag["href"] if a_tag else None
            title = a_tag.text.strip() if a_tag else None

            # Match only <p>, <strong>, <i> and <ul> tags without class/id/type attributes.
            body_tags = article.find_all(
                re.compile(r"^(p|strong|i|ul)$"), attrs={"class": None, "id": None, "type": None}
            )
            body = " ".join([k.text.strip() for k in body_tags])

            entry_tag = article.find("p", attrs={"class": "entry-tags"})
            tag_arr = [k.text for k in entry_tag.find_all("a")] if entry_tag else [""]
            tags = ", ".join(tag_arr)

            posted_tag = article.find("p", attrs={"class": "posted"})
            date_obj = None
            if posted_tag:
                datetime_tag = posted_tag.find("a").text.strip()
                date_obj = datetime.strptime(datetime_tag, "Posted on %B %d, %Y at %I:%M %p")

            query = """
                INSERT INTO article(id, url, title, body, tags, posted_datetime)
                SELECT sub_query.* FROM
                (SELECT %s AS id, %s, %s, %s, %s, %s) sub_query
                LEFT JOIN article a ON sub_query.id = a.id
                WHERE a.id IS NULL;
            """

            data = (id_, url, title, body, tags, date_obj)
            cur.execute(query, data)

            comment_arr = [k["href"] for k in posted_tag.find_all("a")] if posted_tag else []
            if len(comment_arr) != 2:
                print(f"\tNo comments found for this article - {url}")
                continue

            print("\tProcessing comments...")
            comment_url = comment_arr[1]

            try:
                driver.set_page_load_timeout(200)
                driver.get(comment_url)
            except TimeoutException:
                print(f"\t{comment_url} - Timed out receiving message from renderer")
                continue
            except RemoteDisconnected:
                print(f"\tConnection closed by remote end for {comment_url}.")
                continue

            WebDriverWait(driver, timeout=40).until(
                ec.presence_of_element_located((By.CLASS_NAME, "subscribe-comments"))
            )
            soup = BeautifulSoup(driver.page_source, "html.parser")

            comment_tags = soup.find_all("article")[1:]

            for comment in comment_tags:
                cid = comment.find("div", attrs={"class": re.compile("comment by-")})["id"]

                comment_credit = comment.find("p", attrs={"class": "commentcredit"})
                commented_by = comment_credit.find("span").text.strip()

                comment_body_tags = comment.find_all(
                    re.compile(r"^(p|strong|i|ul)$"), attrs={"class": None, "id": None, "type": None}
                )
                comment_body = " ".join([k.text.strip() for k in comment_body_tags])

                posted_tag = comment_credit.find_all("a")[-1]
                date_obj = None
                if posted_tag:
                    datetime_tag = posted_tag.text.strip()
                    try:
                        date_obj = datetime.strptime(datetime_tag, "%B %d, %Y %I:%M %p")
                    except ValueError:
                        date_obj = None

                query = """
                    INSERT INTO comments(id, article_id, comment, commented_by, posted_datetime)
                    SELECT sub_query.* FROM
                    (SELECT %s AS id, %s, %s, %s, %s) sub_query
                    LEFT JOIN comments c ON sub_query.id = c.id
                    WHERE c.id IS NULL;
                """

                data = (cid, id_, comment_body, commented_by, date_obj)
                cur.execute(query, data)

        page_url = earlier_entry["href"]
        idx += 1
        time.sleep(3)

    driver.quit()
    conn.commit()
    cur.close()
    conn.close()

    print("DONE!!!")


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
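The scraper above assumes that `article` and `comments` tables already exist in the `schneier` database; the repository does not include the DDL. A plausible minimal schema, inferred from the INSERT statements (column types and keys are assumptions, not the author's actual schema):

```python
"""Hypothetical table setup for schneier.py (not part of the repo; types are guesses)."""
import psycopg2

conn = psycopg2.connect(host="localhost", database="schneier", user="postgres", password="cavin")
cur = conn.cursor()

cur.execute("""
    CREATE TABLE IF NOT EXISTS article (
        id              TEXT PRIMARY KEY,   -- e.g. 'a000808', taken from the <h2 class="entry"> id
        url             TEXT,
        title           TEXT,
        body            TEXT,
        tags            TEXT,               -- comma-separated tag list
        posted_datetime TIMESTAMP
    );
""")

cur.execute("""
    CREATE TABLE IF NOT EXISTS comments (
        id              TEXT PRIMARY KEY,   -- comment div id
        article_id      TEXT REFERENCES article(id),
        comment         TEXT,
        commented_by    TEXT,
        posted_datetime TIMESTAMP
    );
""")

conn.commit()
cur.close()
conn.close()
```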
/BoingBoing/boingboing_scraping.py:
--------------------------------------------------------------------------------
"""
BoingBoing web scraping.
"""
import urllib.request as urllib2
from datetime import datetime
import http
import sys
import time
import re

from bs4 import BeautifulSoup
from selenium import webdriver

from util import connect_to_database_server
from boingboing_comments import fetch_comment_info

# The default recursion limit is 1000; raise it so the recursive page crawl
# does not exceed the maximum recursion depth.
sys.setrecursionlimit(10000)

# BoingBoing - A directory of mostly wonderful things
BB_URL = "https://boingboing.net/grid/"

# PostgreSQL database name
DATABASE = "BoingBoing"

# Recursion breakpoint definition
START_CUTOFF_DATE = datetime.strptime("1/1/2004", "%m/%d/%Y").date()
END_CUTOFF_DATE = datetime.now().date()

if START_CUTOFF_DATE > END_CUTOFF_DATE:
    raise ValueError("Cutoff start date is greater than end date.")

if END_CUTOFF_DATE > datetime.now().date():
    raise ValueError("Cutoff end date is greater than current date.")

# posts filter
REQUIRED_TAGS = ["facebook", "social media"]

# Fixing the 'IncompleteRead' bug using http
# https://stackoverflow.com/questions/14149100/incompleteread-using-httplib
http.client.HTTPConnection._http_vsn = 10
http.client.HTTPConnection._http_vsn_str = "HTTP/1.0"

# Firefox browser object
BROWSER = webdriver.Firefox()


def extract_post_story(div_id_story):
    """
    Extracts the post text contents, stripping line breaks and whitespace.
    """

    before_keyword = "SHARE /"
    post_story = div_id_story.get_text().strip().replace("\n", " ").replace("\r", "")

    return post_story[: post_story.find(before_keyword)]


def scrape(web_url, conn, cur, i, pg_no):
    """
    Scrapes 'web_url' and inserts values into the PostgreSQL tables.
    """

    # Added timeout for the error: http.client.RemoteDisconnected:
    # Remote end closed connection without response.
    try:
        page = urllib2.urlopen(web_url, timeout=200)
    except http.client.RemoteDisconnected:
        print("Connection closed by remote end for {}.".format(web_url))
        return 0

    soup = BeautifulSoup(page, "html.parser")
    div_id_posts = soup.find("div", attrs={"id": "posts"})
    div_class_feature = div_id_posts.find_all("div", attrs={"class": "feature"})

    # If no features found on the page, return
    if len(div_class_feature) == 0:
        return 0

    # **************************************ARTICLES**************************************
    for feature in div_class_feature:
        a_class_headline = feature.find("a", attrs={"class": "headline"})
        try:
            post_page = urllib2.urlopen(a_class_headline["href"], timeout=200)
        except http.client.RemoteDisconnected:
            print("Connection closed by remote end for {}.".format(a_class_headline["href"]))
            continue

        post_soup = BeautifulSoup(post_page, "html.parser")

        div_class_share = post_soup.find("div", attrs={"class": "share"})
        # if no comments on the article, skip article
        if not div_class_share:
            continue

        try:
            date_str = re.findall(r"\d+/\d+/\d+", a_class_headline["href"])[0]
            posteddate = datetime.strptime(date_str, "%Y/%m/%d").date()
        except (IndexError, ValueError):
            # Skip posts whose URL does not contain a parsable date.
            print("Date format error for {}.".format(a_class_headline["href"]))
            continue

        # apply the date filter
        if posteddate < START_CUTOFF_DATE or posteddate > END_CUTOFF_DATE:
            return 0

        article_headline = a_class_headline.text.strip()

        div_id_story = post_soup.find("div", attrs={"id": "story"})
        if not div_id_story:
            div_id_story = post_soup.find("article", attrs={"id": "text"})
            if not div_id_story:
                div_id_story = post_soup.find("div", attrs={"id": "container"})

        post_txt = extract_post_story(div_id_story)

        h3_class_thetags = div_class_share.find("h3", attrs={"class": "thetags"})
        if not h3_class_thetags:
            post_tags = ""
        else:
            post_tags = ", ".join(
                [
                    x.lower()
                    for x in [tag.string.strip().replace("/", "") for tag in h3_class_thetags]
                    if x != ""
                ]
            )

        div_class_navbyline = post_soup.find("div", attrs={"class": "navbyline"})
        if not div_class_navbyline:
            div_class_navbyline = post_soup.find("header", attrs={"id": "bbheader"})

        span_class_author = div_class_navbyline.find("span", attrs={"class": "author"})

        # Apply the 'REQUIRED_TAGS' filter
        is_ok = False
        if REQUIRED_TAGS:
            for elem in REQUIRED_TAGS:
                if elem in post_tags or elem in article_headline.lower():
                    is_ok = True
                    break
        else:
            is_ok = True

        if not is_ok:
            continue

        query = (
            "INSERT INTO posts(postno, a_page_url, headline, text, tags, author, posteddate) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s);"
        )
        data = (
            i,
            a_class_headline["href"],
            article_headline,
            post_txt,
            post_tags,
            [x.string for x in span_class_author.find("a")][0],
            posteddate,
        )

        cur.execute(query, data)
        print("FOUND POST: {}, {}".format(i, article_headline))

        # **************************************COMMENTS**************************************
        a_class_bbs = div_class_share.find("a", attrs={"class": "bbs"})
        comments = fetch_comment_info(BROWSER, a_class_bbs["href"], i, cur)

        for _, value in comments.items():
            if value["comm_text"] != "":
                cquery = (
                    "INSERT INTO comments(commentno, postno, comments, postedby, likes, posteddate) "
                    "VALUES (%s, %s, %s, %s, %s, %s);"
                )
                cdata = (
                    value["comm_no"],
                    i,
                    value["comm_text"],
                    value["postedby"],
                    value["likes"],
                    value["date"],
                )
                cur.execute(cquery, cdata)
        i += 1

    # Construct next page url.
    print("Page no: {} - {}".format(pg_no, posteddate))
    pg_no += 1
    next_page_url = BB_URL + "page/{}/".format(pg_no)

    # recursive logic
    scrape(next_page_url, conn, cur, i, pg_no)


def main():
    """
    Entry point for the script.
    """
    start_time = time.time()
    conn_obj = connect_to_database_server(DATABASE)

    if conn_obj == -1:
        print("Connection to PostgreSQL Database: {} failed.".format(DATABASE))
        sys.exit(0)
    else:
        conn = conn_obj[0]
        cur = conn_obj[1]

        scrape(BB_URL, conn, cur, i=1, pg_no=1)

        conn.commit()
        cur.close()
        conn.close()

        print("Webdata scraped successfully in {} seconds.".format(time.time() - start_time))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
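As with the Schneier scraper, the BoingBoing scripts assume pre-existing `posts` and `comments` tables. A plausible minimal schema, inferred from the INSERT and UPDATE statements in boingboing_scraping.py and boingboing_comments.py (column types and keys are assumptions, not the author's actual schema):

```python
"""Hypothetical table setup for the BoingBoing scrapers (not part of the repo; types are guesses)."""
from util import connect_to_database_server

conn_obj = connect_to_database_server("BoingBoing")
if conn_obj == -1:
    raise SystemExit("Could not connect to PostgreSQL.")
conn, cur = conn_obj

cur.execute("""
    CREATE TABLE IF NOT EXISTS posts (
        postno      INTEGER PRIMARY KEY,  -- running counter assigned by scrape()
        a_page_url  TEXT,
        headline    TEXT,
        text        TEXT,
        tags        TEXT,                 -- comma-separated, lower-cased
        author      TEXT,
        posteddate  DATE,
        c_page_url  TEXT,                 -- filled in by fetch_comment_info()
        replies     INTEGER,
        views       INTEGER,
        users       INTEGER,
        likes       INTEGER,
        links       INTEGER
    );
""")

cur.execute("""
    CREATE TABLE IF NOT EXISTS comments (
        commentno   INTEGER,
        postno      INTEGER REFERENCES posts(postno),
        comments    TEXT,
        postedby    TEXT,
        likes       INTEGER,
        posteddate  DATE
    );
""")

conn.commit()
cur.close()
conn.close()
```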