├── app ├── global.R ├── ui.R └── server.R ├── Dockerfile ├── data └── wsjsections.csv ├── .gitignore ├── README.md ├── objectives.md └── scraping └── scrape.py /app/global.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(forcats) 3 | library(tm) 4 | library(wordcloud) 5 | library(memoise) 6 | library(SnowballC) 7 | library(RColorBrewer) 8 | library(ggplot2) 9 | 10 | # 11 | variabledata <- read.csv("data/variabletypes.csv") 12 | # wsjtextdata = readLines('data/wsjtextdata.txt') 13 | # docs <- Corpus(VectorSource(wsjtextdata)) 14 | 15 | docs <- Corpus(DirSource('text/')) 16 | 17 | #docs <- tm_map(docs, stripWhitespace) 18 | 19 | #docs <- tm_map(docs, tolower) 20 | 21 | #docs <- tm_map(docs, stemDocument) 22 | 23 | sectionsdf <- read.csv('data/wsjsections.csv') 24 | sectionsdf = sectionsdf[order(-sectionsdf$AverageComments),] 25 | 26 | wsj2 = read.csv('data/wsj2.csv') 27 | 28 | wsj3 = read.csv('data/wsj3.csv') 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | # Set the working directory in the container 5 | WORKDIR /usr/src/app 6 | 7 | # Install system dependencies 8 | RUN apt-get update && apt-get install -y \ 9 | build-essential \ 10 | libffi-dev \ 11 | libssl-dev \ 12 | python3-dev 13 | 14 | # Copy the current directory contents into the container at /usr/src/app 15 | COPY . . 16 | 17 | # Install any needed packages specified in requirements.txt 18 | RUN pip install --no-cache-dir -r requirements.txt 19 | 20 | # Make port 8888 available to the world outside this container 21 | EXPOSE 8888 22 | 23 | # Run jupyter notebook when the container launches 24 | CMD ["jupyter", "notebook", "--ip='0.0.0.0'", "--port=8888", "--no-browser", "--allow-root"] 25 | -------------------------------------------------------------------------------- /data/wsjsections.csv: -------------------------------------------------------------------------------- 1 | SectionName,AverageComments 2 | ART REVIEW,4.846153846 3 | BUSINESS,60.63188168 4 | OPINION,282.4523077 5 | U.S.,151.3304647 6 | LIFE & ARTS,66.1961039 7 | MARKETS,28.72707424 8 | BOOKS,15.86723164 9 | PRO BANKRUPTCY,6 10 | WORLD,119.4365013 11 | ECONOMY,112.479798 12 | REAL ESTATE,34.94545455 13 | WSJ LOGISTICS REPORT,1.4 14 | POLITICS,367.9603399 15 | WSJ. 
MAGAZINE,6.178571429 16 | RISK & COMPLIANCE JOURNAL,3.707317073 17 | TECH,76.36797753 18 | LOGISTICS REPORT,7.054054054 19 | EXHIBITION REVIEW,4.545454545 20 | CFO JOURNAL,3.895833333 21 | PERSONAL BOARD OF DIRECTORS,6 22 | DANCE REVIEW,1 23 | CMO TODAY,77.20588235 24 | WHATS NEWS BUSINESS FINANCE,1 25 | CIO JOURNAL,5.363636364 26 | UP NEXT,6.875 27 | TELEVISION REVIEW,14.17857143 28 | MANAGEMENT,37.83333333 29 | ARTS,44.82258065 30 | PRO PE EXITS,1 31 | MUSIC REVIEW,3.818181818 32 | PRO VC INDUSTRY NEWS,4 33 | SHIPPING MATTERS,20.25 34 | OPERA REVIEW,2.25 35 | FILM REVIEW,23.47826087 36 | PRO BANKRUPTCY INVESTORS,1 37 | MASTERPIECE,9.818181818 38 | PRO PE DEALS,3 39 | THE ARTIST,3 40 | ARTS & ENTERTAINMENT,11.83333333 41 | PRO VC NEW MONEY,4 42 | FOOD,7 43 | TRENDING,53.5 44 | ARTIFICIAL INTELLIGENCE,8 45 | PRO BANKRUPTCY DISTRESS,37 46 | EXPERIENCE REPORT,10 47 | THE CAPTAIN CLASS,75.5 48 | A-HED,62.43478261 49 | PRO PE LEGAL REGULATORY,15 50 | ON THE COVER,16 51 | PRO VC COMMENTARY ANALYSIS,25 52 | CENTRAL BANKS RESEARCH,48 53 | INFLUENCERS,134 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | pip-wheel-metadata/ 20 | share/python-wheels/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | MANIFEST 25 | 26 | # Virtual environments 27 | .env 28 | .venv 29 | env/ 30 | venv/ 31 | ENV/ 32 | env.bak/ 33 | venv.bak/ 34 | 35 | # Jupyter Notebook 36 | .ipynb_checkpoints 37 | 38 | # IPython 39 | profile_default/ 40 | ipython_config.py 41 | 42 | # pyenv 43 | .python-version 44 | 45 | # pipenv 46 | Pipfile.lock 47 | 48 | # PEP 582 49 | __pypackages__/ 50 | 51 | # Celery 52 | celerybeat-schedule 53 | celerybeat.pid 54 | 55 | # SageMath parsed files 56 | *.sage.py 57 | 58 | # Environments 59 | .env 60 | .env.local 61 | .env.development.local 62 | .env.test.local 63 | .env.production.local 64 | 65 | # Spyder project settings 66 | .spyderproject 67 | .spyproject 68 | 69 | # Rope project settings 70 | .ropeproject 71 | 72 | # mkdocs documentation 73 | /site 74 | 75 | # mypy 76 | .mypy_cache/ 77 | .dmypy.json 78 | dmypy.json 79 | 80 | # Pyre type checker 81 | .pyre/ 82 | 83 | # R 84 | .Rhistory 85 | .RData 86 | .Ruserdata 87 | .Rproj.user/ 88 | 89 | # R Shiny 90 | rsconnect/ 91 | 92 | # Data files 93 | *.csv 94 | *.xlsx 95 | *.xls 96 | *.json 97 | *.parquet 98 | *.pickle 99 | *.pkl 100 | 101 | # Logs 102 | *.log 103 | logs/ 104 | 105 | # OS generated files 106 | .DS_Store 107 | .DS_Store? 108 | ._* 109 | .Spotlight-V100 110 | .Trashes 111 | ehthumbs.db 112 | Thumbs.db 113 | 114 | # IDE 115 | .vscode/ 116 | .idea/ 117 | *.swp 118 | *.swo 119 | *~ 120 | 121 | # Docker 122 | .dockerignore 123 | 124 | # Temporary files 125 | *.tmp 126 | *.temp 127 | temp/ 128 | tmp/ 129 | 130 | # Scraped data (if large) 131 | scraped_data/ 132 | raw_data/ 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WSJ Web Scraping & NLP Analysis 2 | 3 | A comprehensive analysis of Wall Street Journal articles to investigate relationships between article sentiment, reader engagement, and financial market movements. 
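For a quick sense of the sentiment scoring described under [Natural Language Processing](#natural-language-processing) below, here is a minimal, illustrative sketch of how a single article's text can be scored with the two libraries used in this project. The example text and variable names are hypothetical and not taken from the project code.

```python
# Illustrative sketch only -- example text and variable names are hypothetical.
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

text = "Stocks rallied sharply after a surprisingly strong jobs report."

# VADER returns negative, neutral, positive, and compound scores as a dict
vader_scores = SentimentIntensityAnalyzer().polarity_scores(text)
print(vader_scores)  # {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}

# TextBlob returns polarity in [-1, 1] and subjectivity in [0, 1]
blob = TextBlob(text)
print(blob.sentiment.polarity, blob.sentiment.subjectivity)
```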
4 | 5 | ## Table of Contents 6 | 7 | - [Project Overview](#project-overview) 8 | - [Research Questions](#research-questions) 9 | - [Data Collection](#data-collection) 10 | - [Natural Language Processing](#natural-language-processing) 11 | - [Research Findings](#research-findings) 12 | - [Project Structure](#project-structure) 13 | - [Getting Started](#getting-started) 14 | - [Docker Setup](#docker-setup) 15 | - [Results & Applications](#results--applications) 16 | 17 | ## Project Overview 18 | 19 | This project investigates the relationship between Wall Street Journal article sentiment and two key metrics: 20 | - **User engagement** (measured by comment count) 21 | - **S&P 500 market returns** 22 | 23 | The analysis leverages web scraping and natural language processing to extract insights from 22,772 WSJ articles published between January 2019 and July 2020. 24 | 25 | ## Research Questions 26 | 27 | 1. **Comment Engagement Analysis**: Can a statistically significant relationship be demonstrated between a WSJ article's degree of subjectivity/objectivity and positivity/negativity in its writing and the number of online comments posted by readers? 28 | 29 | 2. **Market Prediction Analysis**: Can a statistically significant relationship be demonstrated between WSJ articles' sentiment polarity on day t and S&P 500 Index movements on day t + n (where 0 ≤ n ≤ 3)? 30 | 31 | ## Data Collection 32 | 33 | - **Source**: Wall Street Journal [news archives](https://www.wsj.com/news/archive/years) 34 | - **Method**: Python Selenium web scraping 35 | - **Dataset**: 22,772 full-text articles (Jan 2019 - July 2020) 36 | - **Data Points per Article**: 37 | - Article text and headline 38 | - Sub-headline and publication date 39 | - Author name and rubric category 40 | - Number of comments 41 | 42 | ## Natural Language Processing 43 | 44 | ### Libraries Used 45 | - **VADER** (Valence Aware Dictionary and sEntiment Reasoner) 46 | - **TextBlob** 47 | 48 | ### VADER Analysis 49 | - **Purpose**: Polarity and emotion intensity scoring 50 | - **Output Variables**: `negative`, `neutral`, `positive`, `compound` 51 | - **Documentation**: [VADER Sentiment](https://pypi.org/project/vaderSentiment/) 52 | 53 | ### TextBlob Analysis 54 | - **Purpose**: Sentiment analysis and subjectivity scoring 55 | - **Output Variables**: `polarity`, `subjectivity` 56 | - **Documentation**: [TextBlob Documentation](https://textblob.readthedocs.io/en/dev/) 57 | 58 | ## Research Findings 59 | 60 | ### Comment Engagement Results 61 | - **Model Performance**: Simple linear regression shows poor predictive power (Adj R² = 0.014) 62 | - **Statistical Significance**: Cannot reject null hypothesis (p-value = 0.2045) 63 | - **Key Finding**: VADER negativity scores are statistically significant at 1% level 64 | - **Interpretation**: Higher negativity may correlate with events generating public response (e.g., public figure deaths) 65 | 66 | ### Market Prediction Results 67 | - **Model Performance**: Low predictive power across all models (Adj R² ≈ 0.01) 68 | - **Analysis Scope**: Four regression models testing same-day and next-day S&P 500 movements 69 | - **Key Finding**: TextBlob polarity shows significance at 10% level 70 | - **Conclusion**: WSJ sentiment has limited predictive power for market movements 71 | 72 | ## Project Structure 73 | 74 | ``` 75 | WSJ_WebScraping_NLP/ 76 | ├── app/ # R Shiny application 77 | │ ├── global.R 78 | │ ├── server.R 79 | │ └── ui.R 80 | ├── data/ # Processed datasets 81 | │ └── wsjsections.csv 82 | ├── 
notebooks/ # Jupyter analysis notebooks 83 | │ └── WSJ_Scraping NLP_Analysis.ipynb 84 | ├── scraping/ # Web scraping scripts 85 | │ └── scrape.py 86 | ├── Dockerfile # Container configuration 87 | ├── README.md # Project documentation 88 | └── objectives.md # Detailed project objectives 89 | ``` 90 | 91 | ## Getting Started 92 | 93 | ### Prerequisites 94 | - Python 3.9+ 95 | - R (for Shiny app) 96 | - Docker (optional) 97 | 98 | ### Local Setup 99 | 1. Clone the repository 100 | 2. Install Python dependencies: `pip install -r requirements.txt` 101 | 3. Run the Jupyter notebook for analysis 102 | 4. Launch R Shiny app for interactive visualization 103 | 104 | ## Docker Setup 105 | 106 | ### Building the Container 107 | ```bash 108 | docker build -t wsj-nlp-analysis . 109 | ``` 110 | 111 | ### Running the Container 112 | ```bash 113 | docker run -p 8888:8888 wsj-nlp-analysis 114 | ``` 115 | 116 | ### Accessing the Application 117 | - Open your browser and navigate to `http://localhost:8888` 118 | - The Jupyter notebook interface will be available 119 | - Use the provided token for authentication 120 | 121 | ### Stopping the Container 122 | ```bash 123 | docker stop 124 | ``` 125 | 126 | ## Results & Applications 127 | 128 | ### Interactive Dashboard 129 | - **R Shiny App**: [Live Application](https://philippe1.shinyapps.io/WSJApp2/) 130 | - **Features**: Interactive sentiment analysis visualization and data exploration 131 | 132 | ### Documentation 133 | - **Blog Post**: [Detailed Analysis](https://nycdatascience.com/blog/student-works/scraping-wall-street-journal-article-data-to-measure-online-reader-engagement-an-nlp-analysis/) 134 | - **Objectives**: See `objectives.md` for detailed project goals and methodology 135 | 136 | -------------------------------------------------------------------------------- /objectives.md: -------------------------------------------------------------------------------- 1 | # Project Objectives 2 | 3 | ## Primary Research Goals 4 | 5 | ### 1. Sentiment-Engagement Relationship Analysis 6 | **Objective**: Investigate whether article sentiment characteristics influence reader engagement on Wall Street Journal articles. 7 | 8 | **Specific Aims**: 9 | - Determine if subjective vs. objective writing style correlates with comment volume 10 | - Analyze the relationship between positive/negative sentiment and reader engagement 11 | - Identify which sentiment metrics (VADER vs. TextBlob) provide better predictive power 12 | - Quantify the statistical significance of sentiment-engagement relationships 13 | 14 | **Success Metrics**: 15 | - Statistical significance testing (p < 0.05) 16 | - Model performance evaluation (R², adjusted R²) 17 | - Identification of key sentiment variables driving engagement 18 | 19 | ### 2. Market Prediction Analysis 20 | **Objective**: Explore whether WSJ article sentiment can predict S&P 500 market movements. 21 | 22 | **Specific Aims**: 23 | - Test predictive power of same-day sentiment on same-day market returns 24 | - Analyze lagged effects (1-3 day prediction windows) 25 | - Compare different sentiment analysis approaches for market prediction 26 | - Control for market volume and other confounding variables 27 | 28 | **Success Metrics**: 29 | - Statistical significance of sentiment variables in market prediction models 30 | - Model performance comparison across different time horizons 31 | - Identification of optimal sentiment indicators for market prediction 32 | 33 | ## Technical Objectives 34 | 35 | ### 3. 
Data Collection & Processing 36 | **Objective**: Build a robust web scraping and data processing pipeline. 37 | 38 | **Specific Aims**: 39 | - Scrape comprehensive article metadata from WSJ archives 40 | - Implement reliable data cleaning and preprocessing workflows 41 | - Ensure data quality and consistency across the dataset 42 | - Create reproducible data collection processes 43 | 44 | **Success Metrics**: 45 | - Complete dataset of 20,000+ articles 46 | - High data quality (minimal missing values, consistent formatting) 47 | - Reproducible scraping pipeline 48 | 49 | ### 4. Natural Language Processing Implementation 50 | **Objective**: Apply state-of-the-art NLP techniques for sentiment analysis. 51 | 52 | **Specific Aims**: 53 | - Implement VADER sentiment analysis for emotion intensity scoring 54 | - Apply TextBlob for polarity and subjectivity analysis 55 | - Compare and validate different NLP approaches 56 | - Create comprehensive sentiment feature engineering 57 | 58 | **Success Metrics**: 59 | - Successful implementation of both VADER and TextBlob 60 | - Consistent sentiment scoring across the dataset 61 | - Validation of sentiment analysis accuracy 62 | 63 | ### 5. Statistical Analysis & Modeling 64 | **Objective**: Conduct rigorous statistical analysis to test research hypotheses. 65 | 66 | **Specific Aims**: 67 | - Perform linear regression analysis for engagement prediction 68 | - Implement time-series analysis for market prediction 69 | - Apply appropriate statistical tests and model validation 70 | - Control for confounding variables and bias 71 | 72 | **Success Metrics**: 73 | - Properly specified statistical models 74 | - Appropriate handling of statistical assumptions 75 | - Clear interpretation of results and limitations 76 | 77 | ## Methodological Objectives 78 | 79 | ### 6. Reproducible Research 80 | **Objective**: Ensure all analysis is fully reproducible and well-documented. 81 | 82 | **Specific Aims**: 83 | - Create clear documentation for all analysis steps 84 | - Implement version control for code and data 85 | - Provide detailed methodology descriptions 86 | - Share code and data where appropriate 87 | 88 | **Success Metrics**: 89 | - Complete code documentation 90 | - Reproducible analysis notebooks 91 | - Clear methodology documentation 92 | 93 | ### 7. Interactive Visualization 94 | **Objective**: Create user-friendly interfaces for exploring results. 95 | 96 | **Specific Aims**: 97 | - Develop R Shiny dashboard for interactive analysis 98 | - Create visualizations for sentiment trends and patterns 99 | - Enable user exploration of specific articles and time periods 100 | - Provide intuitive data exploration tools 101 | 102 | **Success Metrics**: 103 | - Functional interactive dashboard 104 | - Clear and informative visualizations 105 | - User-friendly interface design 106 | 107 | ## Research Questions 108 | 109 | ### Primary Questions 110 | 1. **Engagement Question**: Does article sentiment (subjectivity, polarity, emotionality) significantly predict the number of comments posted on WSJ articles? 111 | 112 | 2. **Market Question**: Can WSJ article sentiment on day t predict S&P 500 returns on day t + n (where n = 0, 1, 2, 3)? 113 | 114 | ### Secondary Questions 115 | 3. **Sentiment Comparison**: Which sentiment analysis approach (VADER vs. TextBlob) provides better predictive power for engagement and market movements? 116 | 117 | 4. **Temporal Patterns**: Are there temporal patterns in sentiment that correlate with market volatility or reader engagement? 
118 | 119 | 5. **Content Analysis**: Do certain types of articles (by section, author, or topic) show stronger sentiment-engagement or sentiment-market relationships? 120 | 121 | ## Expected Outcomes 122 | 123 | ### Positive Outcomes 124 | - Identification of significant sentiment-engagement relationships 125 | - Discovery of predictive sentiment patterns for market movements 126 | - Development of robust NLP analysis pipeline 127 | - Creation of valuable dataset for future research 128 | 129 | ### Potential Limitations 130 | - Limited predictive power due to market complexity 131 | - Potential confounding variables not controlled for 132 | - Temporal limitations of the dataset (2019-2020) 133 | - Possible selection bias in comment engagement 134 | 135 | ## Success Criteria 136 | 137 | ### Minimum Viable Results 138 | - Complete data collection and processing pipeline 139 | - Successful implementation of sentiment analysis 140 | - Statistical analysis of both research questions 141 | - Basic visualization and reporting of results 142 | 143 | ### Optimal Results 144 | - Statistically significant findings for at least one research question 145 | - Strong model performance (R² > 0.1) for engagement prediction 146 | - Identification of actionable insights for content strategy 147 | - Publication-quality analysis and documentation 148 | 149 | ## Timeline & Milestones 150 | 151 | ### Phase 1: Data Collection (Weeks 1-2) 152 | - Set up web scraping infrastructure 153 | - Collect initial dataset 154 | - Implement data cleaning and preprocessing 155 | 156 | ### Phase 2: NLP Implementation (Weeks 3-4) 157 | - Implement VADER and TextBlob analysis 158 | - Create sentiment feature engineering pipeline 159 | - Validate sentiment analysis results 160 | 161 | ### Phase 3: Statistical Analysis (Weeks 5-6) 162 | - Conduct engagement prediction analysis 163 | - Perform market prediction analysis 164 | - Apply appropriate statistical tests 165 | 166 | ### Phase 4: Visualization & Reporting (Weeks 7-8) 167 | - Develop R Shiny dashboard 168 | - Create comprehensive visualizations 169 | - Document findings and methodology 170 | - Prepare final report and presentation 171 | -------------------------------------------------------------------------------- /scraping/scrape.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | import os 4 | import sys 5 | import time 6 | 7 | from selenium import webdriver 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.ui import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | from selenium.webdriver.chrome.options import Options 12 | from selenium.common.exceptions import ( 13 | NoSuchElementException, 14 | StaleElementReferenceException, 15 | ) 16 | from webdriver_manager.chrome import ChromeDriverManager 17 | 18 | logging.basicConfig( 19 | format="%(asctime)s, %(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", 20 | datefmt="%Y-%m-%d:%H:%M:%S", 21 | level=logging.INFO, 22 | stream=sys.stdout, 23 | ) 24 | 25 | URL = "https://www.wsj.com/news/archive/2020/march" 26 | ARG_WINDOW_SIZE = "--window-size=1920,1080" 27 | 28 | 29 | class SeleniumScraper: 30 | def __init__(self): 31 | self.url = URL 32 | self.driver = self.create_driver() 33 | 34 | def _create_options(self): 35 | # Set Chrome browser options 36 | self.chrome_options = Options() 37 | self.chrome_options.add_argument(ARG_WINDOW_SIZE) 38 | prefs = 
{"profile.managed_default_content_settings.images": 2} 39 | self.chrome_options.add_experimental_option("prefs", prefs) 40 | return self.chrome_options 41 | 42 | def create_driver(self): 43 | # Create Chrome browser options 44 | self._create_options() 45 | # Create webdriver 46 | driver = webdriver.Chrome( 47 | ChromeDriverManager().install(), options=self.chrome_options 48 | ).get(URL) 49 | return driver 50 | 51 | def wait(self, secs=2): 52 | time.sleep(secs) 53 | 54 | 55 | class ScrapeFlow(SeleniumScraper): 56 | def __init__(self): 57 | super().__init__() 58 | self.user = os.environ.get("USER") 59 | self.pw = os.environ.get("PASS") 60 | self._prep_output_file("wsj_articles.csv") 61 | 62 | def main(self): 63 | self.signin() 64 | self.wait(1) 65 | self.get_daylinks() 66 | self.wait(2) 67 | self.parse_daylinks() 68 | 69 | def _prep_output_file(self, filename): 70 | self.csv_file = open(filename, "w", encoding="utf-8", newline="") 71 | self.writer = csv.writer(self.csv_file) 72 | 73 | def signin(self): 74 | """Send username and password env vars to signin form fields and press submit button""" 75 | # Click signin button 76 | sign_in_link = self.driver.find_element_by_link_text("Sign In") 77 | sign_in_link.click() 78 | self.wait(2) 79 | # Find username and pw fields 80 | username = WebDriverWait(self.driver, 10).until( 81 | EC.element_to_be_clickable((By.ID, "username")) 82 | ) 83 | password = WebDriverWait(self.driver, 10).until( 84 | EC.element_to_be_clickable((By.ID, "password")) 85 | ) 86 | # Input username and pw 87 | username.send_keys(self.user) 88 | password.send_keys(self.pw) 89 | # Find and click submit button once username and pw inputted 90 | submit_button = self.driver.find_element_by_xpath( 91 | ".//button[@type='submit'][@class='solid-button basic-login-submit']" 92 | ) 93 | submit_button.click() 94 | 95 | def get_daylinks(self): 96 | self.daylinks = self.driver.find_elements_by_xpath( 97 | '//a[@class="WSJTheme--day-link--19pByDpZ "][@href]' 98 | ) 99 | 100 | def find_text_by_xpath(self, pattern: str) -> str: 101 | """Helper for finding text stored under xpath pattern""" 102 | try: 103 | text_output = self.driver.find_element_by_xpath(pattern).text 104 | except (NoSuchElementException, StaleElementReferenceException): 105 | text_output = "" 106 | return text_output 107 | 108 | def parse_daylinks(self): 109 | """Iterate over scraped daylinks to get fields of interest for each article""" 110 | for i in range(11, len(self.daylinks)): 111 | # Get all sub daylinks by xpath 112 | daylinks2 = WebDriverWait(self.driver, 10).until( 113 | EC.presence_of_all_elements_located( 114 | (By.XPATH, '//a[@class="WSJTheme--day-link--19pByDpZ "][@href]') 115 | ) 116 | ) 117 | logging.info("DayLinks2 is:", daylinks2) 118 | self.wait(1) 119 | daylinks2[i].click() 120 | self.wait(1.5) 121 | 122 | # Find headline links 123 | linkslist1 = None 124 | while not linkslist1: 125 | try: 126 | linkslist1 = self.driver.find_elements_by_xpath( 127 | './/h2[@class="WSJTheme--headline--unZqjb45 undefined WSJTheme--heading-3--2z_phq5h typography--serif-display--ZXeuhS5E "]//a[@href]' 128 | ) 129 | except: 130 | continue 131 | logging.info("Length of linkslist1 is:", len(linkslist1)) 132 | self.wait(2) 133 | 134 | for i in range(0, len(linkslist1)): 135 | self.wait(2) 136 | linkslist = None 137 | while not linkslist: 138 | try: 139 | linkslist = self.driver.find_elements_by_xpath( 140 | './/h2[@class="WSJTheme--headline--unZqjb45 undefined WSJTheme--heading-3--2z_phq5h typography--serif-display--ZXeuhS5E 
"]//a[@href]' 141 | ) 142 | except: 143 | continue 144 | logging.info("Length of linkslist is:", len(linkslist)) 145 | self.wait(2) 146 | try: 147 | linkslist[i].click() 148 | logging.info( 149 | "Trying to click the following web element:", linkslist[i] 150 | ) 151 | self.wait(1) 152 | try: 153 | article_string = "" 154 | text1 = self.driver.find_elements_by_xpath( 155 | ".//div[@class='article-content ']//p" 156 | ) 157 | for ele in text1: 158 | article_string += ele.text 159 | except ( 160 | NoSuchElementException, 161 | StaleElementReferenceException, 162 | ) as e: 163 | article_string = "" 164 | pass 165 | 166 | # Get article fields of interest 167 | article_headline = self.find_text_by_xpath( 168 | './/h1[@class="wsj-article-headline"]' 169 | ) 170 | article_subheadline = self.find_text_by_xpath( 171 | './/h2[@class="sub-head"]' 172 | ) 173 | article_published_date = self.find_text_by_xpath( 174 | ".//time[@class='timestamp article__timestamp flexbox__flex--1']" 175 | ) 176 | article_author = self.find_text_by_xpath( 177 | './/button[@class="author-button"]' 178 | ) 179 | article_topic = self.find_text_by_xpath( 180 | './/li[@class="article-breadCrumb"][1]/a' 181 | ) 182 | article_number_comments = self.find_text_by_xpath( 183 | './/a[@id ="article-comments-tool"]/span' 184 | ) 185 | # Prepare row output 186 | article_dict = { 187 | "article_body_text": article_string, 188 | "article_headline": article_headline, 189 | "article_subheadline": article_subheadline, 190 | "article_published_date": article_published_date, 191 | "author": article_author, 192 | "topic": article_topic, 193 | "article_number_comments": article_number_comments, 194 | } 195 | # Write results 196 | self.writer.writerow(article_dict.values()) 197 | self.driver.back() 198 | except: 199 | logging.info("Failed to click on", linkslist[i]) 200 | continue 201 | self.driver.back() 202 | 203 | 204 | if __name__ == "__main__": 205 | start_time = time.time() 206 | sf = ScrapeFlow() 207 | sf.main() 208 | logging.info(f'{time.time() - start_time} sec to scrape articles') 209 | -------------------------------------------------------------------------------- /app/ui.R: -------------------------------------------------------------------------------- 1 | library(DT) 2 | library(shinydashboard) 3 | library(devtools) 4 | library(shinythemes) 5 | library(dashboardthemes) 6 | library(wordcloud) 7 | library(tidyverse) 8 | library(tidytext) 9 | library(shinythemes) 10 | library(shiny) 11 | 12 | options(width = 1000) 13 | 14 | shinyUI(dashboardPage( 15 | dashboardHeader(title = "Web Scraping Project #2" , titleWidth = 250), 16 | 17 | dashboardSidebar( 18 | width = 300, 19 | sidebarUserPanel(h5("Philippe Heitzmann"), subtitle = "NYCDSA Bootcamp student" ,image = 'philippeheitzmann.jpeg' ), 20 | sidebarMenu( 21 | menuItem("Research Questions & Literature", tabName = 'research1', icon = icon('question'), badgeLabel = "pt1", badgeColor = "teal"), 22 | menuItem("Full Text Article Scraping", icon = icon("window-maximize"), tabName = "proj1", 23 | badgeLabel = "pt1", badgeColor = "teal"), 24 | menuItem("WSJ Data - EDA", icon = icon("window-maximize"), tabName = "wsj", 25 | badgeLabel = "pt2", badgeColor = "light-blue"), 26 | menuItem("Data Cleaning & NLP", icon = icon("window-maximize"), tabName = "wsj2", 27 | badgeLabel = "pt2", badgeColor = "light-blue"), 28 | menuItem("TextBlob NLP Analysis ", icon = icon("window-restore"), tabName = "proj2", 29 | badgeLabel = "pt3", badgeColor = "purple"), 30 | menuItem("VADER NLP Analysis", icon = 
icon("window-maximize"), tabName = "wsj4", 31 | badgeLabel = "pt3", badgeColor = "purple"), 32 | menuItem("Regression Analysis: Comments", icon = icon("window-maximize"), tabName = "wsj5", 33 | badgeLabel = "pt4", badgeColor = "blue"), 34 | menuItem("Regression Analysis: S&P 500", icon = icon("window-maximize"), tabName = "wsj3", 35 | badgeLabel = "pt4", badgeColor = "blue")) 36 | ), 37 | dashboardBody( 38 | shinyDashboardThemes(theme = "purple_gradient"), 39 | tabItems( 40 | tabItem(tabName = 'research1', 41 | fluidRow(box(title = "Research Questions", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, 42 | background = 'light-blue', htmlOutput('rq1'))), 43 | fluidRow(box(title = "Theory", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, 44 | background = 'light-blue', textOutput('rq2'))), 45 | fluidRow(box(title = "Hypothesis", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, 46 | background = 'light-blue', htmlOutput('rq3'))), 47 | column(4, box(title = "Aman, 2013", status = "primary", solidHeader = TRUE, collapsible = TRUE, width = 12, 48 | background = 'teal', img(src = 'Aman 2013.png', height = 150, width = 290,textOutput('rq4')))), 49 | column(4, box(title = "Strycharz et al., 2018", status = "primary", solidHeader = TRUE, collapsible = TRUE, width = 12, 50 | background = 'teal', img(src = 'Strycharz et al., 2018.png', height = 150, width = 290, textOutput('rq5')))), 51 | column(4, box(title = "Engelberg & Parsons, 2011", status = "primary", solidHeader = TRUE, collapsible = TRUE, width = 12, 52 | background = 'teal', img(src = 'Engelberg & Parsons, 2011 .png', height = 150, width = 290, textOutput('rq6'))))), 53 | 54 | tabItem(tabName = 'proj1', 55 | fluidRow(box(title = "Scraping Process", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, 56 | background = 'light-blue', textOutput('rq7'))), 57 | column(6, box(title = "Single Day WSJ News Archive", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, 58 | background = 'light-blue', img(src = 'WSJ Archives.png', height = 300, width = 475))), 59 | column(6, box(title = "WSJ News Article", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, 60 | background = 'light-blue', img(src = 'WSJ Article.png', height = 300, width = 475))), 61 | column(12, box(title = "Types of Variables Scraped", status = "primary", solidHeader = TRUE, collapsible = TRUE, width = 12, 62 | background = 'teal', (DT::dataTableOutput("table1"))))), 63 | tabItem(tabName = 'wsj', 64 | fluidRow(box(title = "Wall Street Journal Exploratory Data Analysis", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, background = 'light-blue', textOutput('rq14'))), 65 | 66 | column(6, (box(sliderInput('maxwords', '# of words', min = 50, max = 400, value = 400), width = 12))), 67 | 68 | column(6, (box(sliderInput('sections', '# of sections', min = 5, max = 15, value = 10), width = 12))), 69 | 70 | column(6, (box(plotOutput("wordcloud1"), width = 12))), 71 | 72 | column(6, (box(plotOutput("plot2"), width = 12)))), 73 | tabItem(tabName = 'wsj2', 74 | fluidRow(box(title = "Wall Street Journal Data - Cleaning & Merging Process", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, background = 'light-blue', textOutput('rq8'))), 75 | column(6, box(title = 'Downloading SPX Data from Yahoo Finance', status = "primary", solidHeader = TRUE, background = 'light-blue', collapsible = TRUE, width = 12, img(src='Yahoo Finance.png', height = 240, width = 
475))), 76 | column(6, box(title = 'Merging the dataframes', status = "primary", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', width = 12, htmlOutput('rq9'))), 77 | column(6, box(title = 'NLP Analysis - TextBlob', status = "primary", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', width = 12, htmlOutput('rq10'))), 78 | column(6, box(title = 'NLP Analysis - VADER', status = "primary", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', width = 12, htmlOutput('rq11')))), 79 | tabItem(tabName = 'wsj3', 80 | fluidRow(box(title = "S&P Regression Analysis", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, background = 'light-blue', textOutput('rq12'))), 81 | column(4, box(title = "TextBlob Independent Variables Only", status = "primary", solidHeader = TRUE, collapsible = TRUE, width = 12, background = 'light-blue', img(src = "WSJ Paragraph Results.png", height = 300, width = 290))), 82 | column(4, box(title = "VADER Independent Variables Only", status = "primary", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', img(src = 'vaderonlyspx.png', height = 300, width = 290), width = 12)), 83 | column(4, box(title = "All Sentiment Analysis Variables", status = "primary", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', img(src = 'allvariablesspx.png', height = 300, width = 290), width = 12))), 84 | tabItem(tabName = 'proj2', 85 | column(6, box(title = "TextBlob Polarity Scores", status = "info", solidHeader = TRUE, collapsible = TRUE, textOutput('rq15'), background = 'light-blue', width = 12)), 86 | column(6, box(title = "TextBlob Subjectivity Scores", status = "info", solidHeader = TRUE, collapsible = TRUE, textOutput('rq16'), background = 'light-blue', width = 12)), 87 | column(6, box(plotOutput('plot3'), width = 12)), 88 | column(6, box(plotOutput('plot4'), width = 12))), 89 | tabItem(tabName = 'wsj5', 90 | column(6, box(title = "All NLP Variables as Independent Variables", status = "info", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', htmlOutput('rq21'), width = 12)), 91 | column(6, box(title = "VADER Negativity only as Independent Variable", status = "info", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', htmlOutput('rq22'), width = 12)), 92 | column(6, box(title = "All NLP Variables as Independent Variables", status = "primary", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', img(src = 'allvars.png', height = 400, width = 475), width = 12)), 93 | column(6, box(title = "VADER Negativity only as Independent Variable", status = "primary", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', img(src = 'negativeonly.png', height = 400, width = 475), width = 12))), 94 | tabItem(tabName = 'wsj4', 95 | column(6, box(title = "VADER Positivity Score", status = "info", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', textOutput('rq19'), width = 12)), 96 | column(6, box(title = "VADER Negativity Score", status = "info", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', textOutput('rq20'), width = 12)), 97 | column(6, box(plotOutput('plot5'), width = 12)), 98 | column(6, box(plotOutput('plot6'), width = 12))) 99 | )) 100 | )) 101 | 102 | -------------------------------------------------------------------------------- /app/server.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | library(dplyr) 3 | library(ggplot2) 4 | library(wordcloud) 5 
| library(tidyverse) 6 | library(tidytext) 7 | library(shinythemes) 8 | library(shiny) 9 | 10 | shinyServer(function(input, output){ 11 | output$rq1 <- renderUI({ 12 | HTML("Research Questions:

1) Number of Comments
Can a statistically significant, causal relationship be demonstrated between a Wall Street Journal (\"WSJ\") article's degree of subjectivity/objectivity and positivity/negativity in its writing (as defined by widely used sentiment analysis libraries), and the number of online comments posted by readers for that article?

2) S&P 500
Can a statistically significant causal relationship be demonstrated between the WSJ's coverage of financial news, specifically WSJ articles’ degree of subjectivity/objectivity and positivity/negativity polarity on a given day t as a whole, and stock price movements from the S&P 500 Index on that same day t? ") 13 | }) 14 | output$rq2 <- renderText({ 15 | paste0("Theory: Information dissemination is a key component of market efficiency. The Efficient Markets Hypothesis states that share prices should reflect all relevant information, and media outlets like the WSJ play a key role in disseminating that information.") 16 | }) 17 | output$rq3 <- renderUI({ 18 | HTML("Hypothesis #1: I would expect articles that score higher on the subjectivity and polarity indices to be shared more widely, since more biased and emotional articles should engender a greater response and circulate further among groups that share similar political opinions and views in a sort of \"echo chamber\" effect, giving them a higher likelihood of being read and commented on.

Hypothesis #2: Given the WSJ’s wide readership in the financial world, its wide coverage of financial news and previous findings in the literature indicating a weak relationship between frequency, objectivity and emotionality of media coverage and financial markets performance, I would expect some sort of weak relationship between WSJ news coverage and stock market performance.") 19 | }) 20 | output$rq4 <- renderText({ 21 | paste0("Our evidence clearly indicates that crash frequency increases with media coverage and its seasonal concentration. This key finding supports the notion that intensive media reports on a firm provoke extremely large reactions in the market to corporate news.") 22 | }) 23 | output$rq5 <- renderText({ 24 | paste0("A positive relation of the amount of coverage and emotionality with the fluctuation of stock prices was detected for Shell and Philips. In addition, corporate topics were found to positively Granger cause stock price fluctuation, particularly for Philips. The study advances past research in showing that the prediction of stock price fluctuation based on media coverage can be improved by including sentiment, emotionality, and corporate topics.") 25 | }) 26 | output$rq6 <- renderText({ 27 | paste0("For all earnings announcements of S&P 500 Index firms, we find that local media coverage strongly predicts local trading, after controlling for earnings, investor, and newspaper characteristics. Moreover, local trading is strongly related to the timing of local reporting, a particular challenge to nonmedia explanations.") 28 | }) 29 | output$rq7 <- renderText({ 30 | paste0("With the goal of collecting as much text data as possible for sentiment analysis, 22,772 full text WSJ articles published between 2019-2020 were scraped off the Wall Street Journal's 'News Archive' section.") 31 | }) 32 | output$rq8 <- renderText({ 33 | paste0("As the end goal for our S&P regression analysis was to concatenate all paragraph text data for a given day as a single cell value linked to a single day, empty text observations did not have to be deleted, which reduced data processing time considerably. Text data of paragraphs published on the same day were concatenated using groupby and by applying a lambda join function. The resulting data was merged with a dataframe of stock prices and trading volumes for a given day for our S&P regression analysis.") 34 | }) 35 | output$rq9 <- renderUI({ 36 | HTML("DataFrame1: Scraped dataframe of 232 unique dates with scraped WSJ paragraph text, headline text, sub-headline text, date published, author name, section name and number of comments data

DataFrame2: A dataframe of historical SPX open and close prices and trading volumes from finance.yahoo.com

Merged DataFrame: Using an inner join of the two dataframes, the resulting dataframe has 158 unique observations with article text and SPX data

The merged dataframe necessarily has fewer observations because equity markets are closed on weekends and holidays.") 37 | }) 38 | output$rq10 <- renderUI({ 39 | HTML("TextBlob: a Python API for common NLP tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, and translation

Polarity: a float in the range [-1,1], where 1 indicates a positive statement and -1 a negative statement

Subjectivity: a float in the range [0,1], where 1 indicates a subjective statement and 0 an objective statement.") 40 | }) 41 | output$rq11 <- renderUI({ 42 | HTML("Valence Aware Dictionary and sEntiment Reasoner (“VADER”): an NLP sentiment analysis model, available through the NLTK package, that outputs polarity (positive/negative) and intensity of emotion

Negative: a float in the range [0,1] representing the negativity score

Neutral: a float in the range [0,1] representing the neutrality score

Positive: a float in the range [0,1] representing the positivity score

Compound: Computed by normalizing the negative, neutral and positive scores. ") 43 | }) 44 | output$rq12 <- renderText({ 45 | paste0("The Adj R^2 values of ~0.01 and high p-values shows these are poor models with low predictive power, even with the VADER neutral values and SPX Volume numbers added as extra independent variables. As expected, our results show WSJ sentiment analysis has low predictive power in relation to SPX moves, although, quite interestingly, TextBlob polarity of WSJ article is significant to the 10% level. This finding is interesting in the context of our previous EDA of polarity vs number of comments of article comments which had a quasi-flat relationship. Perhaps articles that score higher on the polarity index fall on more volatile days in the stock market, leading to more emotionally polarized articles. There needs to be more data points in our analysis, however, to drive better, more statistically significant results, as a next step for this analysis going forward.") 46 | }) 47 | output$rq13 <- renderText({ 48 | paste0("As a next step, I would want to increase the number of datapoints and limiting our text data to only scraping headlines of the WSJ and other publications in order to see if this approach would yield more interesting and statistically significant results...") 49 | }) 50 | output$plot2 <- renderPlot({ 51 | head(sectionsdf, input$sections) %>% ggplot(., aes(reorder(SectionName, AverageComments), AverageComments)) + geom_bar(aes(fill = AverageComments), stat = 'identity') + theme(axis.text.x = element_text(angle = 90)) + ggtitle("WSJ Sections with Highest Average Number of Comments per article") + xlab('Section Name') + ylab("Average Number of Comments per article") 52 | }) 53 | output$rq14 <- renderText({ 54 | paste0("Interestingly, the sections with the highest average number of comments per article are the Opinion, Politics, U.S. and Influencers sections, which can all be commonly thought of as rubrics dealing with more polarizing and rhetorically biased topics that tend to generate a lot of online discussion and commentary.") 55 | }) 56 | output$plot3 <- renderPlot({ 57 | wsj2 %>% ggplot(., aes(Comments. , polarity)) + geom_point(shape=18, color="blue") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("WSJ Polarity Score vs Number of Comments") + xlab('Number of Comments') + ylab("Polarity score") + geom_smooth(method=lm, linetype="dashed", color="darkred", fill="blue") 58 | }) 59 | output$plot4 <- renderPlot({ 60 | wsj2 %>% ggplot(., aes(Comments. , subjectivity)) + geom_point(shape=18, color="blue") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("WSJ Subjectivity Score vs Number of Comments") + xlab('Number of Comments') + ylab("Subjectivity score") + geom_smooth(method=lm, linetype="dashed", color="darkred", fill="blue") 61 | }) 62 | output$plot5 <- renderPlot({ 63 | wsj3 %>% ggplot(., aes(Comments. , positive)) + geom_point(shape=18, color="green") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("WSJ Positivity Score vs Number of Comments") + xlab('Number of Comments') + ylab("Positivity score") + geom_smooth(method=lm, linetype="dashed", color="darkred", fill="blue") 64 | }) 65 | output$plot6 <- renderPlot({ 66 | wsj3 %>% ggplot(., aes(Comments. 
, negative)) + geom_point(shape=18, color="green") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("WSJ Negativity Score vs Number of Comments") + xlab('Number of Comments') + ylab("Negativity score") + geom_smooth(method=lm, linetype="dashed", color="darkred", fill="blue") 67 | }) 68 | output$rq15 <- renderText({ 69 | paste0("Polarity of an article doesn't seem to have as much explanatory power in relation to the number of comments generated. Polarity scores for WSJ articles also seem to be very tightly distributed around the mean.") 70 | }) 71 | output$rq16 <- renderText({ 72 | paste0("Subjectivity of an article seems to explain more of the variation in number of comments posted on that article. Subjectivity scores also appear to vary more widely than polarity scores and seem to have more outliers, which may have to be addressed later on.") 73 | }) 74 | #CommentsRegressionDiscussion 75 | output$rq17 <- renderText({ 76 | paste0("Subjectivity of an article seems to explain more of the variation in number of comments posted on that article. Subjectivity scores also appear to vary more widely than polarity scores and seem to have more outliers, which may have to be addressed later on.") 77 | }) 78 | output$rq18 <- renderText({ 79 | paste0("Placeholder") 80 | }) 81 | output$rq19 <- renderText({ 82 | paste0("The relationship between positivity score and number of comments is weaker than I expected as I would have thought that more optimistic and feel-good articles would be shared more widely, while the below graph seems to show there is close to no relationship between VADER positivity score and number of comments posted on that article.") 83 | }) 84 | output$rq20 <- renderText({ 85 | paste0("A seemingly high positive correlation between VADER negativity score and number of comments was perhaps the most interesting and surprising finding, at least at first glance, as I would have initially theorized that more pessimistic and negative articles would be read and shared less amongst groups and therefore lead to a lower number of posted comments for that article.") 86 | }) 87 | output$rq21 <- renderUI({ 88 | HTML("As evidenced by the very low adj R^2 of 0.01 this is a poor model for explaining the variance in number of comments posted on WSJ articles.
We cannot reject the null hypothesis that the beta coefficients of the polarity, subjectivity and positivity variables are zero, based on the output below.

Interestingly, however, as I had expected from our EDA of VADER negativity scores in pt3, negativity scores are statistically significant at the 1% level in predicting the number of comments posted on WSJ articles. ") 89 | }) 90 | output$rq22 <- renderUI({ 91 | HTML("Unsurprisingly, a standalone linear regression model with just the negativity score as a predictor yields a similarly low Adj R^2 value.
Although this would still be a poor model for predicting the number of comments on WSJ articles, its simplicity makes it preferable to the linear model that includes all sentiment analysis variables. The positive and significant relationship between an article's negativity score and its number of comments may arise because highly negative WSJ articles are more likely to report events that prompt a public showing of grief or support, such as the death of a public figure or another tragedy affecting large numbers of people, and would therefore attract more comments.") 92 | }) 93 | 94 | output$table1 <- DT::renderDataTable(formatStyle(datatable(variabledata, rownames = FALSE), columns = 1:3, color = 'white')) 95 | 96 | output$wordcloud1 <- renderPlot(wordcloud(docs, scale=c(5,0.5), max.words=input$maxwords, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, 'Dark2'))) 97 | }) 98 | --------------------------------------------------------------------------------