├── app ├── global.R ├── ui.R └── server.R ├── Dockerfile ├── data └── wsjsections.csv ├── .gitignore ├── README.md ├── objectives.md └── scraping └── scrape.py /app/global.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(forcats) 3 | library(tm) 4 | library(wordcloud) 5 | library(memoise) 6 | library(SnowballC) 7 | library(RColorBrewer) 8 | library(ggplot2) 9 | 10 | # 11 | variabledata <- read.csv("data/variabletypes.csv") 12 | # wsjtextdata = readLines('data/wsjtextdata.txt') 13 | # docs <- Corpus(VectorSource(wsjtextdata)) 14 | 15 | docs <- Corpus(DirSource('text/')) 16 | 17 | #docs <- tm_map(docs, stripWhitespace) 18 | 19 | #docs <- tm_map(docs, tolower) 20 | 21 | #docs <- tm_map(docs, stemDocument) 22 | 23 | sectionsdf <- read.csv('data/wsjsections.csv') 24 | sectionsdf = sectionsdf[order(-sectionsdf$AverageComments),] 25 | 26 | wsj2 = read.csv('data/wsj2.csv') 27 | 28 | wsj3 = read.csv('data/wsj3.csv') 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.9-slim 3 | 4 | # Set the working directory in the container 5 | WORKDIR /usr/src/app 6 | 7 | # Install system dependencies 8 | RUN apt-get update && apt-get install -y \ 9 | build-essential \ 10 | libffi-dev \ 11 | libssl-dev \ 12 | python3-dev 13 | 14 | # Copy the current directory contents into the container at /usr/src/app 15 | COPY . . 16 | 17 | # Install any needed packages specified in requirements.txt 18 | RUN pip install --no-cache-dir -r requirements.txt 19 | 20 | # Make port 8888 available to the world outside this container 21 | EXPOSE 8888 22 | 23 | # Run jupyter notebook when the container launches 24 | CMD ["jupyter", "notebook", "--ip='0.0.0.0'", "--port=8888", "--no-browser", "--allow-root"] 25 | -------------------------------------------------------------------------------- /data/wsjsections.csv: -------------------------------------------------------------------------------- 1 | SectionName,AverageComments 2 | ART REVIEW,4.846153846 3 | BUSINESS,60.63188168 4 | OPINION,282.4523077 5 | U.S.,151.3304647 6 | LIFE & ARTS,66.1961039 7 | MARKETS,28.72707424 8 | BOOKS,15.86723164 9 | PRO BANKRUPTCY,6 10 | WORLD,119.4365013 11 | ECONOMY,112.479798 12 | REAL ESTATE,34.94545455 13 | WSJ LOGISTICS REPORT,1.4 14 | POLITICS,367.9603399 15 | WSJ. 
MAGAZINE,6.178571429 16 | RISK & COMPLIANCE JOURNAL,3.707317073 17 | TECH,76.36797753 18 | LOGISTICS REPORT,7.054054054 19 | EXHIBITION REVIEW,4.545454545 20 | CFO JOURNAL,3.895833333 21 | PERSONAL BOARD OF DIRECTORS,6 22 | DANCE REVIEW,1 23 | CMO TODAY,77.20588235 24 | WHATS NEWS BUSINESS FINANCE,1 25 | CIO JOURNAL,5.363636364 26 | UP NEXT,6.875 27 | TELEVISION REVIEW,14.17857143 28 | MANAGEMENT,37.83333333 29 | ARTS,44.82258065 30 | PRO PE EXITS,1 31 | MUSIC REVIEW,3.818181818 32 | PRO VC INDUSTRY NEWS,4 33 | SHIPPING MATTERS,20.25 34 | OPERA REVIEW,2.25 35 | FILM REVIEW,23.47826087 36 | PRO BANKRUPTCY INVESTORS,1 37 | MASTERPIECE,9.818181818 38 | PRO PE DEALS,3 39 | THE ARTIST,3 40 | ARTS & ENTERTAINMENT,11.83333333 41 | PRO VC NEW MONEY,4 42 | FOOD,7 43 | TRENDING,53.5 44 | ARTIFICIAL INTELLIGENCE,8 45 | PRO BANKRUPTCY DISTRESS,37 46 | EXPERIENCE REPORT,10 47 | THE CAPTAIN CLASS,75.5 48 | A-HED,62.43478261 49 | PRO PE LEGAL REGULATORY,15 50 | ON THE COVER,16 51 | PRO VC COMMENTARY ANALYSIS,25 52 | CENTRAL BANKS RESEARCH,48 53 | INFLUENCERS,134 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | pip-wheel-metadata/ 20 | share/python-wheels/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | MANIFEST 25 | 26 | # Virtual environments 27 | .env 28 | .venv 29 | env/ 30 | venv/ 31 | ENV/ 32 | env.bak/ 33 | venv.bak/ 34 | 35 | # Jupyter Notebook 36 | .ipynb_checkpoints 37 | 38 | # IPython 39 | profile_default/ 40 | ipython_config.py 41 | 42 | # pyenv 43 | .python-version 44 | 45 | # pipenv 46 | Pipfile.lock 47 | 48 | # PEP 582 49 | __pypackages__/ 50 | 51 | # Celery 52 | celerybeat-schedule 53 | celerybeat.pid 54 | 55 | # SageMath parsed files 56 | *.sage.py 57 | 58 | # Environments 59 | .env 60 | .env.local 61 | .env.development.local 62 | .env.test.local 63 | .env.production.local 64 | 65 | # Spyder project settings 66 | .spyderproject 67 | .spyproject 68 | 69 | # Rope project settings 70 | .ropeproject 71 | 72 | # mkdocs documentation 73 | /site 74 | 75 | # mypy 76 | .mypy_cache/ 77 | .dmypy.json 78 | dmypy.json 79 | 80 | # Pyre type checker 81 | .pyre/ 82 | 83 | # R 84 | .Rhistory 85 | .RData 86 | .Ruserdata 87 | .Rproj.user/ 88 | 89 | # R Shiny 90 | rsconnect/ 91 | 92 | # Data files 93 | *.csv 94 | *.xlsx 95 | *.xls 96 | *.json 97 | *.parquet 98 | *.pickle 99 | *.pkl 100 | 101 | # Logs 102 | *.log 103 | logs/ 104 | 105 | # OS generated files 106 | .DS_Store 107 | .DS_Store? 108 | ._* 109 | .Spotlight-V100 110 | .Trashes 111 | ehthumbs.db 112 | Thumbs.db 113 | 114 | # IDE 115 | .vscode/ 116 | .idea/ 117 | *.swp 118 | *.swo 119 | *~ 120 | 121 | # Docker 122 | .dockerignore 123 | 124 | # Temporary files 125 | *.tmp 126 | *.temp 127 | temp/ 128 | tmp/ 129 | 130 | # Scraped data (if large) 131 | scraped_data/ 132 | raw_data/ 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WSJ Web Scraping & NLP Analysis 2 | 3 | A comprehensive analysis of Wall Street Journal articles to investigate relationships between article sentiment, reader engagement, and financial market movements. 
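For a quick sense of the sentiment scoring described under [Natural Language Processing](#natural-language-processing) below, here is a minimal, illustrative sketch of how a single article's text can be scored with the two libraries used in this project. The example text and variable names are hypothetical and not taken from the project code.

```python
# Illustrative sketch only -- example text and variable names are hypothetical.
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

text = "Stocks rallied sharply after a surprisingly strong jobs report."

# VADER returns negative, neutral, positive, and compound scores as a dict
vader_scores = SentimentIntensityAnalyzer().polarity_scores(text)
print(vader_scores)  # {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}

# TextBlob returns polarity in [-1, 1] and subjectivity in [0, 1]
blob = TextBlob(text)
print(blob.sentiment.polarity, blob.sentiment.subjectivity)
```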
4 | 5 | ## Table of Contents 6 | 7 | - [Project Overview](#project-overview) 8 | - [Research Questions](#research-questions) 9 | - [Data Collection](#data-collection) 10 | - [Natural Language Processing](#natural-language-processing) 11 | - [Research Findings](#research-findings) 12 | - [Project Structure](#project-structure) 13 | - [Getting Started](#getting-started) 14 | - [Docker Setup](#docker-setup) 15 | - [Results & Applications](#results--applications) 16 | 17 | ## Project Overview 18 | 19 | This project investigates the relationship between Wall Street Journal article sentiment and two key metrics: 20 | - **User engagement** (measured by comment count) 21 | - **S&P 500 market returns** 22 | 23 | The analysis leverages web scraping and natural language processing to extract insights from 22,772 WSJ articles published between January 2019 and July 2020. 24 | 25 | ## Research Questions 26 | 27 | 1. **Comment Engagement Analysis**: Can a statistically significant relationship be demonstrated between a WSJ article's degree of subjectivity/objectivity and positivity/negativity in its writing and the number of online comments posted by readers? 28 | 29 | 2. **Market Prediction Analysis**: Can a statistically significant relationship be demonstrated between WSJ articles' sentiment polarity on day t and S&P 500 Index movements on day t + n (where 0 ≤ n ≤ 3)? 30 | 31 | ## Data Collection 32 | 33 | - **Source**: Wall Street Journal [news archives](https://www.wsj.com/news/archive/years) 34 | - **Method**: Python Selenium web scraping 35 | - **Dataset**: 22,772 full-text articles (Jan 2019 - July 2020) 36 | - **Data Points per Article**: 37 | - Article text and headline 38 | - Sub-headline and publication date 39 | - Author name and rubric category 40 | - Number of comments 41 | 42 | ## Natural Language Processing 43 | 44 | ### Libraries Used 45 | - **VADER** (Valence Aware Dictionary and sEntiment Reasoner) 46 | - **TextBlob** 47 | 48 | ### VADER Analysis 49 | - **Purpose**: Polarity and emotion intensity scoring 50 | - **Output Variables**: `negative`, `neutral`, `positive`, `compound` 51 | - **Documentation**: [VADER Sentiment](https://pypi.org/project/vaderSentiment/) 52 | 53 | ### TextBlob Analysis 54 | - **Purpose**: Sentiment analysis and subjectivity scoring 55 | - **Output Variables**: `polarity`, `subjectivity` 56 | - **Documentation**: [TextBlob Documentation](https://textblob.readthedocs.io/en/dev/) 57 | 58 | ## Research Findings 59 | 60 | ### Comment Engagement Results 61 | - **Model Performance**: Simple linear regression shows poor predictive power (Adj R² = 0.014) 62 | - **Statistical Significance**: Cannot reject null hypothesis (p-value = 0.2045) 63 | - **Key Finding**: VADER negativity scores are statistically significant at 1% level 64 | - **Interpretation**: Higher negativity may correlate with events generating public response (e.g., public figure deaths) 65 | 66 | ### Market Prediction Results 67 | - **Model Performance**: Low predictive power across all models (Adj R² ≈ 0.01) 68 | - **Analysis Scope**: Four regression models testing same-day and next-day S&P 500 movements 69 | - **Key Finding**: TextBlob polarity shows significance at 10% level 70 | - **Conclusion**: WSJ sentiment has limited predictive power for market movements 71 | 72 | ## Project Structure 73 | 74 | ``` 75 | WSJ_WebScraping_NLP/ 76 | ├── app/ # R Shiny application 77 | │ ├── global.R 78 | │ ├── server.R 79 | │ └── ui.R 80 | ├── data/ # Processed datasets 81 | │ └── wsjsections.csv 82 | ├── 
notebooks/ # Jupyter analysis notebooks 83 | │ └── WSJ_Scraping NLP_Analysis.ipynb 84 | ├── scraping/ # Web scraping scripts 85 | │ └── scrape.py 86 | ├── Dockerfile # Container configuration 87 | ├── README.md # Project documentation 88 | └── objectives.md # Detailed project objectives 89 | ``` 90 | 91 | ## Getting Started 92 | 93 | ### Prerequisites 94 | - Python 3.9+ 95 | - R (for Shiny app) 96 | - Docker (optional) 97 | 98 | ### Local Setup 99 | 1. Clone the repository 100 | 2. Install Python dependencies: `pip install -r requirements.txt` 101 | 3. Run the Jupyter notebook for analysis 102 | 4. Launch R Shiny app for interactive visualization 103 | 104 | ## Docker Setup 105 | 106 | ### Building the Container 107 | ```bash 108 | docker build -t wsj-nlp-analysis . 109 | ``` 110 | 111 | ### Running the Container 112 | ```bash 113 | docker run -p 8888:8888 wsj-nlp-analysis 114 | ``` 115 | 116 | ### Accessing the Application 117 | - Open your browser and navigate to `http://localhost:8888` 118 | - The Jupyter notebook interface will be available 119 | - Use the provided token for authentication 120 | 121 | ### Stopping the Container 122 | ```bash 123 | docker stop 124 | ``` 125 | 126 | ## Results & Applications 127 | 128 | ### Interactive Dashboard 129 | - **R Shiny App**: [Live Application](https://philippe1.shinyapps.io/WSJApp2/) 130 | - **Features**: Interactive sentiment analysis visualization and data exploration 131 | 132 | ### Documentation 133 | - **Blog Post**: [Detailed Analysis](https://nycdatascience.com/blog/student-works/scraping-wall-street-journal-article-data-to-measure-online-reader-engagement-an-nlp-analysis/) 134 | - **Objectives**: See `objectives.md` for detailed project goals and methodology 135 | 136 | -------------------------------------------------------------------------------- /objectives.md: -------------------------------------------------------------------------------- 1 | # Project Objectives 2 | 3 | ## Primary Research Goals 4 | 5 | ### 1. Sentiment-Engagement Relationship Analysis 6 | **Objective**: Investigate whether article sentiment characteristics influence reader engagement on Wall Street Journal articles. 7 | 8 | **Specific Aims**: 9 | - Determine if subjective vs. objective writing style correlates with comment volume 10 | - Analyze the relationship between positive/negative sentiment and reader engagement 11 | - Identify which sentiment metrics (VADER vs. TextBlob) provide better predictive power 12 | - Quantify the statistical significance of sentiment-engagement relationships 13 | 14 | **Success Metrics**: 15 | - Statistical significance testing (p < 0.05) 16 | - Model performance evaluation (R², adjusted R²) 17 | - Identification of key sentiment variables driving engagement 18 | 19 | ### 2. Market Prediction Analysis 20 | **Objective**: Explore whether WSJ article sentiment can predict S&P 500 market movements. 21 | 22 | **Specific Aims**: 23 | - Test predictive power of same-day sentiment on same-day market returns 24 | - Analyze lagged effects (1-3 day prediction windows) 25 | - Compare different sentiment analysis approaches for market prediction 26 | - Control for market volume and other confounding variables 27 | 28 | **Success Metrics**: 29 | - Statistical significance of sentiment variables in market prediction models 30 | - Model performance comparison across different time horizons 31 | - Identification of optimal sentiment indicators for market prediction 32 | 33 | ## Technical Objectives 34 | 35 | ### 3. 
Data Collection & Processing 36 | **Objective**: Build a robust web scraping and data processing pipeline. 37 | 38 | **Specific Aims**: 39 | - Scrape comprehensive article metadata from WSJ archives 40 | - Implement reliable data cleaning and preprocessing workflows 41 | - Ensure data quality and consistency across the dataset 42 | - Create reproducible data collection processes 43 | 44 | **Success Metrics**: 45 | - Complete dataset of 20,000+ articles 46 | - High data quality (minimal missing values, consistent formatting) 47 | - Reproducible scraping pipeline 48 | 49 | ### 4. Natural Language Processing Implementation 50 | **Objective**: Apply state-of-the-art NLP techniques for sentiment analysis. 51 | 52 | **Specific Aims**: 53 | - Implement VADER sentiment analysis for emotion intensity scoring 54 | - Apply TextBlob for polarity and subjectivity analysis 55 | - Compare and validate different NLP approaches 56 | - Create comprehensive sentiment feature engineering 57 | 58 | **Success Metrics**: 59 | - Successful implementation of both VADER and TextBlob 60 | - Consistent sentiment scoring across the dataset 61 | - Validation of sentiment analysis accuracy 62 | 63 | ### 5. Statistical Analysis & Modeling 64 | **Objective**: Conduct rigorous statistical analysis to test research hypotheses. 65 | 66 | **Specific Aims**: 67 | - Perform linear regression analysis for engagement prediction 68 | - Implement time-series analysis for market prediction 69 | - Apply appropriate statistical tests and model validation 70 | - Control for confounding variables and bias 71 | 72 | **Success Metrics**: 73 | - Properly specified statistical models 74 | - Appropriate handling of statistical assumptions 75 | - Clear interpretation of results and limitations 76 | 77 | ## Methodological Objectives 78 | 79 | ### 6. Reproducible Research 80 | **Objective**: Ensure all analysis is fully reproducible and well-documented. 81 | 82 | **Specific Aims**: 83 | - Create clear documentation for all analysis steps 84 | - Implement version control for code and data 85 | - Provide detailed methodology descriptions 86 | - Share code and data where appropriate 87 | 88 | **Success Metrics**: 89 | - Complete code documentation 90 | - Reproducible analysis notebooks 91 | - Clear methodology documentation 92 | 93 | ### 7. Interactive Visualization 94 | **Objective**: Create user-friendly interfaces for exploring results. 95 | 96 | **Specific Aims**: 97 | - Develop R Shiny dashboard for interactive analysis 98 | - Create visualizations for sentiment trends and patterns 99 | - Enable user exploration of specific articles and time periods 100 | - Provide intuitive data exploration tools 101 | 102 | **Success Metrics**: 103 | - Functional interactive dashboard 104 | - Clear and informative visualizations 105 | - User-friendly interface design 106 | 107 | ## Research Questions 108 | 109 | ### Primary Questions 110 | 1. **Engagement Question**: Does article sentiment (subjectivity, polarity, emotionality) significantly predict the number of comments posted on WSJ articles? 111 | 112 | 2. **Market Question**: Can WSJ article sentiment on day t predict S&P 500 returns on day t + n (where n = 0, 1, 2, 3)? 113 | 114 | ### Secondary Questions 115 | 3. **Sentiment Comparison**: Which sentiment analysis approach (VADER vs. TextBlob) provides better predictive power for engagement and market movements? 116 | 117 | 4. **Temporal Patterns**: Are there temporal patterns in sentiment that correlate with market volatility or reader engagement? 
118 | 119 | 5. **Content Analysis**: Do certain types of articles (by section, author, or topic) show stronger sentiment-engagement or sentiment-market relationships? 120 | 121 | ## Expected Outcomes 122 | 123 | ### Positive Outcomes 124 | - Identification of significant sentiment-engagement relationships 125 | - Discovery of predictive sentiment patterns for market movements 126 | - Development of robust NLP analysis pipeline 127 | - Creation of valuable dataset for future research 128 | 129 | ### Potential Limitations 130 | - Limited predictive power due to market complexity 131 | - Potential confounding variables not controlled for 132 | - Temporal limitations of the dataset (2019-2020) 133 | - Possible selection bias in comment engagement 134 | 135 | ## Success Criteria 136 | 137 | ### Minimum Viable Results 138 | - Complete data collection and processing pipeline 139 | - Successful implementation of sentiment analysis 140 | - Statistical analysis of both research questions 141 | - Basic visualization and reporting of results 142 | 143 | ### Optimal Results 144 | - Statistically significant findings for at least one research question 145 | - Strong model performance (R² > 0.1) for engagement prediction 146 | - Identification of actionable insights for content strategy 147 | - Publication-quality analysis and documentation 148 | 149 | ## Timeline & Milestones 150 | 151 | ### Phase 1: Data Collection (Weeks 1-2) 152 | - Set up web scraping infrastructure 153 | - Collect initial dataset 154 | - Implement data cleaning and preprocessing 155 | 156 | ### Phase 2: NLP Implementation (Weeks 3-4) 157 | - Implement VADER and TextBlob analysis 158 | - Create sentiment feature engineering pipeline 159 | - Validate sentiment analysis results 160 | 161 | ### Phase 3: Statistical Analysis (Weeks 5-6) 162 | - Conduct engagement prediction analysis 163 | - Perform market prediction analysis 164 | - Apply appropriate statistical tests 165 | 166 | ### Phase 4: Visualization & Reporting (Weeks 7-8) 167 | - Develop R Shiny dashboard 168 | - Create comprehensive visualizations 169 | - Document findings and methodology 170 | - Prepare final report and presentation 171 | -------------------------------------------------------------------------------- /scraping/scrape.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | import os 4 | import sys 5 | import time 6 | 7 | from selenium import webdriver 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.ui import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | from selenium.webdriver.chrome.options import Options 12 | from selenium.common.exceptions import ( 13 | NoSuchElementException, 14 | StaleElementReferenceException, 15 | ) 16 | from webdriver_manager.chrome import ChromeDriverManager 17 | 18 | logging.basicConfig( 19 | format="%(asctime)s, %(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", 20 | datefmt="%Y-%m-%d:%H:%M:%S", 21 | level=logging.INFO, 22 | stream=sys.stdout, 23 | ) 24 | 25 | URL = "https://www.wsj.com/news/archive/2020/march" 26 | ARG_WINDOW_SIZE = "--window-size=1920,1080" 27 | 28 | 29 | class SeleniumScraper: 30 | def __init__(self): 31 | self.url = URL 32 | self.driver = self.create_driver() 33 | 34 | def _create_options(self): 35 | # Set Chrome browser options 36 | self.chrome_options = Options() 37 | self.chrome_options.add_argument(ARG_WINDOW_SIZE) 38 | prefs = 
{"profile.managed_default_content_settings.images": 2} 39 | self.chrome_options.add_experimental_option("prefs", prefs) 40 | return self.chrome_options 41 | 42 | def create_driver(self): 43 | # Create Chrome browser options 44 | self._create_options() 45 | # Create webdriver 46 | driver = webdriver.Chrome( 47 | ChromeDriverManager().install(), options=self.chrome_options 48 | ).get(URL) 49 | return driver 50 | 51 | def wait(self, secs=2): 52 | time.sleep(secs) 53 | 54 | 55 | class ScrapeFlow(SeleniumScraper): 56 | def __init__(self): 57 | super().__init__() 58 | self.user = os.environ.get("USER") 59 | self.pw = os.environ.get("PASS") 60 | self._prep_output_file("wsj_articles.csv") 61 | 62 | def main(self): 63 | self.signin() 64 | self.wait(1) 65 | self.get_daylinks() 66 | self.wait(2) 67 | self.parse_daylinks() 68 | 69 | def _prep_output_file(self, filename): 70 | self.csv_file = open(filename, "w", encoding="utf-8", newline="") 71 | self.writer = csv.writer(self.csv_file) 72 | 73 | def signin(self): 74 | """Send username and password env vars to signin form fields and press submit button""" 75 | # Click signin button 76 | sign_in_link = self.driver.find_element_by_link_text("Sign In") 77 | sign_in_link.click() 78 | self.wait(2) 79 | # Find username and pw fields 80 | username = WebDriverWait(self.driver, 10).until( 81 | EC.element_to_be_clickable((By.ID, "username")) 82 | ) 83 | password = WebDriverWait(self.driver, 10).until( 84 | EC.element_to_be_clickable((By.ID, "password")) 85 | ) 86 | # Input username and pw 87 | username.send_keys(self.user) 88 | password.send_keys(self.pw) 89 | # Find and click submit button once username and pw inputted 90 | submit_button = self.driver.find_element_by_xpath( 91 | ".//button[@type='submit'][@class='solid-button basic-login-submit']" 92 | ) 93 | submit_button.click() 94 | 95 | def get_daylinks(self): 96 | self.daylinks = self.driver.find_elements_by_xpath( 97 | '//a[@class="WSJTheme--day-link--19pByDpZ "][@href]' 98 | ) 99 | 100 | def find_text_by_xpath(self, pattern: str) -> str: 101 | """Helper for finding text stored under xpath pattern""" 102 | try: 103 | text_output = self.driver.find_element_by_xpath(pattern).text 104 | except (NoSuchElementException, StaleElementReferenceException): 105 | text_output = "" 106 | return text_output 107 | 108 | def parse_daylinks(self): 109 | """Iterate over scraped daylinks to get fields of interest for each article""" 110 | for i in range(11, len(self.daylinks)): 111 | # Get all sub daylinks by xpath 112 | daylinks2 = WebDriverWait(self.driver, 10).until( 113 | EC.presence_of_all_elements_located( 114 | (By.XPATH, '//a[@class="WSJTheme--day-link--19pByDpZ "][@href]') 115 | ) 116 | ) 117 | logging.info("DayLinks2 is:", daylinks2) 118 | self.wait(1) 119 | daylinks2[i].click() 120 | self.wait(1.5) 121 | 122 | # Find headline links 123 | linkslist1 = None 124 | while not linkslist1: 125 | try: 126 | linkslist1 = self.driver.find_elements_by_xpath( 127 | './/h2[@class="WSJTheme--headline--unZqjb45 undefined WSJTheme--heading-3--2z_phq5h typography--serif-display--ZXeuhS5E "]//a[@href]' 128 | ) 129 | except: 130 | continue 131 | logging.info("Length of linkslist1 is:", len(linkslist1)) 132 | self.wait(2) 133 | 134 | for i in range(0, len(linkslist1)): 135 | self.wait(2) 136 | linkslist = None 137 | while not linkslist: 138 | try: 139 | linkslist = self.driver.find_elements_by_xpath( 140 | './/h2[@class="WSJTheme--headline--unZqjb45 undefined WSJTheme--heading-3--2z_phq5h typography--serif-display--ZXeuhS5E 
"]//a[@href]' 141 | ) 142 | except: 143 | continue 144 | logging.info("Length of linkslist is:", len(linkslist)) 145 | self.wait(2) 146 | try: 147 | linkslist[i].click() 148 | logging.info( 149 | "Trying to click the following web element:", linkslist[i] 150 | ) 151 | self.wait(1) 152 | try: 153 | article_string = "" 154 | text1 = self.driver.find_elements_by_xpath( 155 | ".//div[@class='article-content ']//p" 156 | ) 157 | for ele in text1: 158 | article_string += ele.text 159 | except ( 160 | NoSuchElementException, 161 | StaleElementReferenceException, 162 | ) as e: 163 | article_string = "" 164 | pass 165 | 166 | # Get article fields of interest 167 | article_headline = self.find_text_by_xpath( 168 | './/h1[@class="wsj-article-headline"]' 169 | ) 170 | article_subheadline = self.find_text_by_xpath( 171 | './/h2[@class="sub-head"]' 172 | ) 173 | article_published_date = self.find_text_by_xpath( 174 | ".//time[@class='timestamp article__timestamp flexbox__flex--1']" 175 | ) 176 | article_author = self.find_text_by_xpath( 177 | './/button[@class="author-button"]' 178 | ) 179 | article_topic = self.find_text_by_xpath( 180 | './/li[@class="article-breadCrumb"][1]/a' 181 | ) 182 | article_number_comments = self.find_text_by_xpath( 183 | './/a[@id ="article-comments-tool"]/span' 184 | ) 185 | # Prepare row output 186 | article_dict = { 187 | "article_body_text": article_string, 188 | "article_headline": article_headline, 189 | "article_subheadline": article_subheadline, 190 | "article_published_date": article_published_date, 191 | "author": article_author, 192 | "topic": article_topic, 193 | "article_number_comments": article_number_comments, 194 | } 195 | # Write results 196 | self.writer.writerow(article_dict.values()) 197 | self.driver.back() 198 | except: 199 | logging.info("Failed to click on", linkslist[i]) 200 | continue 201 | self.driver.back() 202 | 203 | 204 | if __name__ == "__main__": 205 | start_time = time.time() 206 | sf = ScrapeFlow() 207 | sf.main() 208 | logging.info(f'{time.time() - start_time} sec to scrape articles') 209 | -------------------------------------------------------------------------------- /app/ui.R: -------------------------------------------------------------------------------- 1 | library(DT) 2 | library(shinydashboard) 3 | library(devtools) 4 | library(shinythemes) 5 | library(dashboardthemes) 6 | library(wordcloud) 7 | library(tidyverse) 8 | library(tidytext) 9 | library(shinythemes) 10 | library(shiny) 11 | 12 | options(width = 1000) 13 | 14 | shinyUI(dashboardPage( 15 | dashboardHeader(title = "Web Scraping Project #2" , titleWidth = 250), 16 | 17 | dashboardSidebar( 18 | width = 300, 19 | sidebarUserPanel(h5("Philippe Heitzmann"), subtitle = "NYCDSA Bootcamp student" ,image = 'philippeheitzmann.jpeg' ), 20 | sidebarMenu( 21 | menuItem("Research Questions & Literature", tabName = 'research1', icon = icon('question'), badgeLabel = "pt1", badgeColor = "teal"), 22 | menuItem("Full Text Article Scraping", icon = icon("window-maximize"), tabName = "proj1", 23 | badgeLabel = "pt1", badgeColor = "teal"), 24 | menuItem("WSJ Data - EDA", icon = icon("window-maximize"), tabName = "wsj", 25 | badgeLabel = "pt2", badgeColor = "light-blue"), 26 | menuItem("Data Cleaning & NLP", icon = icon("window-maximize"), tabName = "wsj2", 27 | badgeLabel = "pt2", badgeColor = "light-blue"), 28 | menuItem("TextBlob NLP Analysis ", icon = icon("window-restore"), tabName = "proj2", 29 | badgeLabel = "pt3", badgeColor = "purple"), 30 | menuItem("VADER NLP Analysis", icon = 
icon("window-maximize"), tabName = "wsj4", 31 | badgeLabel = "pt3", badgeColor = "purple"), 32 | menuItem("Regression Analysis: Comments", icon = icon("window-maximize"), tabName = "wsj5", 33 | badgeLabel = "pt4", badgeColor = "blue"), 34 | menuItem("Regression Analysis: S&P 500", icon = icon("window-maximize"), tabName = "wsj3", 35 | badgeLabel = "pt4", badgeColor = "blue")) 36 | ), 37 | dashboardBody( 38 | shinyDashboardThemes(theme = "purple_gradient"), 39 | tabItems( 40 | tabItem(tabName = 'research1', 41 | fluidRow(box(title = "Research Questions", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, 42 | background = 'light-blue', htmlOutput('rq1'))), 43 | fluidRow(box(title = "Theory", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, 44 | background = 'light-blue', textOutput('rq2'))), 45 | fluidRow(box(title = "Hypothesis", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, 46 | background = 'light-blue', htmlOutput('rq3'))), 47 | column(4, box(title = "Aman, 2013", status = "primary", solidHeader = TRUE, collapsible = TRUE, width = 12, 48 | background = 'teal', img(src = 'Aman 2013.png', height = 150, width = 290,textOutput('rq4')))), 49 | column(4, box(title = "Strycharz et al., 2018", status = "primary", solidHeader = TRUE, collapsible = TRUE, width = 12, 50 | background = 'teal', img(src = 'Strycharz et al., 2018.png', height = 150, width = 290, textOutput('rq5')))), 51 | column(4, box(title = "Engelberg & Parsons, 2011", status = "primary", solidHeader = TRUE, collapsible = TRUE, width = 12, 52 | background = 'teal', img(src = 'Engelberg & Parsons, 2011 .png', height = 150, width = 290, textOutput('rq6'))))), 53 | 54 | tabItem(tabName = 'proj1', 55 | fluidRow(box(title = "Scraping Process", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, 56 | background = 'light-blue', textOutput('rq7'))), 57 | column(6, box(title = "Single Day WSJ News Archive", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, 58 | background = 'light-blue', img(src = 'WSJ Archives.png', height = 300, width = 475))), 59 | column(6, box(title = "WSJ News Article", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, 60 | background = 'light-blue', img(src = 'WSJ Article.png', height = 300, width = 475))), 61 | column(12, box(title = "Types of Variables Scraped", status = "primary", solidHeader = TRUE, collapsible = TRUE, width = 12, 62 | background = 'teal', (DT::dataTableOutput("table1"))))), 63 | tabItem(tabName = 'wsj', 64 | fluidRow(box(title = "Wall Street Journal Exploratory Data Analysis", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, background = 'light-blue', textOutput('rq14'))), 65 | 66 | column(6, (box(sliderInput('maxwords', '# of words', min = 50, max = 400, value = 400), width = 12))), 67 | 68 | column(6, (box(sliderInput('sections', '# of sections', min = 5, max = 15, value = 10), width = 12))), 69 | 70 | column(6, (box(plotOutput("wordcloud1"), width = 12))), 71 | 72 | column(6, (box(plotOutput("plot2"), width = 12)))), 73 | tabItem(tabName = 'wsj2', 74 | fluidRow(box(title = "Wall Street Journal Data - Cleaning & Merging Process", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, background = 'light-blue', textOutput('rq8'))), 75 | column(6, box(title = 'Downloading SPX Data from Yahoo Finance', status = "primary", solidHeader = TRUE, background = 'light-blue', collapsible = TRUE, width = 12, img(src='Yahoo Finance.png', height = 240, width = 
475))), 76 | column(6, box(title = 'Merging the dataframes', status = "primary", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', width = 12, htmlOutput('rq9'))), 77 | column(6, box(title = 'NLP Analysis - TextBlob', status = "primary", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', width = 12, htmlOutput('rq10'))), 78 | column(6, box(title = 'NLP Analysis - VADER', status = "primary", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', width = 12, htmlOutput('rq11')))), 79 | tabItem(tabName = 'wsj3', 80 | fluidRow(box(title = "S&P Regression Analysis", status = "info", solidHeader = TRUE, collapsible = TRUE, width = 12, background = 'light-blue', textOutput('rq12'))), 81 | column(4, box(title = "TextBlob Independent Variables Only", status = "primary", solidHeader = TRUE, collapsible = TRUE, width = 12, background = 'light-blue', img(src = "WSJ Paragraph Results.png", height = 300, width = 290))), 82 | column(4, box(title = "VADER Independent Variables Only", status = "primary", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', img(src = 'vaderonlyspx.png', height = 300, width = 290), width = 12)), 83 | column(4, box(title = "All Sentiment Analysis Variables", status = "primary", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', img(src = 'allvariablesspx.png', height = 300, width = 290), width = 12))), 84 | tabItem(tabName = 'proj2', 85 | column(6, box(title = "TextBlob Polarity Scores", status = "info", solidHeader = TRUE, collapsible = TRUE, textOutput('rq15'), background = 'light-blue', width = 12)), 86 | column(6, box(title = "TextBlob Subjectivity Scores", status = "info", solidHeader = TRUE, collapsible = TRUE, textOutput('rq16'), background = 'light-blue', width = 12)), 87 | column(6, box(plotOutput('plot3'), width = 12)), 88 | column(6, box(plotOutput('plot4'), width = 12))), 89 | tabItem(tabName = 'wsj5', 90 | column(6, box(title = "All NLP Variables as Independent Variables", status = "info", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', htmlOutput('rq21'), width = 12)), 91 | column(6, box(title = "VADER Negativity only as Independent Variable", status = "info", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', htmlOutput('rq22'), width = 12)), 92 | column(6, box(title = "All NLP Variables as Independent Variables", status = "primary", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', img(src = 'allvars.png', height = 400, width = 475), width = 12)), 93 | column(6, box(title = "VADER Negativity only as Independent Variable", status = "primary", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', img(src = 'negativeonly.png', height = 400, width = 475), width = 12))), 94 | tabItem(tabName = 'wsj4', 95 | column(6, box(title = "VADER Positivity Score", status = "info", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', textOutput('rq19'), width = 12)), 96 | column(6, box(title = "VADER Negativity Score", status = "info", solidHeader = TRUE, collapsible = TRUE, background = 'light-blue', textOutput('rq20'), width = 12)), 97 | column(6, box(plotOutput('plot5'), width = 12)), 98 | column(6, box(plotOutput('plot6'), width = 12))) 99 | )) 100 | )) 101 | 102 | -------------------------------------------------------------------------------- /app/server.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | library(dplyr) 3 | library(ggplot2) 4 | library(wordcloud) 5 
| library(tidyverse) 6 | library(tidytext) 7 | library(shinythemes) 8 | library(shiny) 9 | 10 | shinyServer(function(input, output){ 11 | output$rq1 <- renderUI({ 12 | HTML("Research Questions:

1) Number of Comments
Can a statistically significant, causal relationship be demonstrated between a Wall Street Journal (\"WSJ\") article's degree of subjectivity/objectivity and positivity/negativity in its writing (as defined by widely used sentiment analysis libraries), and the number of online comments posted by readers for that article?

2) S&P 500
Can a statistically significant causal relationship be demonstrated between the WSJ's coverage of financial news, specifically WSJ articles’ degree of subjectivity/objectivity and positivity/negativity polarity on a given day t as a whole, and stock price movements from the S&P 500 Index on that same day t? ") 13 | }) 14 | output$rq2 <- renderText({ 15 | paste0("Theory: Information dissemination is a key component of market efficiency. The Efficient Markets Hypothesis states that share prices should reflect all relevant information, and media outlets like the WSJ play a key role in disseminating that information.") 16 | }) 17 | output$rq3 <- renderUI({ 18 | HTML("Hypothesis #1: I would expect articles that score higher on the subjectivity and polarity indices to be shared more widely, since more biased and emotional articles should engender a greater response and circulate further among groups that share similar political opinions and views in a sort of \"echo chamber\" effect, giving them a higher likelihood of being read and commented on.

Hypothesis #2: Given the WSJ’s wide readership in the financial world, its wide coverage of financial news and previous findings in the literature indicating a weak relationship between frequency, objectivity and emotionality of media coverage and financial markets performance, I would expect some sort of weak relationship between WSJ news coverage and stock market performance.") 19 | }) 20 | output$rq4 <- renderText({ 21 | paste0("Our evidence clearly indicates that crash frequency increases with media coverage and its seasonal concentration. This key finding supports the notion that intensive media reports on a firm provoke extremely large reactions in the market to corporate news.") 22 | }) 23 | output$rq5 <- renderText({ 24 | paste0("A positive relation of the amount of coverage and emotionality with the fluctuation of stock prices was detected for Shell and Philips. In addition, corporate topics were found to positively Granger cause stock price fluctuation, particularly for Philips. The study advances past research in showing that the prediction of stock price fluctuation based on media coverage can be improved by including sentiment, emotionality, and corporate topics.") 25 | }) 26 | output$rq6 <- renderText({ 27 | paste0("For all earnings announcements of S&P 500 Index firms, we find that local media coverage strongly predicts local trading, after controlling for earnings, investor, and newspaper characteristics. Moreover, local trading is strongly related to the timing of local reporting, a particular challenge to nonmedia explanations.") 28 | }) 29 | output$rq7 <- renderText({ 30 | paste0("With the goal of collecting as much text data as possible for sentiment analysis, 22,772 full text WSJ articles published between 2019-2020 were scraped off the Wall Street Journal's 'News Archive' section.") 31 | }) 32 | output$rq8 <- renderText({ 33 | paste0("As the end goal for our S&P regression analysis was to concatenate all paragraph text data for a given day as a single cell value linked to a single day, empty text observations did not have to be deleted, which reduced data processing time considerably. Text data of paragraphs published on the same day were concatenated using groupby and by applying a lambda join function. The resulting data was merged with a dataframe of stock prices and trading volumes for a given day for our S&P regression analysis.") 34 | }) 35 | output$rq9 <- renderUI({ 36 | HTML("DataFrame1: Scraped dataframe of 232 unique dates with scraped WSJ paragraph text, headline text, sub-headline text, date published, author name, section name and number of comments data

DataFrame2: A dataframe of historical SPX open and close prices and trading volumes from finance.yahoo.com

Merged DataFrame: Using an inner join of the two dataframes, the resulting dataframe has 158 unique observations with article text and SPX data

The merged dataframe necessarily has fewer observations because equity markets are closed on weekends and holidays.") 37 | }) 38 | output$rq10 <- renderUI({ 39 | HTML("TextBlob: a Python API for common NLP tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, and translation

Polarity: a float in the range [-1,1], where 1 indicates a positive statement and -1 a negative statement

Subjectivity: a float in the range [0,1], where 1 indicates a subjective statement and 0 an objective statement.") 40 | }) 41 | output$rq11 <- renderUI({ 42 | HTML("Valence Aware Dictionary and sEntiment Reasoner (“VADER”): an NLP sentiment analysis model, available through the NLTK package, that outputs polarity (positive/negative) and intensity of emotion

Negative: a float in the range [0,1] representing the negativity score

Neutral: a float in the range [0,1] representing the neutrality score

Positive: a float in the range [0,1] representing the positivity score

Compound: Computed by normalizing the negative, neutral and positive scores. ") 43 | }) 44 | output$rq12 <- renderText({ 45 | paste0("The Adj R^2 values of ~0.01 and high p-values shows these are poor models with low predictive power, even with the VADER neutral values and SPX Volume numbers added as extra independent variables. As expected, our results show WSJ sentiment analysis has low predictive power in relation to SPX moves, although, quite interestingly, TextBlob polarity of WSJ article is significant to the 10% level. This finding is interesting in the context of our previous EDA of polarity vs number of comments of article comments which had a quasi-flat relationship. Perhaps articles that score higher on the polarity index fall on more volatile days in the stock market, leading to more emotionally polarized articles. There needs to be more data points in our analysis, however, to drive better, more statistically significant results, as a next step for this analysis going forward.") 46 | }) 47 | output$rq13 <- renderText({ 48 | paste0("As a next step, I would want to increase the number of datapoints and limiting our text data to only scraping headlines of the WSJ and other publications in order to see if this approach would yield more interesting and statistically significant results...") 49 | }) 50 | output$plot2 <- renderPlot({ 51 | head(sectionsdf, input$sections) %>% ggplot(., aes(reorder(SectionName, AverageComments), AverageComments)) + geom_bar(aes(fill = AverageComments), stat = 'identity') + theme(axis.text.x = element_text(angle = 90)) + ggtitle("WSJ Sections with Highest Average Number of Comments per article") + xlab('Section Name') + ylab("Average Number of Comments per article") 52 | }) 53 | output$rq14 <- renderText({ 54 | paste0("Interestingly, the sections with the highest average number of comments per article are the Opinion, Politics, U.S. and Influencers sections, which can all be commonly thought of as rubrics dealing with more polarizing and rhetorically biased topics that tend to generate a lot of online discussion and commentary.") 55 | }) 56 | output$plot3 <- renderPlot({ 57 | wsj2 %>% ggplot(., aes(Comments. , polarity)) + geom_point(shape=18, color="blue") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("WSJ Polarity Score vs Number of Comments") + xlab('Number of Comments') + ylab("Polarity score") + geom_smooth(method=lm, linetype="dashed", color="darkred", fill="blue") 58 | }) 59 | output$plot4 <- renderPlot({ 60 | wsj2 %>% ggplot(., aes(Comments. , subjectivity)) + geom_point(shape=18, color="blue") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("WSJ Subjectivity Score vs Number of Comments") + xlab('Number of Comments') + ylab("Subjectivity score") + geom_smooth(method=lm, linetype="dashed", color="darkred", fill="blue") 61 | }) 62 | output$plot5 <- renderPlot({ 63 | wsj3 %>% ggplot(., aes(Comments. , positive)) + geom_point(shape=18, color="green") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("WSJ Positivity Score vs Number of Comments") + xlab('Number of Comments') + ylab("Positivity score") + geom_smooth(method=lm, linetype="dashed", color="darkred", fill="blue") 64 | }) 65 | output$plot6 <- renderPlot({ 66 | wsj3 %>% ggplot(., aes(Comments. 
, negative)) + geom_point(shape=18, color="green") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("WSJ Negativity Score vs Number of Comments") + xlab('Number of Comments') + ylab("Negativity score") + geom_smooth(method=lm, linetype="dashed", color="darkred", fill="blue") 67 | }) 68 | output$rq15 <- renderText({ 69 | paste0("Polarity of an article doesn't seem to have as much explanatory power in relation to the number of comments generated. Polarity scores for WSJ articles also seem to be very tightly distributed around the mean.") 70 | }) 71 | output$rq16 <- renderText({ 72 | paste0("Subjectivity of an article seems to explain more of the variation in number of comments posted on that article. Subjectivity scores also appear to vary more widely than polarity scores and seem to have more outliers, which may have to be addressed later on.") 73 | }) 74 | #CommentsRegressionDiscussion 75 | output$rq17 <- renderText({ 76 | paste0("Subjectivity of an article seems to explain more of the variation in number of comments posted on that article. Subjectivity scores also appear to vary more widely than polarity scores and seem to have more outliers, which may have to be addressed later on.") 77 | }) 78 | output$rq18 <- renderText({ 79 | paste0("Placeholder") 80 | }) 81 | output$rq19 <- renderText({ 82 | paste0("The relationship between positivity score and number of comments is weaker than I expected as I would have thought that more optimistic and feel-good articles would be shared more widely, while the below graph seems to show there is close to no relationship between VADER positivity score and number of comments posted on that article.") 83 | }) 84 | output$rq20 <- renderText({ 85 | paste0("A seemingly high positive correlation between VADER negativity score and number of comments was perhaps the most interesting and surprising finding, at least at first glance, as I would have initially theorized that more pessimistic and negative articles would be read and shared less amongst groups and therefore lead to a lower number of posted comments for that article.") 86 | }) 87 | output$rq21 <- renderUI({ 88 | HTML("As evidenced by the very low adj R^2 of 0.01 this is a poor model for explaining the variance in number of comments posted on WSJ articles.
We cannot reject the null hypothesis that the beta coefficients of the polarity, subjectivity and positivity variables are zero, based on the output below.

Interestingly, however, as I had expected from our EDA of VADER negativity scores in pt3, negativity scores are statistically significant at the 1% level in predicting the number of comments posted on WSJ articles. ") 89 | }) 90 | output$rq22 <- renderUI({ 91 | HTML("Unsurprisingly, a standalone linear regression model with just the negativity score as a predictor yields a similarly low Adj R^2 value.
Although this would still be a poor model for predicting the number of comments on WSJ articles, its simplicity makes it preferable to the linear model that includes all sentiment analysis variables. The positive and significant relationship between an article's negativity score and its number of comments may arise because highly negative WSJ articles are more likely to report events that prompt a public showing of grief or support, such as the death of a public figure or another tragedy affecting large numbers of people, and would therefore attract more comments.") 92 | }) 93 | 94 | output$table1 <- DT::renderDataTable(formatStyle(datatable(variabledata, rownames = FALSE), columns = 1:3, color = 'white')) 95 | 96 | output$wordcloud1 <- renderPlot(wordcloud(docs, scale=c(5,0.5), max.words=input$maxwords, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, 'Dark2'))) 97 | }) 98 | --------------------------------------------------------------------------------