├── README.md
└── file1

/README.md:
--------------------------------------------------------------------------------
ESG-Risk-Analysis-with-NLP is a machine learning project that leverages Natural Language Processing (NLP) to identify, classify, and quantify Environmental, Social, and Governance (ESG) risks in text-based data sources such as news articles, corporate reports, and social media. The system provides insight into potential ESG threats affecting companies and industries.

Features

📊 Text Data Processing: Cleans and preprocesses ESG-related documents.

🧠 NLP-based Classification: Detects ESG risk categories using sentiment and keyword analysis.

⚙️ Synthetic Data Generation: Uses synthetic datasets for training and testing.

📈 Risk Scoring: Generates ESG risk scores and visualizations for easier interpretation.

📤 Excel Export: Writes the analyzed data to Excel for reporting or further analysis.

Tech Stack

Language: Python

Libraries: Pandas, NumPy, Scikit-learn, NLTK, SpaCy, Matplotlib, OpenPyXL

Output Format: Excel (.xlsx)

How It Works

Generate or import ESG-related text data.

Preprocess the data using NLP techniques (tokenization, stopword removal, lemmatization).

Classify the text into ESG categories.

Compute risk levels and visualize the results.

Export the final data to Excel for decision support.

Use Cases

ESG performance monitoring for corporations.

Risk evaluation in investment and portfolio management.

Automated ESG compliance reporting.

Sustainability and impact assessment research.
--------------------------------------------------------------------------------
/file1:
--------------------------------------------------------------------------------
"""
ESG-Risk-Analysis-with-NLP
---------------------------------
Single-file demonstration project that:
- Generates synthetic ESG-related text data for companies
- Preprocesses text (basic cleaning, tokenization via TF-IDF)
- Trains a simple classifier (Logistic Regression) to predict ESG risk labels
- Produces a continuous risk score (0-100) from model probabilities
- Exports the synthetic dataset, predictions, and a short model report to an Excel workbook

How to run:
1) Create a Python environment, then install dependencies:
       pip install -r requirements.txt
   where requirements.txt contains:
       pandas
       numpy
       scikit-learn
       openpyxl
       joblib

2) Run the script:
       python ESG-Risk-Analysis-with-NLP.py

Outputs (written to ./output/):
- esg_synthetic_dataset.xlsx (Excel workbook with multiple sheets)
- trained_model.joblib (serialized sklearn pipeline)

Notes:
- This is intentionally simple and meant as a starting point for experimentation.
- Replace the synthetic data generator with real text sources (reports, news, filings) for production.
"""
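
# Hedged sketch (not part of the original script): to experiment with real
# documents instead of synthetic ones, build a DataFrame with the same schema
# before calling train_esg_risk_model below. The file name and column names
# here are hypothetical placeholders.
#
#     df = pd.read_csv("filings.csv")                  # hypothetical source
#     df = df.rename(columns={"text": "report_text"})  # match the expected column
#     df["risk_label"] = ...                           # labels must be supplied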

import random
import string
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
import joblib

OUTPUT_DIR = Path("./output")
OUTPUT_DIR.mkdir(exist_ok=True)

RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# ------------------------------
# Synthetic data generator
# ------------------------------

COMPANIES = [
    "GreenNova Energy", "AquaPure Foods", "UrbanMotion Logistics", "Skyline Builders",
    "TerraChem Materials", "SunWave Tech", "Harvest AgriCo", "BlueHarbor Shipping",
    "ClearView Finance", "Pioneer Pharma"
]
SECTORS = ["Energy", "Food", "Logistics", "Construction", "Materials", "Tech", "Agriculture", "Shipping", "Finance", "Healthcare"]

ESG_NEGATIVE_KEYWORDS = [
    "spill", "fine", "lawsuit", "emission", "violation", "accident", "recall", "pollution", "bribery", "fraud"
]
ESG_POSITIVE_KEYWORDS = [
    "renewable", "recycling", "diversity", "sustainability", "offset", "green", "compliance", "community", "safety", "audit"
]


def random_text_with_keywords(pos=0.2, neg=0.2, length=40):
    """Create a pseudo-report paragraph that mixes neutral tokens with positive/negative keywords."""
    words = []
    for _ in range(length):
        r = random.random()
        if r < neg * 0.4:
            words.append(random.choice(ESG_NEGATIVE_KEYWORDS))
        elif r < neg:
            # negative-themed phrase
            words.append(random.choice(["environmental", "non-compliant", "incident"]))
        elif r < neg + pos * 0.4:
            words.append(random.choice(ESG_POSITIVE_KEYWORDS))
        elif r < neg + pos:
            # positive-themed phrase
            words.append(random.choice(["initiative", "policy", "target"]))
        else:
            # neutral filler: a random lowercase pseudo-word
            words.append(''.join(random.choices(string.ascii_lowercase, k=random.randint(4, 9))))
    # make it readable
    return ' '.join(words).capitalize() + '.'
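
# Worked example of the sampling above, at the defaults pos=0.2, neg=0.2:
# each token is a negative keyword with probability 0.08 (= 0.2 * 0.4), a
# negative-themed phrase with 0.12, a positive keyword with 0.08, a
# positive-themed phrase with 0.12, and neutral filler with the remaining 0.60.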

def synthesize_dataset(n=500):
    rows = []
    for i in range(n):
        idx = i + 1
        comp_idx = i % len(COMPANIES)
        company = COMPANIES[comp_idx]
        sector = SECTORS[comp_idx]

        # bias: certain sectors have more negative events in our synthetic world
        sector_risk_bias = {
            "Energy": 0.35,
            "Shipping": 0.30,
            "Materials": 0.28,
            "Construction": 0.25,
            "Finance": 0.10,
        }
        base_neg = sector_risk_bias.get(sector, 0.12)
        # random proportions of negative/positive language, clipped to [0, 0.9]
        neg_prop = min(max(np.random.normal(loc=base_neg, scale=0.08), 0.0), 0.9)
        pos_prop = min(max(np.random.beta(2, 8), 0.0), 0.9)

        text = random_text_with_keywords(pos=pos_prop, neg=neg_prop, length=random.randint(30, 70))

        # synthetic continuous esg_score, driven by the balance of negative and
        # positive keywords in the text plus Gaussian noise
        num_neg = sum(1 for kw in ESG_NEGATIVE_KEYWORDS if kw in text)
        num_pos = sum(1 for kw in ESG_POSITIVE_KEYWORDS if kw in text)

        raw_score = max(0.0, 1.0 + num_neg * 0.6 - num_pos * 0.5 + np.random.normal(0, 0.8))
        # scale to 0-100, where higher means more risk
        esg_score = float(np.clip((raw_score + 2) * 10, 0, 100))

        # label thresholds (Low, Medium, High risk)
        if esg_score < 30:
            risk_label = 'Low'
        elif esg_score < 60:
            risk_label = 'Medium'
        else:
            risk_label = 'High'

        rows.append({
            'company_id': idx,
            'company': company,
            'sector': sector,
            'report_text': text,
            'esg_score_synthetic': round(esg_score, 2),
            'risk_label': risk_label
        })

    return pd.DataFrame(rows)


# ------------------------------
# Modeling pipeline
# ------------------------------


def train_esg_risk_model(df: pd.DataFrame):
    # simple text-to-risk classifier: TF-IDF features + logistic regression
    X = df['report_text']
    y = df['risk_label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y)

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=2000, ngram_range=(1, 2), stop_words='english')),
        ('clf', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_SEED))
    ])

    pipeline.fit(X_train, y_train)

    # predictions and evaluation
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)

    report = classification_report(y_test, y_pred, output_dict=True)
    try:
        # macro AUC, one-vs-rest, on binarized labels; LabelBinarizer sorts the
        # classes alphabetically, which matches the column order of predict_proba
        from sklearn.preprocessing import LabelBinarizer
        lb = LabelBinarizer()
        y_test_bin = lb.fit_transform(y_test)
        auc = roc_auc_score(y_test_bin, y_proba, average='macro', multi_class='ovr')
    except Exception:
        # AUC is undefined e.g. when a class is missing from the test split
        auc = None

    metrics = {
        'classification_report': report,
        'roc_auc_macro': auc
    }

    return pipeline, X_test.index, y_test, y_pred, y_proba, metrics
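
# Hedged addition (not in the original script): a small helper for inspecting
# which tokens drive each risk class. It assumes the pipeline steps are named
# 'tfidf' and 'clf' as in train_esg_risk_model, and a scikit-learn version
# that provides get_feature_names_out (>= 1.0).
def top_tokens_per_class(pipeline, n=10):
    vec = pipeline.named_steps['tfidf']
    clf = pipeline.named_steps['clf']
    feature_names = vec.get_feature_names_out()
    tops = {}
    # for multiclass LogisticRegression, coef_ has one row per class
    for cls, coefs in zip(clf.classes_, clf.coef_):
        top_idx = np.argsort(coefs)[::-1][:n]
        tops[cls] = [feature_names[i] for i in top_idx]
    return tops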

# ------------------------------
# Postprocessing and export
# ------------------------------


def generate_predictions_df(df: pd.DataFrame, pipeline):
    proba = pipeline.predict_proba(df['report_text'])
    classes = list(pipeline.classes_)

    # convert to a DataFrame of class probabilities
    proba_df = pd.DataFrame(proba, columns=[f'proba_{c}' for c in classes])

    # compute a continuous risk score from the probability of the 'High' class;
    # if 'High' is not present as a class, fall back to the max probability
    if 'High' in classes:
        high_idx = classes.index('High')
        risk_score_cont = proba[:, high_idx] * 100
    else:
        risk_score_cont = proba.max(axis=1) * 100

    out = pd.concat([df.reset_index(drop=True), proba_df], axis=1)
    out['predicted_risk_label'] = pipeline.predict(df['report_text'])
    out['predicted_risk_score'] = np.round(risk_score_cont, 2)
    return out


# ------------------------------
# Main runner
# ------------------------------


def main():
    print("Generating synthetic dataset...")
    df = synthesize_dataset(n=800)
    print(f"Dataset shape: {df.shape}")

    print("Training model...")
    pipeline, test_idx, y_test, y_pred, y_proba, metrics = train_esg_risk_model(df)

    print("Saving trained model...")
    model_path = OUTPUT_DIR / 'trained_model.joblib'
    joblib.dump(pipeline, model_path)

    print("Generating predictions on full dataset...")
    results = generate_predictions_df(df, pipeline)

    # produce a short evaluation table
    eval_df = pd.DataFrame(metrics['classification_report']).transpose()
    eval_summary = pd.DataFrame([{'roc_auc_macro': metrics['roc_auc_macro']}])

    # export to Excel with multiple sheets
    excel_path = OUTPUT_DIR / 'esg_synthetic_dataset.xlsx'
    print(f"Exporting results to {excel_path} ...")
    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='synthetic_raw', index=False)
        results.to_excel(writer, sheet_name='predictions', index=False)
        eval_df.to_excel(writer, sheet_name='classification_report')
        eval_summary.to_excel(writer, sheet_name='eval_summary', index=False)

    print("Done. Files written:")
    print(f" - {excel_path}")
    print(f" - {model_path}")


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
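A possible follow-up (a sketch, not part of the repository above): once main() has run, the serialized pipeline in output/trained_model.joblib can be reloaded and applied to new text. The example sentences below are invented.

    import joblib
    pipe = joblib.load('output/trained_model.joblib')
    texts = [
        "Regulator issued a fine after a chemical spill at the plant.",
        "The company expanded its renewable energy and recycling programs.",
    ]
    print(pipe.predict(texts))        # predicted risk labels
    print(pipe.predict_proba(texts))  # class probabilities, ordered as pipe.classes_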