├── README.md
└── file1

/README.md:
--------------------------------------------------------------------------------
ESG-Risk-Analysis-with-NLP is a machine learning project that leverages Natural Language Processing (NLP) to identify, classify, and quantify Environmental, Social, and Governance (ESG) risks in text-based data sources such as news articles, corporate reports, and social media. The system provides insight into potential ESG threats affecting companies and industries.

Features

📊 Text Data Processing: Cleans and preprocesses ESG-related documents.

🧠 NLP-based Classification: Detects ESG risk categories using sentiment and keyword analysis.

⚙️ Synthetic Data Generation: Uses synthetic datasets for training and testing.

📈 Risk Scoring: Generates ESG risk scores and visualizations for easier interpretation.

📤 Excel Export: Writes the analyzed data to Excel for reporting or further analysis.

Tech Stack

Language: Python

Libraries: Pandas, NumPy, Scikit-learn, NLTK, SpaCy, Matplotlib, OpenPyXL

Output Format: Excel (.xlsx)

How It Works

Generate or import ESG-related text data.

Preprocess the data using NLP techniques (tokenization, stopword removal, lemmatization).

Classify the text into ESG categories.

Compute risk levels and visualize the results.

Export the final data to Excel for decision support.

Use Cases

ESG performance monitoring for corporations.

Risk evaluation in investment and portfolio management.

Automated ESG compliance reporting.

Sustainability and impact assessment research.
--------------------------------------------------------------------------------
/file1:
--------------------------------------------------------------------------------
"""
ESG-Risk-Analysis-with-NLP
---------------------------------
Single-file demonstration project that:
- Generates synthetic ESG-related text data for companies
- Preprocesses text (basic cleaning, tokenization via TF-IDF)
- Trains a simple classifier (Logistic Regression) to predict ESG risk labels
- Produces a continuous risk score (0-100) from model probabilities
- Exports the synthetic dataset, predictions, and a short model report to an Excel workbook

How to run:
1) Create a Python environment, then install dependencies:
       pip install -r requirements.txt
   where requirements.txt contains:
       pandas
       numpy
       scikit-learn
       openpyxl
       joblib

2) Run the script:
       python ESG-Risk-Analysis-with-NLP.py

Outputs (written to ./output/):
- esg_synthetic_dataset.xlsx (Excel workbook with multiple sheets)
- trained_model.joblib (serialized sklearn pipeline)

Notes:
- This is intentionally simple and meant as a starting point for experimentation.
- Replace the synthetic data generator with real text sources (reports, news, filings) for production.
"""
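
# Hedged sketch (not part of the original script): to experiment with real
# documents instead of synthetic ones, build a DataFrame with the same schema
# before calling train_esg_risk_model below. The file name and column names
# here are hypothetical placeholders.
#
#     df = pd.read_csv("filings.csv")                  # hypothetical source
#     df = df.rename(columns={"text": "report_text"})  # match the expected column
#     df["risk_label"] = ...                           # labels must be supplied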

import random
import string
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
import joblib

OUTPUT_DIR = Path("./output")
OUTPUT_DIR.mkdir(exist_ok=True)

RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# ------------------------------
# Synthetic data generator
# ------------------------------

COMPANIES = [
    "GreenNova Energy", "AquaPure Foods", "UrbanMotion Logistics", "Skyline Builders",
    "TerraChem Materials", "SunWave Tech", "Harvest AgriCo", "BlueHarbor Shipping",
    "ClearView Finance", "Pioneer Pharma"
]
SECTORS = ["Energy", "Food", "Logistics", "Construction", "Materials", "Tech", "Agriculture", "Shipping", "Finance", "Healthcare"]

ESG_NEGATIVE_KEYWORDS = [
    "spill", "fine", "lawsuit", "emission", "violation", "accident", "recall", "pollution", "bribery", "fraud"
]
ESG_POSITIVE_KEYWORDS = [
    "renewable", "recycling", "diversity", "sustainability", "offset", "green", "compliance", "community", "safety", "audit"
]


def random_text_with_keywords(pos=0.2, neg=0.2, length=40):
    """Create a pseudo-report paragraph that mixes neutral tokens with positive/negative keywords."""
    words = []
    for _ in range(length):
        r = random.random()
        if r < neg * 0.4:
            words.append(random.choice(ESG_NEGATIVE_KEYWORDS))
        elif r < neg:
            # negative-themed phrase
            words.append(random.choice(["environmental", "non-compliant", "incident"]))
        elif r < neg + pos * 0.4:
            words.append(random.choice(ESG_POSITIVE_KEYWORDS))
        elif r < neg + pos:
            # positive-themed phrase
            words.append(random.choice(["initiative", "policy", "target"]))
        else:
            # neutral filler: a random lowercase pseudo-word
            words.append(''.join(random.choices(string.ascii_lowercase, k=random.randint(4, 9))))
    # make it readable
    return ' '.join(words).capitalize() + '.'
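
# Worked example of the sampling above, at the defaults pos=0.2, neg=0.2:
# each token is a negative keyword with probability 0.08 (= 0.2 * 0.4), a
# negative-themed phrase with 0.12, a positive keyword with 0.08, a
# positive-themed phrase with 0.12, and neutral filler with the remaining 0.60.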

def synthesize_dataset(n=500):
    rows = []
    for i in range(n):
        idx = i + 1
        comp_idx = i % len(COMPANIES)
        company = COMPANIES[comp_idx]
        sector = SECTORS[comp_idx]

        # bias: certain sectors have more negative events in our synthetic world
        sector_risk_bias = {
            "Energy": 0.35,
            "Shipping": 0.30,
            "Materials": 0.28,
            "Construction": 0.25,
            "Finance": 0.10,
        }
        base_neg = sector_risk_bias.get(sector, 0.12)
        # random proportions of negative/positive language, clipped to [0, 0.9]
        neg_prop = min(max(np.random.normal(loc=base_neg, scale=0.08), 0.0), 0.9)
        pos_prop = min(max(np.random.beta(2, 8), 0.0), 0.9)

        text = random_text_with_keywords(pos=pos_prop, neg=neg_prop, length=random.randint(30, 70))

        # synthetic continuous esg_score, driven by the balance of negative and
        # positive keywords in the text plus Gaussian noise
        num_neg = sum(1 for kw in ESG_NEGATIVE_KEYWORDS if kw in text)
        num_pos = sum(1 for kw in ESG_POSITIVE_KEYWORDS if kw in text)

        raw_score = max(0.0, 1.0 + num_neg * 0.6 - num_pos * 0.5 + np.random.normal(0, 0.8))
        # scale to 0-100, where higher means more risk
        esg_score = float(np.clip((raw_score + 2) * 10, 0, 100))

        # label thresholds (Low, Medium, High risk)
        if esg_score < 30:
            risk_label = 'Low'
        elif esg_score < 60:
            risk_label = 'Medium'
        else:
            risk_label = 'High'

        rows.append({
            'company_id': idx,
            'company': company,
            'sector': sector,
            'report_text': text,
            'esg_score_synthetic': round(esg_score, 2),
            'risk_label': risk_label
        })

    return pd.DataFrame(rows)


# ------------------------------
# Modeling pipeline
# ------------------------------


def train_esg_risk_model(df: pd.DataFrame):
    # simple text-to-risk classifier: TF-IDF features + logistic regression
    X = df['report_text']
    y = df['risk_label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y)

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=2000, ngram_range=(1, 2), stop_words='english')),
        ('clf', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_SEED))
    ])

    pipeline.fit(X_train, y_train)

    # predictions and evaluation
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)

    report = classification_report(y_test, y_pred, output_dict=True)
    try:
        # macro AUC, one-vs-rest, on binarized labels; LabelBinarizer sorts the
        # classes alphabetically, which matches the column order of predict_proba
        from sklearn.preprocessing import LabelBinarizer
        lb = LabelBinarizer()
        y_test_bin = lb.fit_transform(y_test)
        auc = roc_auc_score(y_test_bin, y_proba, average='macro', multi_class='ovr')
    except Exception:
        # AUC is undefined e.g. when a class is missing from the test split
        auc = None

    metrics = {
        'classification_report': report,
        'roc_auc_macro': auc
    }

    return pipeline, X_test.index, y_test, y_pred, y_proba, metrics
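
# Hedged addition (not in the original script): a small helper for inspecting
# which tokens drive each risk class. It assumes the pipeline steps are named
# 'tfidf' and 'clf' as in train_esg_risk_model, and a scikit-learn version
# that provides get_feature_names_out (>= 1.0).
def top_tokens_per_class(pipeline, n=10):
    vec = pipeline.named_steps['tfidf']
    clf = pipeline.named_steps['clf']
    feature_names = vec.get_feature_names_out()
    tops = {}
    # for multiclass LogisticRegression, coef_ has one row per class
    for cls, coefs in zip(clf.classes_, clf.coef_):
        top_idx = np.argsort(coefs)[::-1][:n]
        tops[cls] = [feature_names[i] for i in top_idx]
    return tops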

# ------------------------------
# Postprocessing and export
# ------------------------------


def generate_predictions_df(df: pd.DataFrame, pipeline):
    proba = pipeline.predict_proba(df['report_text'])
    classes = list(pipeline.classes_)

    # convert to a DataFrame of class probabilities
    proba_df = pd.DataFrame(proba, columns=[f'proba_{c}' for c in classes])

    # compute a continuous risk score from the probability of the 'High' class;
    # if 'High' is not present as a class, fall back to the max probability
    if 'High' in classes:
        high_idx = classes.index('High')
        risk_score_cont = proba[:, high_idx] * 100
    else:
        risk_score_cont = proba.max(axis=1) * 100

    out = pd.concat([df.reset_index(drop=True), proba_df], axis=1)
    out['predicted_risk_label'] = pipeline.predict(df['report_text'])
    out['predicted_risk_score'] = np.round(risk_score_cont, 2)
    return out


# ------------------------------
# Main runner
# ------------------------------


def main():
    print("Generating synthetic dataset...")
    df = synthesize_dataset(n=800)
    print(f"Dataset shape: {df.shape}")

    print("Training model...")
    pipeline, test_idx, y_test, y_pred, y_proba, metrics = train_esg_risk_model(df)

    print("Saving trained model...")
    model_path = OUTPUT_DIR / 'trained_model.joblib'
    joblib.dump(pipeline, model_path)

    print("Generating predictions on full dataset...")
    results = generate_predictions_df(df, pipeline)

    # produce a short evaluation table
    eval_df = pd.DataFrame(metrics['classification_report']).transpose()
    eval_summary = pd.DataFrame([{'roc_auc_macro': metrics['roc_auc_macro']}])

    # export to Excel with multiple sheets
    excel_path = OUTPUT_DIR / 'esg_synthetic_dataset.xlsx'
    print(f"Exporting results to {excel_path} ...")
    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='synthetic_raw', index=False)
        results.to_excel(writer, sheet_name='predictions', index=False)
        eval_df.to_excel(writer, sheet_name='classification_report')
        eval_summary.to_excel(writer, sheet_name='eval_summary', index=False)

    print("Done. Files written:")
    print(f" - {excel_path}")
    print(f" - {model_path}")


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
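A possible follow-up (a sketch, not part of the repository above): once main() has run, the serialized pipeline in output/trained_model.joblib can be reloaded and applied to new text. The example sentences below are invented.

    import joblib
    pipe = joblib.load('output/trained_model.joblib')
    texts = [
        "Regulator issued a fine after a chemical spill at the plant.",
        "The company expanded its renewable energy and recycling programs.",
    ]
    print(pipe.predict(texts))        # predicted risk labels
    print(pipe.predict_proba(texts))  # class probabilities, ordered as pipe.classes_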