├── indic_tts_out_hindi.wav
├── indic_tts_out_hindi2.wav
├── indic_tts_out_hindib.wav
├── .gitignore
├── requirements.txt
├── enroll_try.py
├── model_loader.py
├── db_setup.py
├── README.md
├── api.py
├── local_try.py
└── speaker_recognition.py

/indic_tts_out_hindi.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bhargavak04/Speaker-Identification-Emotion-Detection/HEAD/indic_tts_out_hindi.wav
--------------------------------------------------------------------------------
/indic_tts_out_hindi2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bhargavak04/Speaker-Identification-Emotion-Detection/HEAD/indic_tts_out_hindi2.wav
--------------------------------------------------------------------------------
/indic_tts_out_hindib.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bhargavak04/Speaker-Identification-Emotion-Detection/HEAD/indic_tts_out_hindib.wav
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | env/
2 | ModelOpen/
3 | pretrained_ecapa_tdnn/
4 | Audio_Formatted/
5 | enrolled_speakers/
6 | envs/
7 | indic_parler/
8 | enroll_try.py
9 | indic_try.py
10 | main.py
11 | proc_aud_api.py
12 | speaker_database.db
13 | temp_Bode_yaswanth_kumar_Angry.mp3
14 | test_yaswanth_neu.wav
15 | with_bow.py
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi==0.103.1
2 | uvicorn==0.23.2
3 | python-multipart==0.0.6
4 | torch==2.0.1
5 | torchaudio==2.0.2
6 | numpy==1.24.3
7 | librosa==0.10.1
8 | webrtcvad==2.0.10
9 | pydub==0.25.1
10 | speechbrain==0.5.15
11 | transformers==4.31.0
12 | psycopg2-binary==2.9.7
13 | python-dotenv==1.0.0
14 | soundfile==0.12.1
15 | parler-tts==0.0.4
--------------------------------------------------------------------------------
/enroll_try.py:
--------------------------------------------------------------------------------
1 | import requests
2 | 
3 | url = "http://localhost:8000/enroll-speaker/"
4 | files = [('audio_files', open(r'C:\Users\bharg\Downloads\SID&ED\Audio_Formatted\Yaswanth\Bode_yaswanth_kumar_Angry.mp3', 'rb'))]
5 | data = {'speaker_name': 'Yaswanth'}  # Sent as multipart form fields to match the Form/File parameters in api.py
6 | 
7 | response = requests.post(url, files=files, data=data)
8 | print(response.json())
--------------------------------------------------------------------------------
/model_loader.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | from speechbrain.pretrained import SpeakerRecognition
4 | from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor
5 | from parler_tts import ParlerTTSForConditionalGeneration
6 | from transformers import AutoTokenizer
7 | 
8 | # Set device to GPU if available
9 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
10 | 
11 | # Define model paths (you can modify these to use environment variables)
12 | WHISPER_MODEL_PATH = os.getenv('WHISPER_MODEL_PATH', r"C:\Users\bharg\Downloads\SID&ED\ModelOpen\openw\whisper-small")
13 | EMOTION_MODEL_PATH = os.getenv('EMOTION_MODEL_PATH', r"C:\Users\bharg\Downloads\SID&ED\ModelOpen\wavv2vec")
14 | TTS_MODEL_PATH =
os.getenv('TTS_MODEL_PATH', r"C:\Users\bharg\Downloads\SID&ED\indic_parler") 15 | 16 | def get_speaker_recognition_model(): 17 | """Load and return the speaker recognition model""" 18 | return SpeakerRecognition.from_hparams( 19 | source="speechbrain/spkrec-ecapa-voxceleb", 20 | savedir="pretrained_ecapa_tdnn" 21 | ).to(device) 22 | 23 | def get_whisper_model_and_processor(): 24 | """Load and return the Whisper model and processor""" 25 | processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_PATH) 26 | model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL_PATH).to(device) 27 | return model, processor 28 | 29 | def get_emotion_classifier(): 30 | """Load and return the emotion classification model""" 31 | return pipeline("audio-classification", model=EMOTION_MODEL_PATH, device=0 if torch.cuda.is_available() else -1) 32 | 33 | def get_tts_model_and_tokenizers(): 34 | """Load and return the TTS model and tokenizers""" 35 | model = ParlerTTSForConditionalGeneration.from_pretrained(TTS_MODEL_PATH).to(device) 36 | tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_PATH) 37 | description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path) 38 | 39 | return model, tokenizer, description_tokenizer -------------------------------------------------------------------------------- /db_setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import psycopg2 3 | from psycopg2.extras import execute_values 4 | import numpy as np 5 | from dotenv import load_dotenv 6 | 7 | # Load environment variables 8 | load_dotenv() 9 | 10 | # Database connection parameters 11 | DB_HOST = os.getenv('DB_HOST', 'localhost') 12 | DB_NAME = os.getenv('DB_NAME', 'speaker_recognition') 13 | DB_USER = os.getenv('DB_USER', 'postgres') 14 | DB_PASSWORD = os.getenv('DB_PASSWORD', 'password') 15 | DB_PORT = os.getenv('DB_PORT', '5432') 16 | 17 | def get_db_connection(): 18 | """Create and return a database connection""" 19 | return psycopg2.connect( 20 | host=DB_HOST, 21 | database=DB_NAME, 22 | user=DB_USER, 23 | password=DB_PASSWORD, 24 | port=DB_PORT 25 | ) 26 | 27 | def initialize_database(): 28 | """Create the necessary tables if they don't exist""" 29 | conn = get_db_connection() 30 | cur = conn.cursor() 31 | 32 | # Create speakers table 33 | cur.execute(''' 34 | CREATE TABLE IF NOT EXISTS speakers ( 35 | id SERIAL PRIMARY KEY, 36 | name VARCHAR(255) UNIQUE NOT NULL, 37 | embedding BYTEA NOT NULL, 38 | created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP 39 | ) 40 | ''') 41 | 42 | # Create a table for logging recognition results 43 | cur.execute(''' 44 | CREATE TABLE IF NOT EXISTS recognition_logs ( 45 | id SERIAL PRIMARY KEY, 46 | audio_path VARCHAR(255), 47 | identified_speaker VARCHAR(255), 48 | confidence FLOAT, 49 | transcription TEXT, 50 | emotion VARCHAR(50), 51 | timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP 52 | ) 53 | ''') 54 | 55 | conn.commit() 56 | cur.close() 57 | conn.close() 58 | 59 | print("Database initialized successfully") 60 | 61 | def store_speaker_embedding(speaker_name, embedding): 62 | """Store a speaker embedding in the database""" 63 | try: 64 | conn = get_db_connection() 65 | cur = conn.cursor() 66 | 67 | # Convert numpy array to binary 68 | embedding_binary = embedding.tobytes() 69 | 70 | # Check if speaker already exists 71 | cur.execute("SELECT id FROM speakers WHERE name = %s", (speaker_name,)) 72 | result = cur.fetchone() 73 | 74 | if result: 75 | # Update existing speaker 76 | cur.execute( 77 | 
"UPDATE speakers SET embedding = %s WHERE name = %s", 78 | (psycopg2.Binary(embedding_binary), speaker_name) 79 | ) 80 | else: 81 | # Insert new speaker 82 | cur.execute( 83 | "INSERT INTO speakers (name, embedding) VALUES (%s, %s)", 84 | (speaker_name, psycopg2.Binary(embedding_binary)) 85 | ) 86 | 87 | conn.commit() 88 | cur.close() 89 | conn.close() 90 | return True 91 | except Exception as e: 92 | print(f"Error storing speaker embedding: {str(e)}") 93 | return False 94 | 95 | def get_all_speaker_embeddings(): 96 | """Retrieve all speaker embeddings from the database""" 97 | try: 98 | conn = get_db_connection() 99 | cur = conn.cursor() 100 | 101 | cur.execute("SELECT name, embedding FROM speakers") 102 | results = cur.fetchall() 103 | 104 | speaker_embeddings = {} 105 | for name, embedding_binary in results: 106 | # Convert binary back to numpy array 107 | embedding = np.frombuffer(embedding_binary, dtype=np.float32) 108 | speaker_embeddings[name] = embedding 109 | 110 | cur.close() 111 | conn.close() 112 | return speaker_embeddings 113 | except Exception as e: 114 | print(f"Error retrieving speaker embeddings: {str(e)}") 115 | return {} 116 | 117 | def log_recognition_result(audio_path, speaker, confidence, transcription, emotion): 118 | """Log recognition results to the database""" 119 | try: 120 | conn = get_db_connection() 121 | cur = conn.cursor() 122 | 123 | cur.execute( 124 | """INSERT INTO recognition_logs 125 | (audio_path, identified_speaker, confidence, transcription, emotion) 126 | VALUES (%s, %s, %s, %s, %s)""", 127 | (audio_path, speaker, confidence, transcription, emotion) 128 | ) 129 | 130 | conn.commit() 131 | cur.close() 132 | conn.close() 133 | return True 134 | except Exception as e: 135 | print(f"Error logging recognition result: {str(e)}") 136 | return False 137 | 138 | if __name__ == "__main__": 139 | initialize_database() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Speaker Recognition and Audio Processing API 2 | 3 | A comprehensive system for speaker recognition, speech transcription, emotion detection, and text-to-speech conversion with a modern REST API interface. 
4 | 5 | ## Features 6 | 7 | - **Speaker Recognition**: Enroll and identify speakers based on voice characteristics using ECAPA-TDNN embeddings 8 | - **Speech Transcription**: Convert speech to text using Whisper models 9 | - **Emotion Detection**: Detect emotions in speech 10 | - **Text-to-Speech**: Generate natural-sounding speech from text with Indian language support 11 | - **PostgreSQL Integration**: Store speaker embeddings and recognition logs in a PostgreSQL database 12 | - **RESTful API**: Easy-to-use API for all functionalities 13 | - **Optional Docker Support**: Containerized deployment option with Docker and Docker Compose 14 | 15 | ## Models Used 16 | 17 | This project leverages several state-of-the-art deep learning models: 18 | 19 | - **Speaker Recognition**: [speechbrain/spkrec-ecapa-voxceleb](https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb) - ECAPA-TDNN model trained on VoxCeleb 20 | - **Speech Transcription**: [openai/whisper-small](https://huggingface.co/openai/whisper-small) - Whisper automatic speech recognition model 21 | - **Emotion Detection**: [wav2vec2 model for emotion recognition](https://huggingface.co/ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition)- Facebook's Wav2vec for emotion detection 22 | - **Text-to-Speech**: [Parler TTS for Indian languages](https://huggingface.co/ai4bharat/indic-parler-tts) - Indic Parler TTS for text to speech in indian accent & voices 23 | 24 | ## Architecture 25 | 26 | The project is structured into several modules: 27 | 28 | 1. **db_setup.py**: Database initialization and operations 29 | 2. **model_loader.py**: Model loading and management 30 | 3. **speaker_recognition.py**: Core recognition and processing functions 31 | 4. **api.py**: FastAPI server and API endpoints 32 | 33 | ## API Endpoints 34 | 35 | - `POST /enroll-speaker/`: Enroll a new speaker with audio samples 36 | - `POST /identify-speaker/`: Identify a speaker in an audio file 37 | - `POST /transcribe/`: Transcribe speech in an audio file 38 | - `POST /detect-emotion/`: Detect emotion in an audio file 39 | - `POST /text-to-speech/`: Convert text to speech 40 | - `POST /process-audio/`: Complete processing (identification, transcription, emotion detection) 41 | - `GET /download-tts/{filename}`: Download generated TTS files 42 | 43 | ## Setup Instructions 44 | 45 | ### Prerequisites 46 | 47 | - Python 3.9+ 48 | - PostgreSQL 49 | - CUDA-compatible GPU (recommended but not required) 50 | 51 | ### Standard Installation 52 | 53 | 1. Clone the repository: 54 | ```bash 55 | git clone https://github.com/bhargavak04/Speaker-Identification-Emotion-Detection 56 | ``` 57 | 58 | 2. Create and activate a virtual environment: 59 | ```bash 60 | python -m venv venv 61 | source venv/bin/activate # On Windows: venv\Scripts\activate 62 | ``` 63 | 64 | 3. Install dependencies: 65 | ```bash 66 | pip install -r requirements.txt 67 | ``` 68 | 69 | 4. Create a `.env` file with your configuration: 70 | ``` 71 | # Database Configuration 72 | DB_HOST=localhost 73 | DB_NAME=speaker_recognition 74 | DB_USER=postgres 75 | DB_PASSWORD=your_password 76 | DB_PORT=5432 77 | 78 | # Model Paths 79 | WHISPER_MODEL_PATH=/path/to/whisper-small 80 | EMOTION_MODEL_PATH=/path/to/emotion-model 81 | TTS_MODEL_PATH=/path/to/tts-model 82 | ``` 83 | 84 | 5. Initialize the database: 85 | ```bash 86 | python db_setup.py 87 | ``` 88 | 89 | 6. Start the API server: 90 | ```bash 91 | python api.py 92 | ``` 93 | 94 | 7. 
Access the API at http://localhost:8000 95 | 96 | ### Docker Installation (Optional) 97 | 98 | If you prefer containerized deployment, you can use Docker: 99 | 100 | 1. Make sure Docker and Docker Compose are installed on your system. 101 | 102 | 2. Create a `.env` file with your configuration (as above). 103 | 104 | 3. Build and start the containers: 105 | ```bash 106 | docker-compose up -d 107 | ``` 108 | 109 | 4. Access the API at http://localhost:8000 110 | 111 | ## Usage Examples 112 | 113 | ### Enrolling a Speaker 114 | 115 | ```bash 116 | curl -X POST "http://localhost:8000/enroll-speaker/" \ 117 | -F "speaker_name=John" \ 118 | -F "audio_files=@sample1.wav" \ 119 | -F "audio_files=@sample2.wav" 120 | ``` 121 | 122 | ### Identifying a Speaker 123 | 124 | ```bash 125 | curl -X POST "http://localhost:8000/identify-speaker/" \ 126 | -F "audio_file=@unknown_speaker.wav" 127 | ``` 128 | 129 | ### Processing an Audio File 130 | 131 | ```bash 132 | curl -X POST "http://localhost:8000/process-audio/" \ 133 | -F "audio_file=@sample.wav" 134 | ``` 135 | 136 | -------------------------------------------------------------------------------- /api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | from pathlib import Path 4 | from fastapi import FastAPI, UploadFile, File, Form, HTTPException 5 | from fastapi.responses import FileResponse, JSONResponse 6 | from fastapi.middleware.cors import CORSMiddleware 7 | from typing import List, Optional 8 | import shutil 9 | import tempfile 10 | 11 | # Import our modules 12 | from db_setup import initialize_database 13 | from speaker_recognition import ( 14 | enroll_speaker, 15 | identify_speaker, 16 | transcribe_audio, 17 | detect_emotion, 18 | tts, 19 | process_audio 20 | ) 21 | 22 | # Initialize database 23 | initialize_database() 24 | 25 | # Create uploads directory for temporary storage 26 | UPLOAD_DIR = Path("uploads") 27 | UPLOAD_DIR.mkdir(exist_ok=True) 28 | 29 | app = FastAPI(title="Speaker Recognition API", description="API for speaker recognition, transcription, and emotion detection") 30 | 31 | # Configure CORS 32 | app.add_middleware( 33 | CORSMiddleware, 34 | allow_origins=["*"], # For production, specify exact domains 35 | allow_credentials=True, 36 | allow_methods=["*"], 37 | allow_headers=["*"], 38 | ) 39 | 40 | @app.get("/") 41 | async def root(): 42 | return {"message": "Welcome to the Speaker Recognition API"} 43 | 44 | @app.post("/enroll-speaker/") 45 | async def api_enroll_speaker( 46 | speaker_name: str = Form(...), 47 | audio_files: List[UploadFile] = File(...) 
48 | ): 49 | """Enroll a new speaker with one or more audio samples""" 50 | if not audio_files: 51 | raise HTTPException(status_code=400, detail="No audio files provided") 52 | 53 | # Create temporary directory for this enrollment 54 | temp_dir = UPLOAD_DIR / f"enroll_{uuid.uuid4()}" 55 | temp_dir.mkdir(exist_ok=True) 56 | 57 | try: 58 | saved_files = [] 59 | 60 | # Save uploaded files 61 | for audio_file in audio_files: 62 | file_path = temp_dir / audio_file.filename 63 | with open(file_path, "wb") as buffer: 64 | shutil.copyfileobj(audio_file.file, buffer) 65 | saved_files.append(str(file_path)) 66 | 67 | # Enroll the speaker 68 | success, message = enroll_speaker(speaker_name, saved_files) 69 | 70 | if success: 71 | return {"status": "success", "message": message} 72 | else: 73 | raise HTTPException(status_code=400, detail=message) 74 | 75 | except Exception as e: 76 | raise HTTPException(status_code=500, detail=f"Error enrolling speaker: {str(e)}") 77 | 78 | finally: 79 | # Clean up temporary files 80 | if temp_dir.exists(): 81 | shutil.rmtree(temp_dir) 82 | 83 | @app.post("/identify-speaker/") 84 | async def api_identify_speaker(audio_file: UploadFile = File(...)): 85 | """Identify the speaker in an audio file""" 86 | if not audio_file: 87 | raise HTTPException(status_code=400, detail="No audio file provided") 88 | 89 | # Save uploaded file 90 | temp_file = UPLOAD_DIR / f"identify_{uuid.uuid4()}_{audio_file.filename}" 91 | try: 92 | with open(temp_file, "wb") as buffer: 93 | shutil.copyfileobj(audio_file.file, buffer) 94 | 95 | # Identify the speaker 96 | speaker, confidence, message = identify_speaker(str(temp_file)) 97 | 98 | return { 99 | "speaker": speaker, 100 | "confidence": round(float(confidence), 2), 101 | "message": message 102 | } 103 | 104 | except Exception as e: 105 | raise HTTPException(status_code=500, detail=f"Error identifying speaker: {str(e)}") 106 | 107 | finally: 108 | # Clean up 109 | if temp_file.exists(): 110 | temp_file.unlink() 111 | 112 | @app.post("/transcribe/") 113 | async def api_transcribe(audio_file: UploadFile = File(...)): 114 | """Transcribe speech in an audio file""" 115 | if not audio_file: 116 | raise HTTPException(status_code=400, detail="No audio file provided") 117 | 118 | # Save uploaded file 119 | temp_file = UPLOAD_DIR / f"transcribe_{uuid.uuid4()}_{audio_file.filename}" 120 | try: 121 | with open(temp_file, "wb") as buffer: 122 | shutil.copyfileobj(audio_file.file, buffer) 123 | 124 | # Transcribe the audio 125 | transcription = transcribe_audio(str(temp_file)) 126 | 127 | return {"transcription": transcription} 128 | 129 | except Exception as e: 130 | raise HTTPException(status_code=500, detail=f"Error transcribing audio: {str(e)}") 131 | 132 | finally: 133 | # Clean up 134 | if temp_file.exists(): 135 | temp_file.unlink() 136 | 137 | @app.post("/detect-emotion/") 138 | async def api_detect_emotion(audio_file: UploadFile = File(...)): 139 | """Detect emotion in an audio file""" 140 | if not audio_file: 141 | raise HTTPException(status_code=400, detail="No audio file provided") 142 | 143 | # Save uploaded file 144 | temp_file = UPLOAD_DIR / f"emotion_{uuid.uuid4()}_{audio_file.filename}" 145 | try: 146 | with open(temp_file, "wb") as buffer: 147 | shutil.copyfileobj(audio_file.file, buffer) 148 | 149 | # Detect emotion 150 | emotion = detect_emotion(str(temp_file)) 151 | 152 | return {"emotion": emotion} 153 | 154 | except Exception as e: 155 | raise HTTPException(status_code=500, detail=f"Error detecting emotion: {str(e)}") 156 | 157 | 
finally: 158 | # Clean up 159 | if temp_file.exists(): 160 | temp_file.unlink() 161 | 162 | @app.post("/text-to-speech/") 163 | async def api_text_to_speech(text: str = Form(...)): 164 | """Convert text to speech""" 165 | try: 166 | # Generate a unique output path 167 | output_file = UPLOAD_DIR / f"tts_{uuid.uuid4()}.wav" 168 | 169 | # Generate speech 170 | output_path = tts(text, str(output_file)) 171 | 172 | if output_path: 173 | # Return the audio file 174 | return FileResponse( 175 | output_path, 176 | media_type="audio/wav", 177 | filename=os.path.basename(output_path) 178 | ) 179 | else: 180 | raise HTTPException(status_code=500, detail="Failed to generate speech") 181 | 182 | except Exception as e: 183 | raise HTTPException(status_code=500, detail=f"Error generating speech: {str(e)}") 184 | 185 | @app.post("/process-audio/") 186 | async def api_process_audio(audio_file: UploadFile = File(...)): 187 | """Process audio for speaker identification, transcription, and emotion detection""" 188 | if not audio_file: 189 | raise HTTPException(status_code=400, detail="No audio file provided") 190 | 191 | # Save uploaded file 192 | temp_file = UPLOAD_DIR / f"process_{uuid.uuid4()}_{audio_file.filename}" 193 | tts_output = None 194 | 195 | try: 196 | with open(temp_file, "wb") as buffer: 197 | shutil.copyfileobj(audio_file.file, buffer) 198 | 199 | # Process the audio 200 | results = process_audio(str(temp_file)) 201 | tts_output = results.get('tts_output_path') 202 | 203 | # If TTS output exists, modify the result to include a URL 204 | if tts_output and os.path.exists(tts_output): 205 | results['tts_output_url'] = f"/download-tts/{os.path.basename(tts_output)}" 206 | 207 | return results 208 | 209 | except Exception as e: 210 | raise HTTPException(status_code=500, detail=f"Error processing audio: {str(e)}") 211 | 212 | finally: 213 | # Clean up the input file but keep the TTS output 214 | if temp_file.exists(): 215 | temp_file.unlink() 216 | 217 | @app.get("/download-tts/{filename}") 218 | async def download_tts(filename: str): 219 | """Download a generated TTS file""" 220 | file_path = Path(tempfile.gettempdir()) / "speaker_recognition" / filename 221 | 222 | if not file_path.exists(): 223 | raise HTTPException(status_code=404, detail="File not found") 224 | 225 | return FileResponse( 226 | str(file_path), 227 | media_type="audio/wav", 228 | filename=filename 229 | ) 230 | 231 | if __name__ == "__main__": 232 | import uvicorn 233 | uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True) -------------------------------------------------------------------------------- /local_try.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | import torchaudio 5 | import librosa 6 | import webrtcvad 7 | from pydub import AudioSegment 8 | from pathlib import Path 9 | from speechbrain.pretrained import SpeakerRecognition 10 | from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor 11 | from parler_tts import ParlerTTSForConditionalGeneration 12 | from transformers import AutoTokenizer 13 | import soundfile as sf 14 | 15 | # Set device to GPU if available 16 | device = torch.device("cpu") 17 | os.environ["HF_HUB_LOCAL_STRATEGY"] = "copy" 18 | 19 | 20 | print("Loading models...") 21 | spkrec = SpeakerRecognition.from_hparams( 22 | source="speechbrain/spkrec-ecapa-voxceleb", 23 | savedir="pretrained_ecapa_tdnn" 24 | ).to(device) 25 | 26 | whisper_processor = 
WhisperProcessor.from_pretrained(r"C:\Users\bharg\Downloads\SID&ED\ModelOpen\openw\whisper-small") 27 | whisper_model = WhisperForConditionalGeneration.from_pretrained(r"C:\Users\bharg\Downloads\SID&ED\ModelOpen\openw\whisper-small") 28 | 29 | emotion_classifier = pipeline("audio-classification", model=r"C:\Users\bharg\Downloads\SID&ED\ModelOpen\wavv2vec") 30 | 31 | tts_model = ParlerTTSForConditionalGeneration.from_pretrained(r"C:\Users\bharg\Downloads\SID&ED\indic_parler").to(device) 32 | tts_tokenizer = AutoTokenizer.from_pretrained(r"C:\Users\bharg\Downloads\SID&ED\indic_parler") 33 | description_tokenizer = AutoTokenizer.from_pretrained(tts_model.config.text_encoder._name_or_path) # AutoTokenizer.from_pretrained(tts_model.config.text_encoder._name_or_path) 34 | 35 | # Constants 36 | ENROLLMENT_DIR = Path("enrolled_speakers") 37 | ENROLLMENT_DIR.mkdir(exist_ok=True) 38 | EMBEDDING_DIM = 192 39 | 40 | def convert_to_wav(file_path): 41 | #Converts audio to WAV if it's not already in WAV format. 42 | if file_path.lower().endswith(".wav"): 43 | return file_path 44 | 45 | try: 46 | new_file_path = file_path.rsplit(".", 1)[0] + ".wav" 47 | audio = AudioSegment.from_file(file_path) 48 | audio = audio.set_frame_rate(16000).set_channels(1) # Ensure 16kHz mono 49 | audio.export(new_file_path, format="wav") 50 | return new_file_path 51 | except Exception as e: 52 | print(f"Error converting {file_path} to WAV: {str(e)}") 53 | return None 54 | 55 | def apply_vad(audio_path, aggressiveness=3): 56 | """Removes silence & noise using WebRTC VAD.""" 57 | try: 58 | signal, sr = librosa.load(audio_path, sr=16000, mono=True) 59 | signal = (signal * 32767).astype(np.int16) 60 | vad = webrtcvad.Vad(aggressiveness) 61 | frame_length = int(16000 * 0.03) 62 | signal = signal[:len(signal) - (len(signal) % frame_length)] 63 | frames = np.array_split(signal, len(signal) // frame_length) 64 | voiced_frames = [frame for frame in frames if vad.is_speech(frame.tobytes(), 16000)] 65 | 66 | if not voiced_frames: 67 | return None 68 | 69 | return np.concatenate(voiced_frames) 70 | except Exception as e: 71 | print(f"Error in VAD processing: {str(e)}") 72 | return None 73 | 74 | def extract_speaker_embedding(audio_path, spkrec_model=spkrec): 75 | """Extracts speaker embedding using ECAPA-TDNN model.""" 76 | try: 77 | signal, sr = torchaudio.load(audio_path) 78 | if sr != 16000: 79 | signal = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(signal) 80 | if signal.shape[0] > 1: 81 | signal = signal.mean(dim=0, keepdim=True) 82 | if signal.shape[1] < 16000: 83 | return None 84 | 85 | # Device handling 86 | spkrec_model = spkrec_model.to(device) 87 | signal = signal.to(device) 88 | 89 | with torch.no_grad(): 90 | embedding = spkrec_model.encode_batch(signal).squeeze().cpu().numpy() 91 | 92 | # Ensure consistent embedding dimension 93 | embedding = embedding[:EMBEDDING_DIM] if len(embedding) > EMBEDDING_DIM else np.pad( 94 | embedding, (0, EMBEDDING_DIM - len(embedding)), 'constant' 95 | ) 96 | 97 | return embedding 98 | except Exception as e: 99 | print(f"Error extracting embedding: {str(e)}") 100 | return None 101 | 102 | def enroll_speaker(speaker_name, audio_files): 103 | """Extract and store speaker embeddings from given audio files.""" 104 | embeddings = [] 105 | 106 | for audio_file in audio_files: 107 | wav_file = convert_to_wav(audio_file) 108 | if not wav_file: 109 | continue 110 | 111 | vad_audio = apply_vad(wav_file) 112 | if vad_audio is None: 113 | continue 114 | 115 | embedding = 
extract_speaker_embedding(wav_file) 116 | if embedding is not None: 117 | embeddings.append(embedding) 118 | 119 | if not embeddings: 120 | print(f"Failed to enroll {speaker_name}: No valid embeddings extracted") 121 | return False 122 | 123 | speaker_embedding = np.mean(np.array(embeddings), axis=0) 124 | np.save(ENROLLMENT_DIR / f"{speaker_name}.npy", speaker_embedding) 125 | print(f"Enrolled {speaker_name} successfully!") 126 | return True 127 | 128 | def identify_speaker(audio_file): 129 | """Identify the speaker of a given audio file using cosine similarity.""" 130 | wav_file = convert_to_wav(audio_file) 131 | if not wav_file: 132 | return "Failed to convert audio to WAV format" 133 | 134 | vad_audio = apply_vad(wav_file) 135 | if vad_audio is None: 136 | return "No valid speech detected in audio" 137 | 138 | test_embedding = extract_speaker_embedding(wav_file) 139 | if test_embedding is None: 140 | return "Failed to extract speaker embedding" 141 | 142 | enrolled_speakers = {f.stem: np.load(f) for f in ENROLLMENT_DIR.glob("*.npy")} 143 | 144 | if not enrolled_speakers: 145 | return "No enrolled speakers found. Please enroll speakers first." 146 | 147 | scores = [(name, np.dot(test_embedding, emb) / (np.linalg.norm(test_embedding) * np.linalg.norm(emb))) 148 | for name, emb in enrolled_speakers.items()] 149 | scores.sort(key=lambda x: x[1], reverse=True) 150 | 151 | best_speaker, best_score = scores[0] 152 | second_best_score = scores[1][1] if len(scores) > 1 else 0 153 | 154 | if best_score > 0.5 and (best_score - second_best_score) > 0.05: 155 | return f"{best_speaker} (confidence: {best_score:.2f})" 156 | return "Unknown Speaker" 157 | 158 | def transcribe_audio(audio_file): 159 | """Transcribe speech using OpenAI Whisper.""" 160 | try: 161 | audio_file = convert_to_wav(audio_file) 162 | audio, _ = librosa.load(audio_file, sr=16000) 163 | input_features = whisper_processor(audio, sampling_rate=16000, return_tensors="pt").input_features 164 | predicted_ids = whisper_model.generate(input_features) 165 | return whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] 166 | except Exception as e: 167 | return f"Error transcribing audio: {str(e)}" 168 | 169 | def detect_emotion(audio_file): 170 | """Detect emotion from the audio file.""" 171 | try: 172 | audio_file = convert_to_wav(audio_file) 173 | result = emotion_classifier(audio_file) 174 | return result[0]["label"] 175 | except Exception as e: 176 | return f"Error detecting emotion: {str(e)}" 177 | 178 | def tts(prompt): 179 | description = "Divya's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise." 
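    # Note: the description string above is the Parler-TTS voice prompt; it conditions the speaker
    # characteristics, pacing, and recording quality, so editing it selects a different voice.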
180 |     description_input_ids = description_tokenizer(description, return_tensors="pt").to(device)
181 |     prompt_input_ids = tts_tokenizer(prompt, return_tensors="pt").to(device)
182 | 
183 |     generation = tts_model.generate(input_ids=description_input_ids.input_ids, attention_mask=description_input_ids.attention_mask, prompt_input_ids=prompt_input_ids.input_ids, prompt_attention_mask=prompt_input_ids.attention_mask)
184 |     audio_arr = generation.cpu().numpy().squeeze()
185 |     sf.write("indic_tts_out_hindib.wav", audio_arr, tts_model.config.sampling_rate)
186 |     return "indic_tts_out_hindib.wav created successfully"
187 | 
188 | def process_audio(audio_file):
189 |     """Process the audio file to identify speaker, transcribe, and detect emotion."""
190 |     speaker = identify_speaker(audio_file)
191 |     transcription = transcribe_audio(audio_file)
192 |     emotion = detect_emotion(audio_file)
193 |     tts_result = tts(prompt=transcription)  # Run TTS once and reuse the result below
194 | 
195 |     print("\nResults:")
196 |     print(f"Speaker: {speaker}")
197 |     print(f"Transcription: {transcription}")
198 |     print(f"Emotion: {emotion}")
199 |     print(f"TTS: {tts_result}")
200 | 
201 | # Main Execution
202 | if __name__ == "__main__":
203 |     audio_directory = "Audio_Formatted"
204 |     print("Enrolling speakers...")
205 |     for speaker in os.listdir(audio_directory):
206 |         enroll_speaker(speaker, [os.path.join(audio_directory, speaker, f) for f in os.listdir(os.path.join(audio_directory, speaker))])
207 | 
208 |     test_audio = r"C:\Users\bharg\Downloads\SID&ED\Audio_Formatted\Prudhvi\Prudhvi_kumar_Surprise.wav"
209 |     process_audio(test_audio)
210 | 
--------------------------------------------------------------------------------
/speaker_recognition.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import numpy as np
4 | import torchaudio
5 | import librosa
6 | import webrtcvad
7 | from pydub import AudioSegment
8 | from pathlib import Path
9 | import soundfile as sf
10 | import tempfile
11 | 
12 | # Import model loading functions
13 | from model_loader import (
14 |     get_speaker_recognition_model,
15 |     get_whisper_model_and_processor,
16 |     get_emotion_classifier,
17 |     get_tts_model_and_tokenizers
18 | )
19 | 
20 | # Import database functions
21 | from db_setup import store_speaker_embedding, get_all_speaker_embeddings, log_recognition_result
22 | 
23 | # Set device to GPU if available
24 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25 | os.environ["HF_HUB_LOCAL_STRATEGY"] = "copy"
26 | 
27 | # Load models
28 | print("Loading models...")
29 | spkrec = get_speaker_recognition_model()
30 | whisper_model, whisper_processor = get_whisper_model_and_processor()
31 | emotion_classifier = get_emotion_classifier()
32 | tts_model, tts_tokenizer, description_tokenizer = get_tts_model_and_tokenizers()
33 | 
34 | # Constants
35 | EMBEDDING_DIM = 192
36 | TEMP_DIR = Path(tempfile.gettempdir()) / "speaker_recognition"
37 | TEMP_DIR.mkdir(exist_ok=True)
38 | 
39 | def convert_to_wav(file_path):
40 |     """Converts audio to WAV if it's not already in WAV format."""
41 |     if file_path.lower().endswith(".wav"):
42 |         return file_path
43 | 
44 |     try:
45 |         file_name = Path(file_path).name
46 |         new_file_path = str(TEMP_DIR / f"{file_name.rsplit('.', 1)[0]}.wav")
47 |         audio = AudioSegment.from_file(file_path)
48 |         audio = audio.set_frame_rate(16000).set_channels(1)  # Ensure 16kHz mono
49 |         audio.export(new_file_path, format="wav")
50 |         return new_file_path
51 |     except Exception as e:
52 |         print(f"Error converting
{file_path} to WAV: {str(e)}") 53 | return None 54 | 55 | def apply_vad(audio_path, aggressiveness=3): 56 | """Removes silence & noise using WebRTC VAD.""" 57 | try: 58 | signal, sr = librosa.load(audio_path, sr=16000, mono=True) 59 | signal = (signal * 32767).astype(np.int16) 60 | vad = webrtcvad.Vad(aggressiveness) 61 | frame_length = int(16000 * 0.03) 62 | signal = signal[:len(signal) - (len(signal) % frame_length)] 63 | frames = np.array_split(signal, len(signal) // frame_length) 64 | voiced_frames = [frame for frame in frames if vad.is_speech(frame.tobytes(), 16000)] 65 | 66 | if not voiced_frames: 67 | return None 68 | 69 | return np.concatenate(voiced_frames) 70 | except Exception as e: 71 | print(f"Error in VAD processing: {str(e)}") 72 | return None 73 | 74 | def extract_speaker_embedding(audio_path, spkrec_model=spkrec): 75 | """Extracts speaker embedding using ECAPA-TDNN model.""" 76 | try: 77 | signal, sr = torchaudio.load(audio_path) 78 | if sr != 16000: 79 | signal = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(signal) 80 | if signal.shape[0] > 1: 81 | signal = signal.mean(dim=0, keepdim=True) 82 | if signal.shape[1] < 16000: 83 | return None 84 | 85 | # Device handling 86 | spkrec_model = spkrec_model.to(device) 87 | signal = signal.to(device) 88 | 89 | with torch.no_grad(): 90 | embedding = spkrec_model.encode_batch(signal).squeeze().cpu().numpy() 91 | 92 | # Ensure consistent embedding dimension 93 | embedding = embedding[:EMBEDDING_DIM] if len(embedding) > EMBEDDING_DIM else np.pad( 94 | embedding, (0, EMBEDDING_DIM - len(embedding)), 'constant' 95 | ) 96 | 97 | return embedding 98 | except Exception as e: 99 | print(f"Error extracting embedding: {str(e)}") 100 | return None 101 | 102 | def enroll_speaker(speaker_name, audio_files): 103 | """Extract and store speaker embeddings from given audio files.""" 104 | embeddings = [] 105 | 106 | for audio_file in audio_files: 107 | wav_file = convert_to_wav(audio_file) 108 | if not wav_file: 109 | continue 110 | 111 | vad_audio = apply_vad(wav_file) 112 | if vad_audio is None: 113 | continue 114 | 115 | embedding = extract_speaker_embedding(wav_file) 116 | if embedding is not None: 117 | embeddings.append(embedding) 118 | 119 | if not embeddings: 120 | return False, "No valid embeddings extracted" 121 | 122 | speaker_embedding = np.mean(np.array(embeddings), axis=0) 123 | 124 | # Store in PostgreSQL database 125 | success = store_speaker_embedding(speaker_name, speaker_embedding) 126 | if success: 127 | return True, f"Enrolled {speaker_name} successfully!" 128 | return False, f"Failed to enroll {speaker_name} in database" 129 | 130 | def identify_speaker(audio_file): 131 | """Identify the speaker of a given audio file using cosine similarity.""" 132 | wav_file = convert_to_wav(audio_file) 133 | if not wav_file: 134 | return "Unknown Speaker", 0.0, "Failed to convert audio to WAV format" 135 | 136 | vad_audio = apply_vad(wav_file) 137 | if vad_audio is None: 138 | return "Unknown Speaker", 0.0, "No valid speech detected in audio" 139 | 140 | test_embedding = extract_speaker_embedding(wav_file) 141 | if test_embedding is None: 142 | return "Unknown Speaker", 0.0, "Failed to extract speaker embedding" 143 | 144 | # Get all enrolled speakers from database 145 | enrolled_speakers = get_all_speaker_embeddings() 146 | 147 | if not enrolled_speakers: 148 | return "Unknown Speaker", 0.0, "No enrolled speakers found. Please enroll speakers first." 
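    # Rank enrolled speakers by cosine similarity against the test embedding; accept the best match
    # only if it clears an absolute threshold (0.5) and beats the runner-up by a clear margin (0.05).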
149 | 
150 |     scores = [(name, np.dot(test_embedding, emb) / (np.linalg.norm(test_embedding) * np.linalg.norm(emb)))
151 |               for name, emb in enrolled_speakers.items()]
152 |     scores.sort(key=lambda x: x[1], reverse=True)
153 | 
154 |     best_speaker, best_score = scores[0]
155 |     second_best_score = scores[1][1] if len(scores) > 1 else 0
156 | 
157 |     if best_score > 0.5 and (best_score - second_best_score) > 0.05:
158 |         return best_speaker, best_score, "Speaker identified successfully"
159 |     return "Unknown Speaker", best_score, "Speaker confidence too low"
160 | 
161 | def transcribe_audio(audio_file):
162 |     """Transcribe speech using Whisper."""
163 |     try:
164 |         audio_file = convert_to_wav(audio_file)
165 |         audio, _ = librosa.load(audio_file, sr=16000)
166 |         input_features = whisper_processor(audio, sampling_rate=16000, return_tensors="pt").input_features
167 | 
168 |         # Move to device
169 |         input_features = input_features.to(device)
170 |         whisper_model.to(device)
171 | 
172 |         predicted_ids = whisper_model.generate(input_features)
173 |         return whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
174 |     except Exception as e:
175 |         return f"Error transcribing audio: {str(e)}"
176 | 
177 | def detect_emotion(audio_file):
178 |     """Detect emotion from the audio file."""
179 |     try:
180 |         audio_file = convert_to_wav(audio_file)
181 |         result = emotion_classifier(audio_file)
182 |         return result[0]["label"]
183 |     except Exception as e:
184 |         return f"Error detecting emotion: {str(e)}"
185 | 
186 | def tts(prompt, output_path=None):
187 |     """Text to speech conversion."""
188 |     try:
189 |         description = "Divya's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."
190 |         description_input_ids = description_tokenizer(description, return_tensors="pt").to(device)
191 |         prompt_input_ids = tts_tokenizer(prompt, return_tensors="pt").to(device)
192 | 
193 |         generation = tts_model.generate(
194 |             input_ids=description_input_ids.input_ids,
195 |             attention_mask=description_input_ids.attention_mask,
196 |             prompt_input_ids=prompt_input_ids.input_ids,
197 |             prompt_attention_mask=prompt_input_ids.attention_mask
198 |         )
199 | 
200 |         audio_arr = generation.cpu().numpy().squeeze()
201 | 
202 |         if output_path is None:
203 |             output_path = str(TEMP_DIR / "indic_tts_output.wav")
204 | 
205 |         sf.write(output_path, audio_arr, tts_model.config.sampling_rate)
206 |         return output_path
207 |     except Exception as e:
208 |         print(f"Error in TTS: {str(e)}")
209 |         return None
210 | 
211 | def process_audio(audio_file):
212 |     """Process the audio file to identify speaker, transcribe, and detect emotion."""
213 |     speaker, confidence, message = identify_speaker(audio_file)
214 |     transcription = transcribe_audio(audio_file)
215 |     emotion = detect_emotion(audio_file)
216 |     tts_output_path = tts(prompt=transcription)
217 | 
218 |     # Log results to database
219 |     log_recognition_result(audio_file, speaker, confidence, transcription, emotion)
220 | 
221 |     results = {
222 |         "speaker": speaker,
223 |         "confidence": round(float(confidence), 2),  # identify_speaker always returns a numeric score (possibly a NumPy float)
224 |         "transcription": transcription,
225 |         "emotion": emotion,
226 |         "tts_output_path": tts_output_path,
227 |         "message": message
228 |     }
229 | 
230 |     return results
--------------------------------------------------------------------------------
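
Usage example (not part of the repository): calling the /text-to-speech/ endpoint defined in api.py. A minimal sketch that assumes the API server is running locally on port 8000; the text is sent as a form field and the endpoint streams back a WAV file, which is saved to disk.

import requests

# Request speech synthesis; "text" is a form field (see the /text-to-speech/ route in api.py).
response = requests.post(
    "http://localhost:8000/text-to-speech/",
    data={"text": "Namaste, this is a test of the text-to-speech endpoint."},
)
response.raise_for_status()

# The response body is the generated WAV audio; write it to a local file.
with open("tts_output.wav", "wb") as f:
    f.write(response.content)
print("Saved tts_output.wav")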