├── indic_tts_out_hindi.wav
├── indic_tts_out_hindi2.wav
├── indic_tts_out_hindib.wav
├── .gitignore
├── requirements.txt
├── enroll_try.py
├── model_loader.py
├── db_setup.py
├── README.md
├── api.py
├── local_try.py
└── speaker_recognition.py


/indic_tts_out_hindi.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bhargavak04/Speaker-Identification-Emotion-Detection/HEAD/indic_tts_out_hindi.wav


--------------------------------------------------------------------------------
/indic_tts_out_hindi2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bhargavak04/Speaker-Identification-Emotion-Detection/HEAD/indic_tts_out_hindi2.wav


--------------------------------------------------------------------------------
/indic_tts_out_hindib.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bhargavak04/Speaker-Identification-Emotion-Detection/HEAD/indic_tts_out_hindib.wav


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | env/
 2 | ModelOpen/
 3 | pretrained_ecapa_tdnn/
 4 | Audio_Formatted/
 5 | enrolled_speakers/
 6 | envs/
 7 | indic_parler/
 8 | enroll_try.py
 9 | indic_try.py
10 | main.py
11 | proc_aud_api.py
12 | speaker_database.db
13 | temp_Bode_yaswanth_kumar_Angry.mp3
14 | test_yaswanth_neu.wav
15 | with_bow.py


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | fastapi==0.103.1
 2 | uvicorn==0.23.2
 3 | python-multipart==0.0.6
 4 | torch==2.0.1
 5 | torchaudio==2.0.2
 6 | numpy==1.24.3
 7 | librosa==0.10.1
 8 | webrtcvad==2.0.10
 9 | pydub==0.25.1
10 | speechbrain==0.5.15
11 | transformers==4.31.0
12 | psycopg2-binary==2.9.7
13 | python-dotenv==1.0.0
14 | soundfile==0.12.1
15 | parler-tts==0.0.4


--------------------------------------------------------------------------------
/enroll_try.py:
--------------------------------------------------------------------------------
1 | import requests
2 | 
3 | url = "http://localhost:8000/enroll-speaker/"
4 | files = {'audio': open(r'C:\Users\bharg\Downloads\SID&ED\Audio_Formatted\Yaswanth\Bode_yaswanth_kumar_Angry.mp3', 'rb')}
5 | params = {'name': 'Yaswanth'}  # This should be sent as query parameters
6 | 
7 | response = requests.post(url, files=files, params=params)
8 | print(response.json())


--------------------------------------------------------------------------------
/model_loader.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | from speechbrain.pretrained import SpeakerRecognition
 4 | from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor
 5 | from parler_tts import ParlerTTSForConditionalGeneration
 6 | from transformers import AutoTokenizer
 7 | 
 8 | # Set device to GPU if available
 9 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
10 | 
11 | # Define model paths (you can modify these to use environment variables)
12 | WHISPER_MODEL_PATH = os.getenv('WHISPER_MODEL_PATH', r"C:\Users\bharg\Downloads\SID&ED\ModelOpen\openw\whisper-small")
13 | EMOTION_MODEL_PATH = os.getenv('EMOTION_MODEL_PATH', r"C:\Users\bharg\Downloads\SID&ED\ModelOpen\wavv2vec")
14 | TTS_MODEL_PATH = os.getenv('TTS_MODEL_PATH', r"C:\Users\bharg\Downloads\SID&ED\indic_parler")
15 | 
16 | def get_speaker_recognition_model():
17 |     """Load and return the speaker recognition model"""
18 |     return SpeakerRecognition.from_hparams(
19 |         source="speechbrain/spkrec-ecapa-voxceleb",
20 |         savedir="pretrained_ecapa_tdnn"
21 |     ).to(device)
22 | 
23 | def get_whisper_model_and_processor():
24 |     """Load and return the Whisper model and processor"""
25 |     processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_PATH)
26 |     model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL_PATH).to(device)
27 |     return model, processor
28 | 
29 | def get_emotion_classifier():
30 |     """Load and return the emotion classification model"""
31 |     return pipeline("audio-classification", model=EMOTION_MODEL_PATH, device=0 if torch.cuda.is_available() else -1)
32 | 
33 | def get_tts_model_and_tokenizers():
34 |     """Load and return the TTS model and tokenizers"""
35 |     model = ParlerTTSForConditionalGeneration.from_pretrained(TTS_MODEL_PATH).to(device)
36 |     tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_PATH)
37 |     description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
38 |     
39 |     return model, tokenizer, description_tokenizer


--------------------------------------------------------------------------------
/db_setup.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import psycopg2
  3 | from psycopg2.extras import execute_values
  4 | import numpy as np
  5 | from dotenv import load_dotenv
  6 | 
  7 | # Load environment variables
  8 | load_dotenv()
  9 | 
 10 | # Database connection parameters
 11 | DB_HOST = os.getenv('DB_HOST', 'localhost')
 12 | DB_NAME = os.getenv('DB_NAME', 'speaker_recognition')
 13 | DB_USER = os.getenv('DB_USER', 'postgres')
 14 | DB_PASSWORD = os.getenv('DB_PASSWORD', 'password')
 15 | DB_PORT = os.getenv('DB_PORT', '5432')
 16 | 
 17 | def get_db_connection():
 18 |     """Create and return a database connection"""
 19 |     return psycopg2.connect(
 20 |         host=DB_HOST,
 21 |         database=DB_NAME,
 22 |         user=DB_USER,
 23 |         password=DB_PASSWORD,
 24 |         port=DB_PORT
 25 |     )
 26 | 
 27 | def initialize_database():
 28 |     """Create the necessary tables if they don't exist"""
 29 |     conn = get_db_connection()
 30 |     cur = conn.cursor()
 31 |     
 32 |     # Create speakers table
 33 |     cur.execute('''
 34 |     CREATE TABLE IF NOT EXISTS speakers (
 35 |         id SERIAL PRIMARY KEY,
 36 |         name VARCHAR(255) UNIQUE NOT NULL,
 37 |         embedding BYTEA NOT NULL,
 38 |         created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
 39 |     )
 40 |     ''')
 41 |     
 42 |     # Create a table for logging recognition results
 43 |     cur.execute('''
 44 |     CREATE TABLE IF NOT EXISTS recognition_logs (
 45 |         id SERIAL PRIMARY KEY,
 46 |         audio_path VARCHAR(255),
 47 |         identified_speaker VARCHAR(255),
 48 |         confidence FLOAT,
 49 |         transcription TEXT,
 50 |         emotion VARCHAR(50),
 51 |         timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
 52 |     )
 53 |     ''')
 54 |     
 55 |     conn.commit()
 56 |     cur.close()
 57 |     conn.close()
 58 |     
 59 |     print("Database initialized successfully")
 60 | 
 61 | def store_speaker_embedding(speaker_name, embedding):
 62 |     """Store a speaker embedding in the database"""
 63 |     try:
 64 |         conn = get_db_connection()
 65 |         cur = conn.cursor()
 66 |         
 67 |         # Convert numpy array to binary
 68 |         embedding_binary = embedding.tobytes()
 69 |         
 70 |         # Check if speaker already exists
 71 |         cur.execute("SELECT id FROM speakers WHERE name = %s", (speaker_name,))
 72 |         result = cur.fetchone()
 73 |         
 74 |         if result:
 75 |             # Update existing speaker
 76 |             cur.execute(
 77 |                 "UPDATE speakers SET embedding = %s WHERE name = %s",
 78 |                 (psycopg2.Binary(embedding_binary), speaker_name)
 79 |             )
 80 |         else:
 81 |             # Insert new speaker
 82 |             cur.execute(
 83 |                 "INSERT INTO speakers (name, embedding) VALUES (%s, %s)",
 84 |                 (speaker_name, psycopg2.Binary(embedding_binary))
 85 |             )
 86 |         
 87 |         conn.commit()
 88 |         cur.close()
 89 |         conn.close()
 90 |         return True
 91 |     except Exception as e:
 92 |         print(f"Error storing speaker embedding: {str(e)}")
 93 |         return False
 94 | 
 95 | def get_all_speaker_embeddings():
 96 |     """Retrieve all speaker embeddings from the database"""
 97 |     try:
 98 |         conn = get_db_connection()
 99 |         cur = conn.cursor()
100 |         
101 |         cur.execute("SELECT name, embedding FROM speakers")
102 |         results = cur.fetchall()
103 |         
104 |         speaker_embeddings = {}
105 |         for name, embedding_binary in results:
106 |             # Convert binary back to numpy array
107 |             embedding = np.frombuffer(embedding_binary, dtype=np.float32)
108 |             speaker_embeddings[name] = embedding
109 |         
110 |         cur.close()
111 |         conn.close()
112 |         return speaker_embeddings
113 |     except Exception as e:
114 |         print(f"Error retrieving speaker embeddings: {str(e)}")
115 |         return {}
116 | 
117 | def log_recognition_result(audio_path, speaker, confidence, transcription, emotion):
118 |     """Log recognition results to the database"""
119 |     try:
120 |         conn = get_db_connection()
121 |         cur = conn.cursor()
122 |         
123 |         cur.execute(
124 |             """INSERT INTO recognition_logs 
125 |                (audio_path, identified_speaker, confidence, transcription, emotion) 
126 |                VALUES (%s, %s, %s, %s, %s)""",
127 |             (audio_path, speaker, confidence, transcription, emotion)
128 |         )
129 |         
130 |         conn.commit()
131 |         cur.close()
132 |         conn.close()
133 |         return True
134 |     except Exception as e:
135 |         print(f"Error logging recognition result: {str(e)}")
136 |         return False
137 | 
138 | if __name__ == "__main__":
139 |     initialize_database()


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Speaker Recognition and Audio Processing API
  2 | 
  3 | A comprehensive system for speaker recognition, speech transcription, emotion detection, and text-to-speech conversion with a modern REST API interface.
  4 | 
  5 | ## Features
  6 | 
  7 | - **Speaker Recognition**: Enroll and identify speakers based on voice characteristics using ECAPA-TDNN embeddings
  8 | - **Speech Transcription**: Convert speech to text using Whisper models
  9 | - **Emotion Detection**: Detect emotions in speech
 10 | - **Text-to-Speech**: Generate natural-sounding speech from text with Indian language support
 11 | - **PostgreSQL Integration**: Store speaker embeddings and recognition logs in a PostgreSQL database
 12 | - **RESTful API**: Easy-to-use API for all functionalities
 13 | - **Optional Docker Support**: Containerized deployment option with Docker and Docker Compose
 14 | 
 15 | ## Models Used
 16 | 
 17 | This project leverages several state-of-the-art deep learning models:
 18 | 
 19 | - **Speaker Recognition**: [speechbrain/spkrec-ecapa-voxceleb](https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb) - ECAPA-TDNN model trained on VoxCeleb
 20 | - **Speech Transcription**: [openai/whisper-small](https://huggingface.co/openai/whisper-small) - Whisper automatic speech recognition model
 21 | - **Emotion Detection**: [wav2vec2 model for emotion recognition](https://huggingface.co/ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition)- Facebook's Wav2vec for emotion detection
 22 | - **Text-to-Speech**: [Parler TTS for Indian languages](https://huggingface.co/ai4bharat/indic-parler-tts) - Indic Parler TTS for text to speech in indian accent &  voices
 23 | 
 24 | ## Architecture
 25 | 
 26 | The project is structured into several modules:
 27 | 
 28 | 1. **db_setup.py**: Database initialization and operations
 29 | 2. **model_loader.py**: Model loading and management
 30 | 3. **speaker_recognition.py**: Core recognition and processing functions
 31 | 4. **api.py**: FastAPI server and API endpoints
 32 | 
 33 | ## API Endpoints
 34 | 
 35 | - `POST /enroll-speaker/`: Enroll a new speaker with audio samples
 36 | - `POST /identify-speaker/`: Identify a speaker in an audio file
 37 | - `POST /transcribe/`: Transcribe speech in an audio file
 38 | - `POST /detect-emotion/`: Detect emotion in an audio file
 39 | - `POST /text-to-speech/`: Convert text to speech
 40 | - `POST /process-audio/`: Complete processing (identification, transcription, emotion detection)
 41 | - `GET /download-tts/{filename}`: Download generated TTS files
 42 | 
 43 | ## Setup Instructions
 44 | 
 45 | ### Prerequisites
 46 | 
 47 | - Python 3.9+
 48 | - PostgreSQL
 49 | - CUDA-compatible GPU (recommended but not required)
 50 | 
 51 | ### Standard Installation
 52 | 
 53 | 1. Clone the repository:
 54 |    ```bash
 55 |    git clone https://github.com/bhargavak04/Speaker-Identification-Emotion-Detection
 56 |    ```
 57 | 
 58 | 2. Create and activate a virtual environment:
 59 |    ```bash
 60 |    python -m venv venv
 61 |    source venv/bin/activate  # On Windows: venv\Scripts\activate
 62 |    ```
 63 | 
 64 | 3. Install dependencies:
 65 |    ```bash
 66 |    pip install -r requirements.txt
 67 |    ```
 68 | 
 69 | 4. Create a `.env` file with your configuration:
 70 |    ```
 71 |    # Database Configuration
 72 |    DB_HOST=localhost
 73 |    DB_NAME=speaker_recognition
 74 |    DB_USER=postgres
 75 |    DB_PASSWORD=your_password
 76 |    DB_PORT=5432
 77 | 
 78 |    # Model Paths
 79 |    WHISPER_MODEL_PATH=/path/to/whisper-small
 80 |    EMOTION_MODEL_PATH=/path/to/emotion-model
 81 |    TTS_MODEL_PATH=/path/to/tts-model
 82 |    ```
 83 | 
 84 | 5. Initialize the database:
 85 |    ```bash
 86 |    python db_setup.py
 87 |    ```
 88 | 
 89 | 6. Start the API server:
 90 |    ```bash
 91 |    python api.py
 92 |    ```
 93 | 
 94 | 7. Access the API at http://localhost:8000
 95 | 
 96 | ### Docker Installation (Optional)
 97 | 
 98 | If you prefer containerized deployment, you can use Docker:
 99 | 
100 | 1. Make sure Docker and Docker Compose are installed on your system.
101 | 
102 | 2. Create a `.env` file with your configuration (as above).
103 | 
104 | 3. Build and start the containers:
105 |    ```bash
106 |    docker-compose up -d
107 |    ```
108 | 
109 | 4. Access the API at http://localhost:8000
110 | 
111 | ## Usage Examples
112 | 
113 | ### Enrolling a Speaker
114 | 
115 | ```bash
116 | curl -X POST "http://localhost:8000/enroll-speaker/" \
117 |   -F "speaker_name=John" \
118 |   -F "audio_files=@sample1.wav" \
119 |   -F "audio_files=@sample2.wav"
120 | ```
121 | 
122 | ### Identifying a Speaker
123 | 
124 | ```bash
125 | curl -X POST "http://localhost:8000/identify-speaker/" \
126 |   -F "audio_file=@unknown_speaker.wav"
127 | ```
128 | 
129 | ### Processing an Audio File
130 | 
131 | ```bash
132 | curl -X POST "http://localhost:8000/process-audio/" \
133 |   -F "audio_file=@sample.wav"
134 | ```
135 | 
136 | 


--------------------------------------------------------------------------------
/api.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import uuid
  3 | from pathlib import Path
  4 | from fastapi import FastAPI, UploadFile, File, Form, HTTPException
  5 | from fastapi.responses import FileResponse, JSONResponse
  6 | from fastapi.middleware.cors import CORSMiddleware
  7 | from typing import List, Optional
  8 | import shutil
  9 | import tempfile
 10 | 
 11 | # Import our modules
 12 | from db_setup import initialize_database
 13 | from speaker_recognition import (
 14 |     enroll_speaker, 
 15 |     identify_speaker, 
 16 |     transcribe_audio, 
 17 |     detect_emotion, 
 18 |     tts, 
 19 |     process_audio
 20 | )
 21 | 
 22 | # Initialize database
 23 | initialize_database()
 24 | 
 25 | # Create uploads directory for temporary storage
 26 | UPLOAD_DIR = Path("uploads")
 27 | UPLOAD_DIR.mkdir(exist_ok=True)
 28 | 
 29 | app = FastAPI(title="Speaker Recognition API", description="API for speaker recognition, transcription, and emotion detection")
 30 | 
 31 | # Configure CORS
 32 | app.add_middleware(
 33 |     CORSMiddleware,
 34 |     allow_origins=["*"],  # For production, specify exact domains
 35 |     allow_credentials=True,
 36 |     allow_methods=["*"],
 37 |     allow_headers=["*"],
 38 | )
 39 | 
 40 | @app.get("/")
 41 | async def root():
 42 |     return {"message": "Welcome to the Speaker Recognition API"}
 43 | 
 44 | @app.post("/enroll-speaker/")
 45 | async def api_enroll_speaker(
 46 |     speaker_name: str = Form(...),
 47 |     audio_files: List[UploadFile] = File(...)
 48 | ):
 49 |     """Enroll a new speaker with one or more audio samples"""
 50 |     if not audio_files:
 51 |         raise HTTPException(status_code=400, detail="No audio files provided")
 52 |     
 53 |     # Create temporary directory for this enrollment
 54 |     temp_dir = UPLOAD_DIR / f"enroll_{uuid.uuid4()}"
 55 |     temp_dir.mkdir(exist_ok=True)
 56 |     
 57 |     try:
 58 |         saved_files = []
 59 |         
 60 |         # Save uploaded files
 61 |         for audio_file in audio_files:
 62 |             file_path = temp_dir / audio_file.filename
 63 |             with open(file_path, "wb") as buffer:
 64 |                 shutil.copyfileobj(audio_file.file, buffer)
 65 |             saved_files.append(str(file_path))
 66 |         
 67 |         # Enroll the speaker
 68 |         success, message = enroll_speaker(speaker_name, saved_files)
 69 |         
 70 |         if success:
 71 |             return {"status": "success", "message": message}
 72 |         else:
 73 |             raise HTTPException(status_code=400, detail=message)
 74 |     
 75 |     except Exception as e:
 76 |         raise HTTPException(status_code=500, detail=f"Error enrolling speaker: {str(e)}")
 77 |     
 78 |     finally:
 79 |         # Clean up temporary files
 80 |         if temp_dir.exists():
 81 |             shutil.rmtree(temp_dir)
 82 | 
 83 | @app.post("/identify-speaker/")
 84 | async def api_identify_speaker(audio_file: UploadFile = File(...)):
 85 |     """Identify the speaker in an audio file"""
 86 |     if not audio_file:
 87 |         raise HTTPException(status_code=400, detail="No audio file provided")
 88 |     
 89 |     # Save uploaded file
 90 |     temp_file = UPLOAD_DIR / f"identify_{uuid.uuid4()}_{audio_file.filename}"
 91 |     try:
 92 |         with open(temp_file, "wb") as buffer:
 93 |             shutil.copyfileobj(audio_file.file, buffer)
 94 |         
 95 |         # Identify the speaker
 96 |         speaker, confidence, message = identify_speaker(str(temp_file))
 97 |         
 98 |         return {
 99 |             "speaker": speaker,
100 |             "confidence": round(float(confidence), 2),
101 |             "message": message
102 |         }
103 |     
104 |     except Exception as e:
105 |         raise HTTPException(status_code=500, detail=f"Error identifying speaker: {str(e)}")
106 |     
107 |     finally:
108 |         # Clean up
109 |         if temp_file.exists():
110 |             temp_file.unlink()
111 | 
112 | @app.post("/transcribe/")
113 | async def api_transcribe(audio_file: UploadFile = File(...)):
114 |     """Transcribe speech in an audio file"""
115 |     if not audio_file:
116 |         raise HTTPException(status_code=400, detail="No audio file provided")
117 |     
118 |     # Save uploaded file
119 |     temp_file = UPLOAD_DIR / f"transcribe_{uuid.uuid4()}_{audio_file.filename}"
120 |     try:
121 |         with open(temp_file, "wb") as buffer:
122 |             shutil.copyfileobj(audio_file.file, buffer)
123 |         
124 |         # Transcribe the audio
125 |         transcription = transcribe_audio(str(temp_file))
126 |         
127 |         return {"transcription": transcription}
128 |     
129 |     except Exception as e:
130 |         raise HTTPException(status_code=500, detail=f"Error transcribing audio: {str(e)}")
131 |     
132 |     finally:
133 |         # Clean up
134 |         if temp_file.exists():
135 |             temp_file.unlink()
136 | 
137 | @app.post("/detect-emotion/")
138 | async def api_detect_emotion(audio_file: UploadFile = File(...)):
139 |     """Detect emotion in an audio file"""
140 |     if not audio_file:
141 |         raise HTTPException(status_code=400, detail="No audio file provided")
142 |     
143 |     # Save uploaded file
144 |     temp_file = UPLOAD_DIR / f"emotion_{uuid.uuid4()}_{audio_file.filename}"
145 |     try:
146 |         with open(temp_file, "wb") as buffer:
147 |             shutil.copyfileobj(audio_file.file, buffer)
148 |         
149 |         # Detect emotion
150 |         emotion = detect_emotion(str(temp_file))
151 |         
152 |         return {"emotion": emotion}
153 |     
154 |     except Exception as e:
155 |         raise HTTPException(status_code=500, detail=f"Error detecting emotion: {str(e)}")
156 |     
157 |     finally:
158 |         # Clean up
159 |         if temp_file.exists():
160 |             temp_file.unlink()
161 | 
162 | @app.post("/text-to-speech/")
163 | async def api_text_to_speech(text: str = Form(...)):
164 |     """Convert text to speech"""
165 |     try:
166 |         # Generate a unique output path
167 |         output_file = UPLOAD_DIR / f"tts_{uuid.uuid4()}.wav"
168 |         
169 |         # Generate speech
170 |         output_path = tts(text, str(output_file))
171 |         
172 |         if output_path:
173 |             # Return the audio file
174 |             return FileResponse(
175 |                 output_path, 
176 |                 media_type="audio/wav", 
177 |                 filename=os.path.basename(output_path)
178 |             )
179 |         else:
180 |             raise HTTPException(status_code=500, detail="Failed to generate speech")
181 |     
182 |     except Exception as e:
183 |         raise HTTPException(status_code=500, detail=f"Error generating speech: {str(e)}")
184 | 
185 | @app.post("/process-audio/")
186 | async def api_process_audio(audio_file: UploadFile = File(...)):
187 |     """Process audio for speaker identification, transcription, and emotion detection"""
188 |     if not audio_file:
189 |         raise HTTPException(status_code=400, detail="No audio file provided")
190 |     
191 |     # Save uploaded file
192 |     temp_file = UPLOAD_DIR / f"process_{uuid.uuid4()}_{audio_file.filename}"
193 |     tts_output = None
194 |     
195 |     try:
196 |         with open(temp_file, "wb") as buffer:
197 |             shutil.copyfileobj(audio_file.file, buffer)
198 |         
199 |         # Process the audio
200 |         results = process_audio(str(temp_file))
201 |         tts_output = results.get('tts_output_path')
202 |         
203 |         # If TTS output exists, modify the result to include a URL
204 |         if tts_output and os.path.exists(tts_output):
205 |             results['tts_output_url'] = f"/download-tts/{os.path.basename(tts_output)}"
206 |         
207 |         return results
208 |     
209 |     except Exception as e:
210 |         raise HTTPException(status_code=500, detail=f"Error processing audio: {str(e)}")
211 |     
212 |     finally:
213 |         # Clean up the input file but keep the TTS output
214 |         if temp_file.exists():
215 |             temp_file.unlink()
216 | 
217 | @app.get("/download-tts/{filename}")
218 | async def download_tts(filename: str):
219 |     """Download a generated TTS file"""
220 |     file_path = Path(tempfile.gettempdir()) / "speaker_recognition" / filename
221 |     
222 |     if not file_path.exists():
223 |         raise HTTPException(status_code=404, detail="File not found")
224 |     
225 |     return FileResponse(
226 |         str(file_path), 
227 |         media_type="audio/wav", 
228 |         filename=filename
229 |     )
230 | 
231 | if __name__ == "__main__":
232 |     import uvicorn
233 |     uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True)


--------------------------------------------------------------------------------
/local_try.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | import numpy as np
  4 | import torchaudio
  5 | import librosa
  6 | import webrtcvad
  7 | from pydub import AudioSegment
  8 | from pathlib import Path
  9 | from speechbrain.pretrained import SpeakerRecognition
 10 | from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor
 11 | from parler_tts import ParlerTTSForConditionalGeneration
 12 | from transformers import AutoTokenizer
 13 | import soundfile as sf
 14 | 
 15 | # Set device to GPU if available
 16 | device = torch.device("cpu")
 17 | os.environ["HF_HUB_LOCAL_STRATEGY"] = "copy"
 18 | 
 19 | 
 20 | print("Loading models...")
 21 | spkrec = SpeakerRecognition.from_hparams(
 22 |     source="speechbrain/spkrec-ecapa-voxceleb",
 23 |     savedir="pretrained_ecapa_tdnn"
 24 | ).to(device)
 25 | 
 26 | whisper_processor = WhisperProcessor.from_pretrained(r"C:\Users\bharg\Downloads\SID&ED\ModelOpen\openw\whisper-small")
 27 | whisper_model = WhisperForConditionalGeneration.from_pretrained(r"C:\Users\bharg\Downloads\SID&ED\ModelOpen\openw\whisper-small")
 28 | 
 29 | emotion_classifier = pipeline("audio-classification", model=r"C:\Users\bharg\Downloads\SID&ED\ModelOpen\wavv2vec")
 30 | 
 31 | tts_model = ParlerTTSForConditionalGeneration.from_pretrained(r"C:\Users\bharg\Downloads\SID&ED\indic_parler").to(device)
 32 | tts_tokenizer = AutoTokenizer.from_pretrained(r"C:\Users\bharg\Downloads\SID&ED\indic_parler")
 33 | description_tokenizer = AutoTokenizer.from_pretrained(tts_model.config.text_encoder._name_or_path) # AutoTokenizer.from_pretrained(tts_model.config.text_encoder._name_or_path)
 34 | 
 35 | # Constants
 36 | ENROLLMENT_DIR = Path("enrolled_speakers")
 37 | ENROLLMENT_DIR.mkdir(exist_ok=True)
 38 | EMBEDDING_DIM = 192
 39 | 
 40 | def convert_to_wav(file_path):
 41 |     #Converts audio to WAV if it's not already in WAV format.
 42 |     if file_path.lower().endswith(".wav"):
 43 |         return file_path  
 44 | 
 45 |     try:
 46 |         new_file_path = file_path.rsplit(".", 1)[0] + ".wav"
 47 |         audio = AudioSegment.from_file(file_path)
 48 |         audio = audio.set_frame_rate(16000).set_channels(1)  # Ensure 16kHz mono
 49 |         audio.export(new_file_path, format="wav")
 50 |         return new_file_path
 51 |     except Exception as e:
 52 |         print(f"Error converting {file_path} to WAV: {str(e)}")
 53 |         return None
 54 | 
 55 | def apply_vad(audio_path, aggressiveness=3):
 56 |     """Removes silence & noise using WebRTC VAD."""
 57 |     try:
 58 |         signal, sr = librosa.load(audio_path, sr=16000, mono=True)
 59 |         signal = (signal * 32767).astype(np.int16)
 60 |         vad = webrtcvad.Vad(aggressiveness)
 61 |         frame_length = int(16000 * 0.03)
 62 |         signal = signal[:len(signal) - (len(signal) % frame_length)]
 63 |         frames = np.array_split(signal, len(signal) // frame_length)
 64 |         voiced_frames = [frame for frame in frames if vad.is_speech(frame.tobytes(), 16000)]
 65 |         
 66 |         if not voiced_frames:
 67 |             return None
 68 |             
 69 |         return np.concatenate(voiced_frames)
 70 |     except Exception as e:
 71 |         print(f"Error in VAD processing: {str(e)}")
 72 |         return None
 73 |     
 74 | def extract_speaker_embedding(audio_path, spkrec_model=spkrec):
 75 |     """Extracts speaker embedding using ECAPA-TDNN model."""
 76 |     try:
 77 |         signal, sr = torchaudio.load(audio_path)
 78 |         if sr != 16000:
 79 |             signal = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(signal)
 80 |         if signal.shape[0] > 1:
 81 |             signal = signal.mean(dim=0, keepdim=True)
 82 |         if signal.shape[1] < 16000:
 83 |             return None
 84 |             
 85 |         # Device handling
 86 |         spkrec_model = spkrec_model.to(device)
 87 |         signal = signal.to(device)
 88 |         
 89 |         with torch.no_grad():
 90 |             embedding = spkrec_model.encode_batch(signal).squeeze().cpu().numpy()
 91 | 
 92 |         # Ensure consistent embedding dimension
 93 |         embedding = embedding[:EMBEDDING_DIM] if len(embedding) > EMBEDDING_DIM else np.pad(
 94 |             embedding, (0, EMBEDDING_DIM - len(embedding)), 'constant'
 95 |         )
 96 | 
 97 |         return embedding
 98 |     except Exception as e:
 99 |         print(f"Error extracting embedding: {str(e)}")
100 |         return None
101 |         
102 | def enroll_speaker(speaker_name, audio_files):
103 |     """Extract and store speaker embeddings from given audio files."""
104 |     embeddings = []
105 |     
106 |     for audio_file in audio_files:
107 |         wav_file = convert_to_wav(audio_file)
108 |         if not wav_file:
109 |             continue
110 | 
111 |         vad_audio = apply_vad(wav_file)
112 |         if vad_audio is None:
113 |             continue
114 | 
115 |         embedding = extract_speaker_embedding(wav_file)
116 |         if embedding is not None:
117 |             embeddings.append(embedding)
118 | 
119 |     if not embeddings:
120 |         print(f"Failed to enroll {speaker_name}: No valid embeddings extracted")
121 |         return False
122 | 
123 |     speaker_embedding = np.mean(np.array(embeddings), axis=0)
124 |     np.save(ENROLLMENT_DIR / f"{speaker_name}.npy", speaker_embedding)
125 |     print(f"Enrolled {speaker_name} successfully!")
126 |     return True
127 | 
128 | def identify_speaker(audio_file):
129 |     """Identify the speaker of a given audio file using cosine similarity."""
130 |     wav_file = convert_to_wav(audio_file)
131 |     if not wav_file:
132 |         return "Failed to convert audio to WAV format"
133 | 
134 |     vad_audio = apply_vad(wav_file)
135 |     if vad_audio is None:
136 |         return "No valid speech detected in audio"
137 | 
138 |     test_embedding = extract_speaker_embedding(wav_file)
139 |     if test_embedding is None:
140 |         return "Failed to extract speaker embedding"
141 | 
142 |     enrolled_speakers = {f.stem: np.load(f) for f in ENROLLMENT_DIR.glob("*.npy")}
143 | 
144 |     if not enrolled_speakers:
145 |         return "No enrolled speakers found. Please enroll speakers first."
146 | 
147 |     scores = [(name, np.dot(test_embedding, emb) / (np.linalg.norm(test_embedding) * np.linalg.norm(emb)))
148 |               for name, emb in enrolled_speakers.items()]
149 |     scores.sort(key=lambda x: x[1], reverse=True)
150 | 
151 |     best_speaker, best_score = scores[0]
152 |     second_best_score = scores[1][1] if len(scores) > 1 else 0
153 | 
154 |     if best_score > 0.5 and (best_score - second_best_score) > 0.05:
155 |         return f"{best_speaker} (confidence: {best_score:.2f})"
156 |     return "Unknown Speaker"
157 | 
158 | def transcribe_audio(audio_file):
159 |     """Transcribe speech using OpenAI Whisper."""
160 |     try:
161 |         audio_file = convert_to_wav(audio_file)
162 |         audio, _ = librosa.load(audio_file, sr=16000)
163 |         input_features = whisper_processor(audio, sampling_rate=16000, return_tensors="pt").input_features
164 |         predicted_ids = whisper_model.generate(input_features)
165 |         return whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
166 |     except Exception as e:
167 |         return f"Error transcribing audio: {str(e)}"
168 | 
169 | def detect_emotion(audio_file):
170 |     """Detect emotion from the audio file."""
171 |     try:
172 |         audio_file = convert_to_wav(audio_file)
173 |         result = emotion_classifier(audio_file)
174 |         return result[0]["label"]
175 |     except Exception as e:
176 |         return f"Error detecting emotion: {str(e)}"
177 |     
178 | def tts(prompt):
179 |     description = "Divya's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."
180 |     description_input_ids = description_tokenizer(description, return_tensors="pt").to(device)
181 |     prompt_input_ids = tts_tokenizer(prompt, return_tensors="pt").to(device)
182 | 
183 |     generation = tts_model.generate(input_ids=description_input_ids.input_ids, attention_mask=description_input_ids.attention_mask, prompt_input_ids=prompt_input_ids.input_ids, prompt_attention_mask=prompt_input_ids.attention_mask)
184 |     audio_arr = generation.cpu().numpy().squeeze()
185 |     sf.write("indic_tts_out_hindib.wav", audio_arr, tts_model.config.sampling_rate)
186 |     return "indic_tts_out_hindib.wav created successfully"
187 | 
188 | def process_audio(audio_file):
189 |     """Process the audio file to identify speaker, transcribe, and detect emotion."""
190 |     speaker = identify_speaker(audio_file)
191 |     transcription = transcribe_audio(audio_file)
192 |     emotion = detect_emotion(audio_file)   
193 |     tts(prompt=transcription)
194 |     
195 |     print("\nResults:")
196 |     print(f"Speaker: {speaker}")
197 |     print(f"Transcription: {transcription}")
198 |     print(f"Emotion: {emotion}")
199 |     print(f"TTS: {tts(prompt=transcription)}")
200 | 
201 | # Main Execution
202 | if __name__ == "__main__":
203 |     audio_directory = "Audio_Formatted"
204 |     print("Enrolling speakers...")
205 |     for speaker in os.listdir(audio_directory):
206 |         enroll_speaker(speaker, [os.path.join(audio_directory, speaker, f) for f in os.listdir(os.path.join(audio_directory, speaker))])
207 | 
208 |     test_audio = r"C:\Users\bharg\Downloads\SID&ED\Audio_Formatted\Prudhvi\Prudhvi_kumar_Surprise.wav"
209 |     process_audio(test_audio)
210 | 


--------------------------------------------------------------------------------
/speaker_recognition.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | import numpy as np
  4 | import torchaudio
  5 | import librosa
  6 | import webrtcvad
  7 | from pydub import AudioSegment
  8 | from pathlib import Path
  9 | import soundfile as sf
 10 | import tempfile
 11 | 
 12 | # Import model loading functions
 13 | from model_loader import (
 14 |     get_speaker_recognition_model,
 15 |     get_whisper_model_and_processor,
 16 |     get_emotion_classifier,
 17 |     get_tts_model_and_tokenizers
 18 | )
 19 | 
 20 | # Import database functions
 21 | from db_setup import store_speaker_embedding, get_all_speaker_embeddings, log_recognition_result
 22 | 
 23 | # Set device to GPU if available
 24 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 25 | os.environ["HF_HUB_LOCAL_STRATEGY"] = "copy"
 26 | 
 27 | # Load models
 28 | print("Loading models...")
 29 | spkrec = get_speaker_recognition_model()
 30 | whisper_model, whisper_processor = get_whisper_model_and_processor()
 31 | emotion_classifier = get_emotion_classifier()
 32 | tts_model, tts_tokenizer, description_tokenizer = get_tts_model_and_tokenizers()
 33 | 
 34 | # Constants
 35 | EMBEDDING_DIM = 192
 36 | TEMP_DIR = Path(tempfile.gettempdir()) / "speaker_recognition"
 37 | TEMP_DIR.mkdir(exist_ok=True)
 38 | 
 39 | def convert_to_wav(file_path):
 40 |     """Converts audio to WAV if it's not already in WAV format."""
 41 |     if file_path.lower().endswith(".wav"):
 42 |         return file_path  
 43 | 
 44 |     try:
 45 |         file_name = Path(file_path).name
 46 |         new_file_path = str(TEMP_DIR / f"{file_name.rsplit('.', 1)[0]}.wav")
 47 |         audio = AudioSegment.from_file(file_path)
 48 |         audio = audio.set_frame_rate(16000).set_channels(1)  # Ensure 16kHz mono
 49 |         audio.export(new_file_path, format="wav")
 50 |         return new_file_path
 51 |     except Exception as e:
 52 |         print(f"Error converting {file_path} to WAV: {str(e)}")
 53 |         return None
 54 | 
 55 | def apply_vad(audio_path, aggressiveness=3):
 56 |     """Removes silence & noise using WebRTC VAD."""
 57 |     try:
 58 |         signal, sr = librosa.load(audio_path, sr=16000, mono=True)
 59 |         signal = (signal * 32767).astype(np.int16)
 60 |         vad = webrtcvad.Vad(aggressiveness)
 61 |         frame_length = int(16000 * 0.03)
 62 |         signal = signal[:len(signal) - (len(signal) % frame_length)]
 63 |         frames = np.array_split(signal, len(signal) // frame_length)
 64 |         voiced_frames = [frame for frame in frames if vad.is_speech(frame.tobytes(), 16000)]
 65 |         
 66 |         if not voiced_frames:
 67 |             return None
 68 |             
 69 |         return np.concatenate(voiced_frames)
 70 |     except Exception as e:
 71 |         print(f"Error in VAD processing: {str(e)}")
 72 |         return None
 73 |     
 74 | def extract_speaker_embedding(audio_path, spkrec_model=spkrec):
 75 |     """Extracts speaker embedding using ECAPA-TDNN model."""
 76 |     try:
 77 |         signal, sr = torchaudio.load(audio_path)
 78 |         if sr != 16000:
 79 |             signal = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(signal)
 80 |         if signal.shape[0] > 1:
 81 |             signal = signal.mean(dim=0, keepdim=True)
 82 |         if signal.shape[1] < 16000:
 83 |             return None
 84 |             
 85 |         # Device handling
 86 |         spkrec_model = spkrec_model.to(device)
 87 |         signal = signal.to(device)
 88 |         
 89 |         with torch.no_grad():
 90 |             embedding = spkrec_model.encode_batch(signal).squeeze().cpu().numpy()
 91 | 
 92 |         # Ensure consistent embedding dimension
 93 |         embedding = embedding[:EMBEDDING_DIM] if len(embedding) > EMBEDDING_DIM else np.pad(
 94 |             embedding, (0, EMBEDDING_DIM - len(embedding)), 'constant'
 95 |         )
 96 | 
 97 |         return embedding
 98 |     except Exception as e:
 99 |         print(f"Error extracting embedding: {str(e)}")
100 |         return None
101 |         
102 | def enroll_speaker(speaker_name, audio_files):
103 |     """Extract and store speaker embeddings from given audio files."""
104 |     embeddings = []
105 |     
106 |     for audio_file in audio_files:
107 |         wav_file = convert_to_wav(audio_file)
108 |         if not wav_file:
109 |             continue
110 | 
111 |         vad_audio = apply_vad(wav_file)
112 |         if vad_audio is None:
113 |             continue
114 | 
115 |         embedding = extract_speaker_embedding(wav_file)
116 |         if embedding is not None:
117 |             embeddings.append(embedding)
118 | 
119 |     if not embeddings:
120 |         return False, "No valid embeddings extracted"
121 | 
122 |     speaker_embedding = np.mean(np.array(embeddings), axis=0)
123 |     
124 |     # Store in PostgreSQL database
125 |     success = store_speaker_embedding(speaker_name, speaker_embedding)
126 |     if success:
127 |         return True, f"Enrolled {speaker_name} successfully!"
128 |     return False, f"Failed to enroll {speaker_name} in database"
129 | 
130 | def identify_speaker(audio_file):
131 |     """Identify the speaker of a given audio file using cosine similarity."""
132 |     wav_file = convert_to_wav(audio_file)
133 |     if not wav_file:
134 |         return "Unknown Speaker", 0.0, "Failed to convert audio to WAV format"
135 | 
136 |     vad_audio = apply_vad(wav_file)
137 |     if vad_audio is None:
138 |         return "Unknown Speaker", 0.0, "No valid speech detected in audio"
139 | 
140 |     test_embedding = extract_speaker_embedding(wav_file)
141 |     if test_embedding is None:
142 |         return "Unknown Speaker", 0.0, "Failed to extract speaker embedding"
143 | 
144 |     # Get all enrolled speakers from database
145 |     enrolled_speakers = get_all_speaker_embeddings()
146 | 
147 |     if not enrolled_speakers:
148 |         return "Unknown Speaker", 0.0, "No enrolled speakers found. Please enroll speakers first."
149 | 
150 |     scores = [(name, np.dot(test_embedding, emb) / (np.linalg.norm(test_embedding) * np.linalg.norm(emb)))
151 |               for name, emb in enrolled_speakers.items()]
152 |     scores.sort(key=lambda x: x[1], reverse=True)
153 | 
154 |     best_speaker, best_score = scores[0]
155 |     second_best_score = scores[1][1] if len(scores) > 1 else 0
156 | 
157 |     if best_score > 0.5 and (best_score - second_best_score) > 0.05:
158 |         return best_speaker, best_score, "Speaker identified successfully"
159 |     return "Unknown Speaker", best_score, "Speaker confidence too low"
160 | 
161 | def transcribe_audio(audio_file):
162 |     """Transcribe speech using Whisper."""
163 |     try:
164 |         audio_file = convert_to_wav(audio_file)
165 |         audio, _ = librosa.load(audio_file, sr=16000)
166 |         input_features = whisper_processor(audio, sampling_rate=16000, return_tensors="pt").input_features
167 |         
168 |         # Move to device
169 |         input_features = input_features.to(device)
170 |         whisper_model.to(device)
171 |         
172 |         predicted_ids = whisper_model.generate(input_features)
173 |         return whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
174 |     except Exception as e:
175 |         return f"Error transcribing audio: {str(e)}"
176 | 
177 | def detect_emotion(audio_file):
178 |     """Detect emotion from the audio file."""
179 |     try:
180 |         audio_file = convert_to_wav(audio_file)
181 |         result = emotion_classifier(audio_file)
182 |         return result[0]["label"]
183 |     except Exception as e:
184 |         return f"Error detecting emotion: {str(e)}"
185 |     
186 | def tts(prompt, output_path=None):
187 |     """Text to speech conversion."""
188 |     try:
189 |         description = "Divya's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."
190 |         description_input_ids = description_tokenizer(description, return_tensors="pt").to(device)
191 |         prompt_input_ids = tts_tokenizer(prompt, return_tensors="pt").to(device)
192 | 
193 |         generation = tts_model.generate(
194 |             input_ids=description_input_ids.input_ids, 
195 |             attention_mask=description_input_ids.attention_mask, 
196 |             prompt_input_ids=prompt_input_ids.input_ids, 
197 |             prompt_attention_mask=prompt_input_ids.attention_mask
198 |         )
199 |         
200 |         audio_arr = generation.cpu().numpy().squeeze()
201 |         
202 |         if output_path is None:
203 |             output_path = str(TEMP_DIR / "indic_tts_output.wav")
204 |             
205 |         sf.write(output_path, audio_arr, tts_model.config.sampling_rate)
206 |         return output_path
207 |     except Exception as e:
208 |         print(f"Error in TTS: {str(e)}")
209 |         return None
210 | 
211 | def process_audio(audio_file):
212 |     """Process the audio file to identify speaker, transcribe, and detect emotion."""
213 |     speaker, confidence, message = identify_speaker(audio_file)
214 |     transcription = transcribe_audio(audio_file)
215 |     emotion = detect_emotion(audio_file)   
216 |     tts_output_path = tts(prompt=transcription)
217 |     
218 |     # Log results to database
219 |     log_recognition_result(audio_file, speaker, confidence, transcription, emotion)
220 |     
221 |     results = {
222 |         "speaker": speaker,
223 |         "confidence": round(float(confidence), 2) if isinstance(confidence, (float, int)) else 0.0,
224 |         "transcription": transcription,
225 |         "emotion": emotion,
226 |         "tts_output_path": tts_output_path,
227 |         "message": message
228 |     }
229 |     
230 |     return results


--------------------------------------------------------------------------------