├── audio_processing
│   ├── __init__.py
│   ├── whisper_api.py
│   └── recording.py
├── llm_recognition
│   ├── __init__.py
│   └── recognizer.py
├── requirements.txt
├── .gitignore
├── main.py
└── README.md

--------------------------------------------------------------------------------
/audio_processing/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/llm_recognition/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
pytest
openai<1.0  # the code uses the pre-1.0 SDK interface (openai.Audio, openai.ChatCompletion)
openai-whisper
sounddevice
numpy
wavio
pydub

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Python
__pycache__/
*.pyc
*.pyo
*.egg-info/
*.egg

# Development environment
*.log
*.sql
*.sqlite

# Virtual environments
venv/
*.venv
venv.bak/
*.venv.bak

# Audio files
*.mp3
*.wav

--------------------------------------------------------------------------------
/audio_processing/whisper_api.py:
--------------------------------------------------------------------------------
import os

import openai


def process_audio_with_whisper(audio_path):
    """
    Transcribes the given audio file using the OpenAI Whisper API.

    Args:
    - audio_path (str): Path to the audio file to transcribe.

    Returns:
    - str: The transcription text.
    """
    model_id = 'whisper-1'

    # Use a context manager so the file handle is closed after the request.
    with open(audio_path, 'rb') as audio_file:
        # Legacy (pre-1.0) OpenAI SDK call, matching the pinned dependency.
        response = openai.Audio.transcribe(
            api_key=os.getenv('OPENAI_API_KEY'),
            model=model_id,
            file=audio_file
        )

    return response['text']

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from audio_processing import recording, whisper_api
from llm_recognition import recognizer


def main():
    print("Starting...")

    # Capture audio from the user's microphone
    print("Capturing audio...")
    audio_path = recording.capture_audio()
    print(f"Audio captured to: {audio_path}")

    # Transcribe the recording with the Whisper API
    transcript = whisper_api.process_audio_with_whisper(audio_path)
    print(f"Transcript: {transcript}")

    # Recognize the song from the transcript using an LLM
    song_name = recognizer.recognize_song(transcript)

    if song_name:
        print(f"Recognized song: {song_name}")
    else:
        print("Song not recognized.")


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/audio_processing/recording.py:
--------------------------------------------------------------------------------
import sounddevice as sd
import numpy as np
from pydub import AudioSegment


def capture_audio(duration=10, filename="recorded_audio.mp3", rate=44100):
    """
    Captures audio from the user's microphone for the specified duration and
    saves it as an MP3 file. MP3 export requires FFmpeg (see the README).

    Args:
    - duration (int): Duration in seconds for which to capture audio.
    - filename (str): Name of the file where the audio will be saved.
    - rate (int): Sampling rate in Hz.

    Returns:
    - str: Path to the saved audio file.
    """
    # Record stereo 16-bit audio and block until the recording finishes
    audio_data = sd.rec(int(duration * rate), samplerate=rate, channels=2, dtype=np.int16)
    sd.wait()

    # Wrap the raw interleaved samples in a pydub audio segment
    audio_segment = AudioSegment(
        audio_data.tobytes(),
        frame_rate=rate,
        sample_width=audio_data.dtype.itemsize,
        channels=2
    )

    # Save the audio segment to MP3 (pydub delegates the encoding to FFmpeg)
    audio_segment.export(filename, format="mp3")

    return filename

--------------------------------------------------------------------------------
/llm_recognition/recognizer.py:
--------------------------------------------------------------------------------
import openai


def recognize_song(transcript):
    """
    Recognizes the song from the transcribed lyrics using a large language model (LLM).

    Args:
    - transcript (str): Transcription of the recorded audio.

    Returns:
    - str: Recognized song title (and artist), or None if not recognized.
    """
    system_prompt = "You are an expert in songs with perfect knowledge of titles, artists, release dates, and lyrics."

    try:
        # Legacy (pre-1.0) OpenAI SDK call, matching the pinned dependency.
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"What song is this text from: \"{transcript}\". Output only the song title and artist name (when applicable) and nothing else."},
            ]
        )

        if 'choices' in response and response['choices'] and 'message' in response['choices'][0]:
            return response['choices'][0]['message']['content']
        return None
    except Exception as e:
        print(f"Error: {e}")
        return None

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Song Solver

A Python application that lets users sing into their laptop's microphone, transcribes the recording with the Whisper API, and then uses a Large Language Model (LLM) to recognize the song.

## Installation

1. Clone this repository:
   ```bash
   git clone https://github.com/przadka/song-solver.git
   ```

2. Navigate to the project directory:
   ```bash
   cd song-solver
   ```

3. Create and activate a virtual environment (optional, but recommended):
   ```bash
   python3 -m venv venv
   source venv/bin/activate  # On Windows use `venv\Scripts\activate`
   ```

4. Install the required dependencies:
   ```bash
   pip install -r requirements.txt
   ```

## Prerequisites

### Environment Variables

The program expects an environment variable named `OPENAI_API_KEY` to be set. This key is used to authenticate with the OpenAI API for transcription and song recognition.

To set the environment variable:

#### On Linux/macOS:

```bash
export OPENAI_API_KEY=your_api_key_here
```

#### On Windows (Command Prompt):

```cmd
set OPENAI_API_KEY=your_api_key_here
```

#### On Windows (PowerShell):

```powershell
$env:OPENAI_API_KEY = "your_api_key_here"
```

Replace `your_api_key_here` with your actual OpenAI API key.

After setting the environment variable, you can run the program as described in the [Usage](#usage) section.
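### FFmpeg

`pydub` delegates MP3 encoding to FFmpeg, so FFmpeg must be installed and available on your `PATH` for the recording step to work. One way to install it (assuming the standard package names of the common package managers):

```bash
# Debian/Ubuntu
sudo apt-get install ffmpeg

# macOS (Homebrew)
brew install ffmpeg
```

On Windows, download a build from [ffmpeg.org](https://ffmpeg.org/download.html) and add it to your `PATH`.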
## Usage

To run the program:

```bash
python main.py
```

Follow the on-screen prompts to record your song. The application will then attempt to recognize the song from your recording.

## License

Open-source software licensed under the MIT License.

--------------------------------------------------------------------------------