├── audio_processing
│   ├── __init__.py
│   ├── whisper_api.py
│   └── recording.py
├── llm_recognition
│   ├── __init__.py
│   └── recognizer.py
├── requirements.txt
├── .gitignore
├── main.py
└── README.md

--------------------------------------------------------------------------------
/audio_processing/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/llm_recognition/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
pytest
openai<1.0  # the code uses the pre-1.0 SDK interface (openai.Audio, openai.ChatCompletion)
openai-whisper
sounddevice
numpy
wavio
pydub

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Python
__pycache__/
*.pyc
*.pyo
*.egg-info/
*.egg

# Development environment
*.log
*.sql
*.sqlite

# Virtual environments
venv/
*.venv
venv.bak/
*.venv.bak

# Audio files
*.mp3
*.wav

--------------------------------------------------------------------------------
/audio_processing/whisper_api.py:
--------------------------------------------------------------------------------
import os

import openai


def process_audio_with_whisper(audio_path):
    """
    Transcribes the given audio file using the OpenAI Whisper API.

    Args:
    - audio_path (str): Path to the audio file to transcribe.

    Returns:
    - str: The transcription text.
    """
    model_id = 'whisper-1'

    # Use a context manager so the file handle is closed after the request.
    with open(audio_path, 'rb') as audio_file:
        # Legacy (pre-1.0) OpenAI SDK call, matching the pinned dependency.
        response = openai.Audio.transcribe(
            api_key=os.getenv('OPENAI_API_KEY'),
            model=model_id,
            file=audio_file
        )

    return response['text']

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from audio_processing import recording, whisper_api
from llm_recognition import recognizer


def main():
    print("Starting...")

    # Capture audio from the user's microphone
    print("Capturing audio...")
    audio_path = recording.capture_audio()
    print(f"Audio captured to: {audio_path}")

    # Transcribe the recording with the Whisper API
    transcript = whisper_api.process_audio_with_whisper(audio_path)
    print(f"Transcript: {transcript}")

    # Recognize the song from the transcript using an LLM
    song_name = recognizer.recognize_song(transcript)

    if song_name:
        print(f"Recognized song: {song_name}")
    else:
        print("Song not recognized.")


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/audio_processing/recording.py:
--------------------------------------------------------------------------------
import sounddevice as sd
import numpy as np
from pydub import AudioSegment


def capture_audio(duration=10, filename="recorded_audio.mp3", rate=44100):
    """
    Captures audio from the user's microphone for the specified duration and
    saves it as an MP3 file. MP3 export requires FFmpeg (see the README).

    Args:
    - duration (int): Duration in seconds for which to capture audio.
    - filename (str): Name of the file where the audio will be saved.
    - rate (int): Sampling rate in Hz.

    Returns:
    - str: Path to the saved audio file.
    """
    # Record stereo 16-bit audio and block until the recording finishes
    audio_data = sd.rec(int(duration * rate), samplerate=rate, channels=2, dtype=np.int16)
    sd.wait()

    # Wrap the raw interleaved samples in a pydub audio segment
    audio_segment = AudioSegment(
        audio_data.tobytes(),
        frame_rate=rate,
        sample_width=audio_data.dtype.itemsize,
        channels=2
    )

    # Save the audio segment to MP3 (pydub delegates the encoding to FFmpeg)
    audio_segment.export(filename, format="mp3")

    return filename

--------------------------------------------------------------------------------
/llm_recognition/recognizer.py:
--------------------------------------------------------------------------------
import openai


def recognize_song(transcript):
    """
    Recognizes the song from the transcribed lyrics using a large language model (LLM).

    Args:
    - transcript (str): Transcription of the recorded audio.

    Returns:
    - str: Recognized song title (and artist), or None if not recognized.
    """
    system_prompt = "You are an expert in songs with perfect knowledge of titles, artists, release dates, and lyrics."

    try:
        # Legacy (pre-1.0) OpenAI SDK call, matching the pinned dependency.
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"What song is this text from: \"{transcript}\". Output only the song title and artist name (when applicable) and nothing else."},
            ]
        )

        if 'choices' in response and response['choices'] and 'message' in response['choices'][0]:
            return response['choices'][0]['message']['content']
        return None
    except Exception as e:
        print(f"Error: {e}")
        return None

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Song Solver

A Python application that lets users sing into their laptop's microphone, transcribes the recording with the Whisper API, and then uses a Large Language Model (LLM) to recognize the song.

## Installation

1. Clone this repository:
   ```bash
   git clone https://github.com/przadka/song-solver.git
   ```

2. Navigate to the project directory:
   ```bash
   cd song-solver
   ```

3. Create and activate a virtual environment (optional, but recommended):
   ```bash
   python3 -m venv venv
   source venv/bin/activate  # On Windows use `venv\Scripts\activate`
   ```

4. Install the required dependencies:
   ```bash
   pip install -r requirements.txt
   ```

## Prerequisites

### Environment Variables

The program expects an environment variable named `OPENAI_API_KEY` to be set. This key is used to authenticate with the OpenAI API for transcription and song recognition.

To set the environment variable:

#### On Linux/macOS:

```bash
export OPENAI_API_KEY=your_api_key_here
```

#### On Windows (Command Prompt):

```cmd
set OPENAI_API_KEY=your_api_key_here
```

#### On Windows (PowerShell):

```powershell
$env:OPENAI_API_KEY = "your_api_key_here"
```

Replace `your_api_key_here` with your actual OpenAI API key.

After setting the environment variable, you can run the program as described in the [Usage](#usage) section.
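### FFmpeg

`pydub` delegates MP3 encoding to FFmpeg, so FFmpeg must be installed and available on your `PATH` for the recording step to work. One way to install it (assuming the standard package names of the common package managers):

```bash
# Debian/Ubuntu
sudo apt-get install ffmpeg

# macOS (Homebrew)
brew install ffmpeg
```

On Windows, download a build from [ffmpeg.org](https://ffmpeg.org/download.html) and add it to your `PATH`.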
## Usage

To run the program:

```bash
python main.py
```

Follow the on-screen prompts to record your song. The application will then attempt to recognize the song from your recording.

## License

Open-source software licensed under the MIT License.

--------------------------------------------------------------------------------