├── notebooks
│   ├── img
│   │   ├── models.png
│   │   ├── today.png
│   │   ├── comp-nec.png
│   │   ├── explore.png
│   │   ├── unix-phil.png
│   │   ├── ai-evolution.png
│   │   ├── composability.png
│   │   ├── importance-comp.png
│   │   ├── karpathy-austen.png
│   │   ├── multimodal_app_1.png
│   │   ├── multimodal_app_2.png
│   │   └── apple-counterpoint.png
│   ├── vid
│   │   ├── karpathy-austen.mp4
│   │   └── Dream Advertising Marketing Campaign.mp4
│   └── models-everywhere.ipynb
├── multimodal-app
│   ├── .streamlit
│   │   └── secrets.toml
│   ├── README.md
│   ├── pages
│   │   ├── 00_Text_Generation_Evals.py
│   │   ├── 01_Audio_Generation_Evals.py
│   │   ├── 02_Image_Generation_Evals.py
│   │   ├── 03_Video_Generation_Evals.py
│   │   └── 04_Transcription_Evals.py
│   ├── constants.py
│   ├── utils.py
│   ├── main.py
│   ├── ui.py
│   └── api.py
├── pyproject.toml
├── LICENSE
├── .gitignore
└── README.md

/notebooks/img/models.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/models.png
--------------------------------------------------------------------------------
/notebooks/img/today.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/today.png
--------------------------------------------------------------------------------
/notebooks/img/comp-nec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/comp-nec.png
--------------------------------------------------------------------------------
/notebooks/img/explore.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/explore.png
--------------------------------------------------------------------------------
/notebooks/img/unix-phil.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/unix-phil.png
--------------------------------------------------------------------------------
/notebooks/img/ai-evolution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/ai-evolution.png
--------------------------------------------------------------------------------
/notebooks/img/composability.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/composability.png
--------------------------------------------------------------------------------
/multimodal-app/.streamlit/secrets.toml:
--------------------------------------------------------------------------------
1 | GROQ_API_KEY = 'XXX'
2 | OPENAI_API_KEY = 'XXX'
3 | REPLICATE_API_TOKEN = 'XXX'
4 | HF_API_KEY = 'XXX'
--------------------------------------------------------------------------------
/notebooks/img/importance-comp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/importance-comp.png
--------------------------------------------------------------------------------
/notebooks/img/karpathy-austen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/karpathy-austen.png
--------------------------------------------------------------------------------
/notebooks/img/multimodal_app_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/multimodal_app_1.png
--------------------------------------------------------------------------------
/notebooks/img/multimodal_app_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/multimodal_app_2.png
--------------------------------------------------------------------------------
/notebooks/vid/karpathy-austen.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/vid/karpathy-austen.mp4
--------------------------------------------------------------------------------
/notebooks/img/apple-counterpoint.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/apple-counterpoint.png
--------------------------------------------------------------------------------
/multimodal-app/README.md:
--------------------------------------------------------------------------------
1 | Before installing Python dependencies, you need to install the `portaudio` library. On macOS:
2 | 
3 | ```bash
4 | brew install portaudio
5 | ```
6 | 
7 | On Debian/Ubuntu (including GitHub Codespaces), the equivalent package is `portaudio19-dev`.
--------------------------------------------------------------------------------
/multimodal-app/pages/00_Text_Generation_Evals.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | 
3 | st.markdown("# Text Generation Evals")
4 | st.dataframe(st.session_state["text_gen_evals_df"])
5 | 
--------------------------------------------------------------------------------
/multimodal-app/pages/01_Audio_Generation_Evals.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | 
3 | st.markdown("# Audio Generation Evals")
4 | st.dataframe(st.session_state["audio_gen_evals_df"])
5 | 
--------------------------------------------------------------------------------
/multimodal-app/pages/02_Image_Generation_Evals.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | 
3 | st.markdown("# Image Generation Evals")
4 | st.dataframe(st.session_state["image_gen_evals_df"])
5 | 
--------------------------------------------------------------------------------
/multimodal-app/pages/03_Video_Generation_Evals.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | 
3 | st.markdown("# Video Generation Evals")
4 | st.dataframe(st.session_state["video_gen_evals_df"])
5 | 
--------------------------------------------------------------------------------
/multimodal-app/pages/04_Transcription_Evals.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | 
3 | st.markdown("# Transcription Evals")
4 | st.dataframe(st.session_state["transcription_evals_df"])
5 | 
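The five evals pages above all follow the same pattern: a header plus the matching session-state dataframe populated by `api.py`. Below is a sketch of an optional extension for one of them; the CSV download button is a hypothetical addition, not part of the current pages (`st.download_button` is a standard Streamlit API):

```python
import streamlit as st

st.markdown("# Transcription Evals")
df = st.session_state["transcription_evals_df"]
st.dataframe(df)

# Hypothetical addition: export the collected evals for offline analysis.
if not df.empty:
    st.download_button(
        label="Download as CSV",
        data=df.to_csv(index=False).encode("utf-8"),
        file_name="transcription_evals.csv",
        mime="text/csv",
    )
```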
-------------------------------------------------------------------------------- /notebooks/vid/Dream Advertising Marketing Campaign.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/vid/Dream Advertising Marketing Campaign.mp4 -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "experiments-in-ai" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["hugobowne "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.11" 10 | jupyter = "^1.0.0" 11 | pandas = "^2.2.2" 12 | matplotlib = "^3.9.0" 13 | seaborn = "^0.13.2" 14 | transformers = "^4.41.2" 15 | torch = "^2.3.1" 16 | scikit-learn = "^1.5.0" 17 | datasets = "^2.19.2" 18 | diffusers = "^0.28.2" 19 | openai = "^1.33.0" 20 | pip = "^24.0" 21 | ai21 = "^2.4.2" 22 | replicate = "^0.26.0" 23 | streamlit = "^1.35.0" 24 | huggingface-hub = "^0.23.3" 25 | audio-recorder-streamlit = "^0.0.10" 26 | scipy = "^1.14.0" 27 | groq = "^0.9.0" 28 | 29 | 30 | [build-system] 31 | requires = ["poetry-core"] 32 | build-backend = "poetry.core.masonry.api" 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Hugo Bowne-Anderson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /multimodal-app/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | DEFAULT_TEXT_GEN_SYSTEM_PROMPT = ( 4 | "You are a master storyteller, songwriter, and creator in a world where words shape reality. Your purpose is to generate responses that are imaginative, vivid, and captivating. Whether the user provides a simple prompt, a detailed scenario, or a fantastical idea, you will craft a response that brings their song to life in an entertaining and engaging way. Be creative, be descriptive, and always aim to surprise and delight with your short and rhythmic responses. Write a four line poem based on the user prompt, use adlibs, and make it fun and full of ♪ symbols to help downstream models know you are singing!" 
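    # The ♪ symbols double as a hint to the downstream text-to-audio models
    # that these lines are meant to be sung rather than spoken.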
5 | ) 6 | DEFAULT_IMAGE_GEN_NEGATIVE_PROMPT = "Sad, dark, and gloomy image." 7 | 8 | # Set up API URLs and headers 9 | # HF_BARK_ENDPOINT = "https://api-inference.huggingface.co/models/suno/bark" 10 | # bark_api_headers = {"Authorization": f"Bearer {os.environ['HF_API_KEY']}"} 11 | 12 | REPLICATE_IMAGE_MODEL_ID_LS = [ 13 | "black-forest-labs/flux-dev", 14 | "stability-ai/stable-diffusion-3", 15 | ] 16 | REPLICATE_VIDEO_MODEL_ID_LS = [ 17 | "lucataco/hotshot-xl:78b3a6257e16e4b241245d65c8b2b81ea2e1ff7ed4c55306b511509ddbfd327a", 18 | "deforum/deforum_stable_diffusion:e22e77495f2fb83c34d5fae2ad8ab63c0a87b6b573b6208e1535b23b89ea66d6", 19 | ] 20 | 21 | # Sinks 22 | AUDIO_DATA_SINK = os.path.join(os.path.dirname(__file__), "audio") -------------------------------------------------------------------------------- /multimodal-app/utils.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | from constants import DEFAULT_IMAGE_GEN_NEGATIVE_PROMPT 4 | 5 | def init_session_state(): 6 | if "text" not in st.session_state: 7 | st.session_state["text"] = None 8 | if "text_gen_stream_resp" not in st.session_state: 9 | st.session_state["text_gen_stream_resp"] = None 10 | if "text_gen_evals_df" not in st.session_state: 11 | st.session_state["text_gen_evals_df"] = pd.DataFrame() 12 | if "user_audio_bytes" not in st.session_state: 13 | st.session_state["user_audio_bytes"] = None 14 | if "llm_audio_bytes" not in st.session_state: 15 | st.session_state["llm_audio_bytes"] = None 16 | if "audio_gen_evals_df" not in st.session_state: 17 | st.session_state["audio_gen_evals_df"] = pd.DataFrame() 18 | if "user_image_url" not in st.session_state: 19 | st.session_state["user_image_url"] = None 20 | if "llm_image_url" not in st.session_state: 21 | st.session_state["llm_image_url"] = None 22 | if "image_gen_evals_df" not in st.session_state: 23 | st.session_state["image_gen_evals_df"] = pd.DataFrame() 24 | if 'negative_prompt' not in st.session_state: 25 | st.session_state['negative_prompt'] = DEFAULT_IMAGE_GEN_NEGATIVE_PROMPT 26 | if "user_video_url" not in st.session_state: 27 | st.session_state["user_video_url"] = None 28 | if "llm_video_url" not in st.session_state: 29 | st.session_state["llm_video_url"] = None 30 | if "video_gen_evals_df" not in st.session_state: 31 | st.session_state["video_gen_evals_df"] = pd.DataFrame() 32 | if "transcription_evals_df" not in st.session_state: 33 | st.session_state["transcription_evals_df"] = pd.DataFrame() 34 | if "tasks" not in st.session_state: 35 | st.session_state['tasks'] = [] 36 | st.session_state["running_text_job"] = False 37 | st.session_state["running_audio_job"] = False 38 | st.session_state["running_image_job"] = False 39 | st.session_state["running_video_job"] = False 40 | 41 | def show_quick_reset_option(col_handler): 42 | if col_handler.button("Reset session data"): 43 | st.session_state["text"] = None 44 | st.session_state["text_gen_stream_resp"] = None 45 | st.session_state["user_audio_bytes"] = None 46 | st.session_state["llm_audio_bytes"] = None 47 | st.session_state["user_image_url"] = None 48 | st.session_state["llm_image_url"] = None 49 | st.session_state['negative_prompt'] = DEFAULT_IMAGE_GEN_NEGATIVE_PROMPT 50 | st.session_state["user_video_url"] = None 51 | st.session_state["llm_video_url"] = None 52 | st.session_state["running_text_job"] = False 53 | st.session_state["running_audio_job"] = False 54 | st.session_state["running_image_job"] = False 55 | 
st.session_state["running_video_job"] = False 56 | st.rerun() 57 | 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | -------------------------------------------------------------------------------- /multimodal-app/main.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import os 3 | import asyncio 4 | import streamlit as st 5 | 6 | from constants import * 7 | from ui import ( 8 | set_users_initial_prompt, 9 | display_user_info, 10 | display_llm_info, 11 | display_audio_section, 12 | display_image_section, 13 | display_video_section, 14 | ) 15 | from api import ( 16 | generate_text, 17 | text_to_audio, 18 | text_to_image, 19 | text_to_video 20 | ) 21 | from utils import init_session_state, show_quick_reset_option 22 | 23 | st.set_page_config(page_title="Audio Chat", page_icon="🎤", layout="wide") 24 | 25 | async def background_tasks(placeholder): 26 | while True: 27 | _n_complete = sum([t.done() for t in st.session_state.tasks]) 28 | with placeholder: 29 | st.write(f'Completed `{_n_complete}` of `{len(st.session_state.tasks)}` generations. 
🚨 Starting other tasks will erase this queue 🚨') 30 | if _n_complete != len(st.session_state.tasks): 31 | await asyncio.sleep(1.2) 32 | else: 33 | st.rerun() 34 | break 35 | 36 | async def main(): 37 | 38 | if 'text' not in st.session_state: 39 | init_session_state() 40 | 41 | left_column, right_column = st.columns(2) 42 | left_column.title("User input") 43 | right_column.title("AI generations") 44 | placeholder = right_column.empty() 45 | 46 | if st.session_state["text"] is None: 47 | left_column.subheader("Text generation") 48 | openai_model_ls = ["openai:gpt-3.5-turbo", "openai:gpt-4o-mini", "openai:gpt-4o"] 49 | groq_model_ls = ["groq:llama-3.1-70b-versatile", "groq:mixtral-8x7b-32768", "groq:gemma2-9b-it"] 50 | model_choice = left_column.selectbox("Initial text reply model", groq_model_ls + openai_model_ls) 51 | model_choice_ls = model_choice.split(':') 52 | st.session_state.init_model_provider, st.session_state.init_model = model_choice_ls[0], model_choice_ls[1] 53 | st.session_state.text_gen_sys_prompt = left_column.text_area("System prompt", DEFAULT_TEXT_GEN_SYSTEM_PROMPT) 54 | set_users_initial_prompt(left_column) 55 | else: 56 | show_quick_reset_option(left_column) 57 | left_column.subheader("Pick your poison") 58 | st.session_state.image_model = left_column.selectbox("Image model", REPLICATE_IMAGE_MODEL_ID_LS) 59 | st.session_state.video_model = left_column.selectbox("Video model", REPLICATE_VIDEO_MODEL_ID_LS) 60 | 61 | _generate_text = partial(generate_text, st.session_state["text"], st.session_state.init_model) 62 | 63 | if st.session_state["text"]: 64 | display_user_info(right_column) 65 | display_llm_info(right_column, _generate_text) 66 | if left_column.button('Run all tasks concurrently'): 67 | st.session_state.tasks = [ 68 | asyncio.create_task(text_to_image(st.session_state["text"], st.session_state['negative_prompt'], src="user")), 69 | asyncio.create_task(text_to_image(st.session_state["text_gen_stream_resp"],st.session_state['negative_prompt'], src="llm")), 70 | asyncio.create_task(text_to_video(st.session_state["text"], src="user")), 71 | asyncio.create_task(text_to_video(st.session_state["text_gen_stream_resp"], src="llm")), 72 | asyncio.create_task(text_to_audio(st.session_state["text"], "user")), 73 | asyncio.create_task(text_to_audio(st.session_state["text_gen_stream_resp"], "llm")), 74 | ] 75 | await background_tasks(placeholder) 76 | st.rerun() 77 | else: 78 | audio_task = display_audio_section(left_column) 79 | image_task = display_image_section(left_column) 80 | video_task = display_video_section(left_column) 81 | st.session_state.tasks = list(filter(lambda x: x is not None, [audio_task, image_task, video_task])) 82 | if st.session_state.tasks: 83 | await background_tasks(placeholder) 84 | st.rerun() 85 | 86 | asyncio.run(main()) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Building Your First Multimodal Gen AI App 🚀 2 | 3 | ## Introduction 4 | 5 | Welcome to the tutorial on building your first multimodal generative AI (Gen AI) app! This repository contains all the resources and code you need to get started with creating an app that can generate text, audio, images, and videos using various AI models and APIs. 
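If you want a feel for what this kind of composition looks like before setting anything up, the sketch below chains an LLM reply straight into a text-to-audio model. It mirrors the calls made in `multimodal-app/api.py` and the workshop notebook, and assumes `GROQ_API_KEY` and `REPLICATE_API_TOKEN` are exported as environment variables:

```python
# Sketch: pipe an LLM reply into a text-to-audio model.
# Assumes GROQ_API_KEY and REPLICATE_API_TOKEN are set in the environment.
import replicate
from groq import Groq

song = Groq().chat.completions.create(
    model="llama3-8b-8192",
    messages=[{"role": "user", "content": "a short pirate sea shanty"}],
).choices[0].message.content

audio_url = replicate.run(
    "x-lance/f5-tts:87faf6dd7a692dd82043f662e76369cab126a2cf1937e25a9d41e0b834fd230e",
    input={
        "gen_text": song,
        "ref_text": "never underestimate the power of the scout's code",
        "ref_audio": "https://replicate.delivery/pbxt/LnHEJTVWhjLcpGQJTBralyztLwl8diaLyHjP2a1KXJ8dxVWv/Teemo_Original_Taunt.ogg",
        "remove_silence": True,
        "custom_split_words": "",
    },
)
print(audio_url)  # URL of the generated audio file on Replicate
```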
6 | 
7 | ## Prerequisites
8 | 
9 | Before you begin, make sure you have the following:
10 | 
11 | - A GitHub account
12 | - GitHub Codespaces enabled (comes with your GitHub account)
13 | - API keys for the following:
14 |   - [Groq](https://groq.com/) or [OpenAI](https://platform.openai.com/playground) (at least one is required; Groq has a free tier for all the models we need!)
15 |   - [Replicate](https://replicate.com/) (necessary for full functionality; Replicate has kindly provided credits for those taking this workshop at a conference.)
16 | - Basic knowledge of Python and Bash
17 | 
18 | **Note about GitHub Codespaces:**
19 | - GitHub Codespaces is included with every GitHub account.
20 | - There's a substantial monthly free tier for personal accounts (120 core hours/month as of 2024).
21 | - If you exceed the free tier, you may need to purchase additional usage.
22 | - For the latest information on GitHub Codespaces pricing and usage limits, please check the [official GitHub documentation](https://docs.github.com/en/billing/managing-billing-for-github-codespaces/about-billing-for-github-codespaces).
23 | 
24 | **Note for Workshop Participants:** If you are taking this workshop at a conference or other event, please check with your instructors or teachers to see if they are providing the API keys for you.
25 | 
26 | ## Setting Up the Environment
27 | 
28 | To get up and running, you can watch the video below and/or follow the instructions (the setup instructions start at around 1:50, after a motivating demo):
29 | 
30 | 
31 | 
32 | https://github.com/user-attachments/assets/342b5ee3-c067-4407-9d51-96fab6488863
33 | 
34 | 
35 | 
36 | 
37 | ### Creating a GitHub Codespace
38 | 
39 | 1. Open the repository in GitHub.
40 | 2. Click on the `Code` button and select `Create codespace on main`.
41 | 3. Wait for the Codespace to spin up (this should take about 2 minutes).
42 | 
43 | ### Adding API Keys
44 | 
45 | 1. In the Codespace, navigate to the `.streamlit` directory inside the `multimodal-app` folder.
46 | 2. Open the `secrets.toml` file.
47 | 3. Add your API keys as follows:
48 | ```toml
49 | OPENAI_API_KEY = "your_openai_api_key"
50 | GROQ_API_KEY = "your_groq_api_key"
51 | REPLICATE_API_TOKEN = "your_replicate_api_token"
52 | ```
53 | 4. Save the file and ensure these keys are kept private and secure.
54 | 
55 | ### API Keys
56 | 
57 | You'll need an API key for either OpenAI or Groq, plus a Replicate API token for full functionality.
58 | 
59 | ### Setting Up the Poetry Environment
60 | 
61 | 1. Once the Codespace finishes configuring, it will automatically install Poetry.
62 | 2. In the Codespace terminal, activate the Poetry environment:
63 | ```bash
64 | cd multimodal-app
65 | poetry shell
66 | ```
67 | 
68 | ## Running the Application
69 | 
70 | To run the Streamlit app:
71 | 
72 | 1. Ensure you're in the `multimodal-app` directory and have activated the Poetry shell.
73 | 2. Run the following command:
74 | ```bash
75 | streamlit run main.py
76 | ```
77 | 3. Click "Open in browser" when prompted to view the app.
78 | 
79 | ## Using the Application
80 | 
81 | The multimodal Gen AI app allows you to:
82 | 
83 | 1. Record speech or type text input.
84 | 2. Transcribe speech to text.
85 | 3. Generate text responses based on your input.
86 | 4. Create audio versions of the text.
87 | 5. Generate images based on the content.
88 | 6. Create videos incorporating the generated content.
89 | 
90 | To use the app:
91 | 
92 | 1. Click the record button to speak, or type your input.
93 | 2. Click "Transcribe" to convert speech to text (if applicable).
94 | 3. Choose to run all tasks concurrently or step-by-step.
95 | 4. Explore the generated text, audio, images, and videos.
96 | 
97 | ## Troubleshooting
98 | 
99 | If you encounter any issues:
100 | 
101 | - Ensure all API keys are correctly entered in the `secrets.toml` file.
102 | - Check that you're in the correct directory (`multimodal-app`) when running commands.
103 | - Verify that all dependencies are installed by running `poetry install` if needed.
104 | 
105 | ## Contributing
106 | 
107 | We welcome contributions to improve this project! Please feel free to submit issues or pull requests.
108 | 
109 | ---
110 | 
111 | Happy building! We hope you enjoy creating your first multimodal Gen AI app. If you have any questions or feedback, please don't hesitate to reach out.
112 | 
--------------------------------------------------------------------------------
/multimodal-app/ui.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import streamlit as st
3 | from audio_recorder_streamlit import audio_recorder
4 | from api import transcribe_audio, text_to_audio, text_to_image, text_to_video
5 | from constants import *
6 | 
7 | def set_users_initial_prompt(col_handler):
8 |     text = col_handler.chat_input("Say something")
9 |     st.session_state["text"] = text
10 |     st.session_state.user_init_audio_bytes = audio_recorder(
11 |         recording_color="#e8b62c",
12 |         neutral_color="#6aa36f",
13 |         icon_name="microphone",
14 |         icon_size="2x",
15 |     )
16 |     if "user_init_audio_bytes" in st.session_state and st.session_state.user_init_audio_bytes is not None:
17 |         col_handler.audio(data=st.session_state["user_init_audio_bytes"], format="audio/wav")
18 |         if col_handler.button("Transcribe"):
19 |             transcription = transcribe_audio(st.session_state["user_init_audio_bytes"])
20 |             st.session_state["text"] = transcription.text
21 | 
22 | def display_user_info(col_handler):
23 |     with col_handler.chat_message("user", avatar="👤"):
24 |         col_handler.write(st.session_state["text"])
25 |         if st.session_state["user_audio_bytes"] is not None:
26 |             col_handler.audio(st.session_state["user_audio_bytes"])
27 |         if st.session_state["user_image_url"] is not None:
28 |             col_handler.image(st.session_state["user_image_url"])
29 |         if st.session_state["user_video_url"] is not None:
30 |             col_handler.video(st.session_state["user_video_url"])
31 | 
32 | def display_llm_info(col_handler, f):
33 |     with col_handler.chat_message("ai", avatar="🤖"):
34 |         if st.session_state["text_gen_stream_resp"] is None:
35 |             st.session_state["text_gen_stream_resp"] = col_handler.write_stream(
36 |                 f
37 |             )
38 |             st.rerun()
39 |         else:
40 |             col_handler.write(st.session_state["text_gen_stream_resp"])
41 |         if st.session_state["llm_audio_bytes"] is not None:
42 |             col_handler.audio(st.session_state["llm_audio_bytes"])
43 |         if st.session_state["llm_image_url"] is not None:
44 |             col_handler.image(st.session_state["llm_image_url"])
45 |         if st.session_state["llm_video_url"] is not None:
46 |             col_handler.video(st.session_state["llm_video_url"])
47 | 
48 | def display_audio_section(col_handler):
49 |     col_handler.subheader("Text-to-audio generation")
50 |     col_handler.write("Click the buttons to generate audio using F5-TTS (via Replicate)")
51 |     if not (st.session_state['running_audio_job'] or st.session_state['running_image_job'] or st.session_state['running_video_job']):
52 |         if col_handler.button("Generate audio for user prompt"):
53 |             return asyncio.create_task(text_to_audio(st.session_state["text"], "user"))
54 |         if
col_handler.button("Generate audio for AI prompt"):
55 |             return asyncio.create_task(text_to_audio(st.session_state["text_gen_stream_resp"], "llm"))
56 |     else:
57 |         col_handler.write('Please wait for current job to complete.')
58 |     return None
59 | 
60 | def display_image_section(col_handler):
61 |     col_handler.subheader("Text-to-image generation")
62 |     col_handler.write(
63 |         "Click the buttons to generate an image using {}".format(st.session_state.image_model)
64 |     )
65 |     negative_prompt = col_handler.text_area("Negative prompt", st.session_state['negative_prompt'])
66 |     st.session_state['negative_prompt'] = negative_prompt
67 |     if not (st.session_state['running_audio_job'] or st.session_state['running_image_job'] or st.session_state['running_video_job']):
68 |         if col_handler.button("Generate image for user prompt"):
69 |             return asyncio.create_task(text_to_image(st.session_state["text"], st.session_state['negative_prompt'], src="user"))
70 |         if col_handler.button("Generate image for AI prompt"):
71 |             return asyncio.create_task(text_to_image(st.session_state["text_gen_stream_resp"], st.session_state['negative_prompt'], src="llm"))
72 |     else:
73 |         col_handler.write('Please wait for current job to complete.')
74 |     return None
75 | 
76 | def display_video_section(col_handler):
77 |     col_handler.subheader("Text-to-video generation")
78 |     col_handler.write(
79 |         "Click the buttons to generate a video using {}".format(st.session_state.video_model)
80 |     )
81 |     if not (st.session_state['running_audio_job'] or st.session_state['running_image_job'] or st.session_state['running_video_job']):
82 |         if col_handler.button("Generate video for user prompt"):
83 |             return asyncio.create_task(text_to_video(st.session_state["text"], src="user"))
84 |         if col_handler.button("Generate video for AI prompt"):
85 |             return asyncio.create_task(text_to_video(st.session_state["text_gen_stream_resp"], src="llm"))
86 |     else:
87 |         col_handler.write('Please wait for current job to complete.')
88 |     return None
--------------------------------------------------------------------------------
/multimodal-app/api.py:
--------------------------------------------------------------------------------
1 | import io
2 | import time
3 | import uuid
4 | 
5 | import asyncio
6 | import aiohttp
7 | import replicate
8 | import pandas as pd
9 | import streamlit as st
10 | from openai import OpenAI
11 | from groq import Groq
12 | 
13 | from constants import *  # note: also brings `os` into scope (imported in constants.py)
14 | 
15 | def generate_text(text: str, model: str) -> str:
16 |     text_gen_response = ""
17 | 
18 |     if st.session_state.init_model_provider == "groq":
19 |         client = Groq()
20 |     elif st.session_state.init_model_provider == 'openai':
21 |         client = OpenAI()
22 | 
23 |     t0 = time.time()
24 |     completion = client.chat.completions.create(
25 |         model=model,
26 |         messages=[
27 |             {"role": "system", "content": st.session_state.text_gen_sys_prompt},
28 |             {"role": "user", "content": text},
29 |         ],
30 |         stream=True
31 |     )
32 |     for chunk in completion:
33 |         if chunk.usage is None and chunk.choices[0].delta.content is not None:
34 |             print(f"[DEBUG] Regular chunk: {chunk}")
35 |             text_gen_response += chunk.choices[0].delta.content
36 |             yield chunk.choices[0].delta.content
37 |         else:
38 |             print(f"[DEBUG] Final chunk: {chunk}")
39 |             data = {
40 |                 "prompt": text,
41 |                 "system_prompt": st.session_state.text_gen_sys_prompt,
42 |                 "response": text_gen_response,
43 |                 "model": model,
44 |                 "client_time": time.time() - t0,
45 |                 "date": pd.Timestamp.now()
46 |             }
47 |             df = pd.DataFrame(data, index=[0])
48 | 
st.session_state["text_gen_evals_df"] = pd.concat( 49 | [st.session_state["text_gen_evals_df"], df], ignore_index=True 50 | ) 51 | 52 | 53 | # Access the API key from secrets 54 | REPLICATE_API_KEY = st.secrets["REPLICATE_API_TOKEN"] 55 | client = replicate.Client(api_token=REPLICATE_API_KEY) 56 | 57 | 58 | async def text_to_audio(text: str, src: str) -> bytes: 59 | st.session_state["running_audio_job"] = True 60 | st.session_state[f"{src}_audio_bytes"] = None 61 | 62 | print(f"[DEBUG] Generating audio...") 63 | t0 = time.time() 64 | 65 | # Define the input parameters for the model 66 | input_params = { 67 | "prompt": text, 68 | "text_temp": 0.7, 69 | "output_full": False, 70 | "waveform_temp": 0.7, 71 | "history_prompt": "announcer", 72 | # "duration": 30 # Uncomment if you want to set a specific duration 73 | } 74 | 75 | try: 76 | # Run the model using Replicate API 77 | # Create a Replicate client instance with the API token 78 | 79 | 80 | output = client.run( 81 | "x-lance/f5-tts:87faf6dd7a692dd82043f662e76369cab126a2cf1937e25a9d41e0b834fd230e", 82 | input={ 83 | "gen_text": text, 84 | "ref_text": "never underestimate the power of the scout's code", 85 | "ref_audio": "https://replicate.delivery/pbxt/LnHEJTVWhjLcpGQJTBralyztLwl8diaLyHjP2a1KXJ8dxVWv/Teemo_Original_Taunt.ogg", 86 | "remove_silence": True, 87 | "custom_split_words": "" 88 | } 89 | ) 90 | 91 | tf = time.time() 92 | print(f"[DEBUG] text_to_audio request took {tf - t0:.2f} seconds") 93 | print(f"[DEBUG] Replicate API output: {output}") 94 | 95 | # Fetch the audio from the returned URL 96 | audio_url = output 97 | async with aiohttp.ClientSession() as session: 98 | async with session.get(audio_url) as audio_response: 99 | audio_bytes = await audio_response.read() 100 | 101 | out_dir = os.path.join(AUDIO_DATA_SINK, src) 102 | if not os.path.exists(out_dir): 103 | os.makedirs(out_dir) 104 | out_path = os.path.join(out_dir, f"{uuid.uuid4().hex}_audio.wav") 105 | 106 | with open(out_path, "wb") as f: 107 | f.write(audio_bytes) 108 | 109 | data = { 110 | "text": text, 111 | "date": pd.Timestamp.now(), 112 | "model": "suno-ai/bark", 113 | "provider": "Replicate", 114 | "client_time": tf - t0, 115 | } 116 | df = pd.DataFrame(data, index=[0]) 117 | st.session_state["audio_gen_evals_df"] = pd.concat( 118 | [st.session_state["audio_gen_evals_df"], df], ignore_index=True 119 | ) 120 | 121 | if src == 'user': 122 | st.session_state["user_audio_bytes"] = audio_bytes 123 | elif src == 'llm': 124 | st.session_state["llm_audio_bytes"] = audio_bytes 125 | 126 | st.session_state["running_audio_job"] = False 127 | 128 | except Exception as e: 129 | st.session_state["running_audio_job"] = False 130 | raise Exception(f"Request failed: {e}") 131 | 132 | async def text_to_image(text: str, negative_prompt: str, src: str = "human") -> str: 133 | st.session_state["running_image_job"] = True 134 | if st.session_state.image_model.startswith('stability-ai/stable-diffusion-3'): 135 | input = { 136 | "seed": 42, 137 | "prompt": text, 138 | "aspect_ratio": "3:2", 139 | "output_quality": 79, 140 | "negative_prompt": negative_prompt, 141 | } 142 | elif st.session_state.image_model.startswith('black-forest-labs/flux-dev'): 143 | input = { 144 | "prompt": text, 145 | "guidance": 3.5, 146 | "num_outputs": 1, 147 | "aspect_ratio": "1:1", 148 | "output_format": "webp", 149 | "output_quality": 80, 150 | "prompt_strength": 0.8 151 | } 152 | else: 153 | raise ValueError(f'Unsupported video model/version type {st.session_state.image_model}.') 154 | 155 | 
print(f"[DEBUG] Generating image...") 156 | t0 = time.time() 157 | loop = asyncio.get_event_loop() 158 | output = await loop.run_in_executor(None, replicate.run, st.session_state.image_model, input) 159 | tf = time.time() 160 | print(f"[DEBUG] text_to_image request took {tf - t0:.2f} seconds") 161 | 162 | if output and isinstance(output, list) and len(output) > 0: 163 | image_url = output[0] 164 | data = { 165 | "text": text, 166 | "negative_prompt": negative_prompt, 167 | "image_url": image_url, 168 | "date": pd.Timestamp.now(), 169 | "model": st.session_state.image_model, 170 | "provider": "Replicate", 171 | "client_time": tf - t0 172 | } 173 | df = pd.DataFrame(data, index=[0]) 174 | st.session_state["image_gen_evals_df"] = pd.concat( 175 | [st.session_state["image_gen_evals_df"], df], ignore_index=True 176 | ) 177 | if src == 'user': 178 | st.session_state["user_image_url"] = image_url 179 | elif src == 'llm': 180 | st.session_state["llm_image_url"] = image_url 181 | st.session_state["running_image_job"] = False 182 | else: 183 | st.session_state["running_image_job"] = False 184 | raise Exception("Text-to-image model did not return a valid URL.") 185 | 186 | async def text_to_video(text: str, src: str = "user") -> str: 187 | st.session_state.running_video_job = True 188 | if st.session_state.video_model.startswith('lucataco/hotshot-xl'): 189 | input = { 190 | "prompt": text, 191 | "mp4": True 192 | } 193 | elif st.session_state.video_model.startswith('deforum/deforum_stable_diffusion'): 194 | input = { 195 | "animation_prompts": text, 196 | "sampler": "klms", 197 | "max_frames": 100, 198 | } 199 | else: 200 | raise ValueError(f'Unsupported video model/version type {st.session_state.video_model}.') 201 | t0 = time.time() 202 | print("[DEBUG] Generating video...") 203 | loop = asyncio.get_event_loop() 204 | output = await loop.run_in_executor(None, replicate.run, st.session_state.video_model, input) 205 | tf = time.time() 206 | print("[DEBUG] Video generation complete. 
%s" % output) 207 | if output and isinstance(output, list) and len(output) > 0: 208 | video_url = output[0] 209 | elif output and isinstance(output, str): 210 | video_url = output 211 | data = { 212 | "text": text, 213 | "video_url": video_url, 214 | "date": pd.Timestamp.now(), 215 | "model": st.session_state.video_model, 216 | "provider": "Replicate", 217 | "client_time": tf - t0, 218 | } 219 | df = pd.DataFrame(data, index=[0]) 220 | st.session_state["video_gen_evals_df"] = pd.concat( 221 | [st.session_state["video_gen_evals_df"], df], ignore_index=True 222 | ) 223 | print("[DEBUG] Video URL:", video_url) 224 | if src == 'user': 225 | st.session_state["user_video_url"] = video_url 226 | elif src == 'llm': 227 | st.session_state["llm_video_url"] = video_url 228 | st.session_state["running_video_job"] = False 229 | 230 | def transcribe_audio(audio_data): 231 | try: 232 | if st.session_state.init_model_provider == "groq": 233 | client = Groq() 234 | model_wh = "whisper-large-v3" 235 | print("[DEBUG] Transcribing audio with groq/whisper-1...") 236 | elif st.session_state.init_model_provider == 'openai': 237 | client = OpenAI() 238 | model_wh = "whisper-1" 239 | print("[DEBUG] Transcribing audio with openai/whisper-1...") 240 | file_like = io.BytesIO(audio_data) 241 | file_like.name = "audio.wav" 242 | file_like.seek(0) 243 | 244 | t0 = time.time() 245 | transcription = client.audio.transcriptions.create( 246 | model=model_wh, 247 | file=file_like, 248 | response_format="verbose_json" 249 | ) 250 | tf = time.time() 251 | print(f"[DEBUG] Transcription took {tf - t0:.2f} seconds") 252 | print("Transcription", transcription) 253 | data = { 254 | "transcription": transcription.text, 255 | "duration": transcription.duration, 256 | "date": pd.Timestamp.now(), 257 | "model": "openai/whisper-1", 258 | "provider": "OpenAI", 259 | "client_time": tf - t0, 260 | "language": transcription.language, 261 | } 262 | df = pd.DataFrame(data, index=[0]) 263 | st.session_state["transcription_evals_df"] = pd.concat( 264 | [st.session_state["transcription_evals_df"], df], ignore_index=True 265 | ) 266 | 267 | print("[DEBUG] Transcription type:", type(transcription)) 268 | print("[DEBUG] Transcription:", transcription) 269 | return transcription 270 | 271 | except Exception as e: 272 | print(f"[ERROR] An error occurred during transcription: {e}") 273 | return None -------------------------------------------------------------------------------- /notebooks/models-everywhere.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Building Your First Multimodal GenAI App" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Welcome to the Workshop on Building Composable AI Systems 🚀\n", 15 | "\n", 16 | "In this workshop, we're diving into how to create powerful and flexible AI applications using a code-first approach. The world of AI is evolving rapidly, and we now have access to a multitude of models for generating text, images, audio, and more. 
But to really leverage the power of these models, we need to think about **composability**: how we can connect and automate AI capabilities in ways that fit our particular needs.\n",
17 |     "\n",
18 |     "### Why Use Code Interfaces Over Pre-Built Tools?\n",
19 |     "\n",
20 |     "You might wonder, \"Why not just use simple, web-based tools for these tasks?\" The answer lies in the power of customization, automation, and scalability. Let's explore some real-world scenarios:\n",
21 |     "\n",
22 |     "---\n",
23 |     "\n",
24 |     "### Scenario 1: Rapid Image Generation for a Marketing Team 🎨\n",
25 |     "Imagine you're part of a marketing team launching a new product. You need to generate dozens of image concepts quickly to test different styles and messages. Using a code-based approach, you can:\n",
26 |     "\n",
27 |     "- **Prompt** a language model to come up with creative image descriptions.\n",
28 |     "- **Generate** images from those descriptions using an image model.\n",
29 |     "- **Filter** and select the best images based on specific criteria.\n",
30 |     "- **Send** the top images to your team’s Slack or email for feedback.\n",
31 |     "\n",
32 |     "This automated workflow saves hours of manual work and ensures your team has more creative options to consider.\n",
33 |     "\n",
34 |     "\n",
35 |     "\n",
36 |     "### Scenario 2: Creating Background Music for Content Creators 🎶\n",
37 |     "You're a content creator working on a podcast or video series. Each episode needs custom background music that matches the theme and mood. With this approach, you can:\n",
38 |     "\n",
39 |     "- **Generate** multiple music samples using a generative audio model.\n",
40 |     "- **Adjust** the style, tempo, or instrumentation on the fly.\n",
41 |     "- **Save** and organize the best samples for your project.\n",
42 |     "\n",
43 |     "This allows you to experiment freely and find the perfect sound without having to rely on pre-made music libraries.\n",
44 |     "\n",
45 |     "\n",
46 |     "\n",
47 |     "### Scenario 3: Interactive Storytelling or Live Events 🎭\n",
48 |     "Imagine hosting a live event where you want to engage the audience with real-time storytelling. You can build an app that:\n",
49 |     "\n",
50 |     "- **Listens** to audience suggestions through voice input.\n",
51 |     "- **Generates** narrative twists or visual art based on those suggestions.\n",
52 |     "- **Plays** custom soundscapes to match the unfolding story.\n",
53 |     "\n",
54 |     "\n",
55 |     "\n",
56 |     "### Scenario 4: Team Collaboration & Idea Generation 💡\n",
57 |     "Consider a product team brainstorming new features for an app. Instead of manually jotting down ideas and creating prototypes, you can build a pipeline that:\n",
58 |     "\n",
59 |     "- **Generates** feature ideas using an LLM.\n",
60 |     "- **Creates** visual prototypes from those ideas using an image model.\n",
61 |     "- **Filters** and prioritizes the best concepts automatically.\n",
62 |     "\n",
63 |     "This speeds up the ideation process and allows your team to explore more possibilities in less time.\n",
64 |     "\n",
65 |     "### Workshop Overview 🛠️\n",
66 |     "\n",
67 |     "In this workshop, you will learn how to:\n",
68 |     "- **Compose** AI models to build custom applications.\n",
69 |     "- **Automate** workflows using AI-generated content.\n",
70 |     "- **Experiment** with different models for text, audio, and images.\n",
71 |     "- **Deploy** your own composable AI systems for real-world use cases.\n",
72 |     "\n",
73 |     "We’ll start with a high-level look at composability, inspired by the Unix philosophy: simple, modular components that can be easily combined.
Then, we'll get hands-on, building systems that turn your ideas into reality with minimal effort.\n",
74 |     "\n",
75 |     "Let's get started! 🚀"
76 |    ]
77 |   },
78 |   {
79 |    "cell_type": "markdown",
80 |    "metadata": {},
81 |    "source": [
82 |     "First note that these types of multimodal approaches are already available in apps, such as ChatGPT:"
83 |    ]
84 |   },
85 |   {
86 |    "cell_type": "code",
87 |    "execution_count": 51,
88 |    "metadata": {},
89 |    "outputs": [
90 |     {
91 |      "data": {
92 |       "text/html": [
93 |        "<video src=\"vid/Dream Advertising Marketing Campaign.mp4\" controls>\n",
94 |        "      Your browser does not support the <code>video</code> element.\n",
95 |        "    </video>"
96 |       ],
97 |       "text/plain": [
98 |        "<IPython.core.display.Video object>"
99 |       ]
100 |      },
101 |      "execution_count": 51,
102 |      "metadata": {},
103 |      "output_type": "execute_result"
104 |     }
105 |    ],
106 |    "source": [
107 |     "from IPython.display import Video\n",
108 |     "\n",
109 |     "Video(\"vid/Dream Advertising Marketing Campaign.mp4\")\n"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "markdown",
114 |    "metadata": {},
115 |    "source": [
116 |     "But this approach ties you to particular models:"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "markdown",
121 |    "metadata": {},
122 |    "source": [
123 |     "![Alt text](img/multimodal_app_1.png)"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "markdown",
128 |    "metadata": {},
129 |    "source": [
130 |     "In this notebook, we'll explore a variety of SOTA GenAI models and get a sense of how to stitch them together!"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "markdown",
135 |    "metadata": {},
136 |    "source": [
137 |     "![Alt text](img/composability.png)"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "markdown",
142 |    "metadata": {},
143 |    "source": [
144 |     "![Alt text](img/karpathy-austen.png)"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": 52,
150 |    "metadata": {},
151 |    "outputs": [
152 |     {
153 |      "data": {
154 |       "text/html": [
155 |        "<video src=\"vid/karpathy-austen.mp4\" controls>\n",
156 |        "      Your browser does not support the <code>video</code> element.\n",
157 |        "    </video>"
158 |       ],
159 |       "text/plain": [
160 |        "<IPython.core.display.Video object>"
161 |       ]
162 |      },
163 |      "execution_count": 52,
164 |      "metadata": {},
165 |      "output_type": "execute_result"
166 |     }
167 |    ],
168 |    "source": [
169 |     "from IPython.display import Video\n",
170 |     "\n",
171 |     "Video(\"vid/karpathy-austen.mp4\")\n"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "markdown",
176 |    "metadata": {},
177 |    "source": [
178 |     "![Alt text](img/unix-phil.png)"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "markdown",
183 |    "metadata": {},
184 |    "source": [
185 |     "![Alt text](img/apple-counterpoint.png)"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "markdown",
190 |    "metadata": {},
191 |    "source": [
192 |     "![Alt text](img/ai-evolution.png)"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "metadata": {},
198 |    "source": [
199 |     "![Alt text](img/importance-comp.png)"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "markdown",
204 |    "metadata": {},
205 |    "source": [
206 |     "![Alt text](img/comp-nec.png)"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "markdown",
211 |    "metadata": {},
212 |    "source": [
213 |     "## Get our API Keys in our environment"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "markdown",
218 |    "metadata": {},
219 |    "source": [
220 |     "- Create a [Groq](https://groq.com/) account and navigate [here to get your API key](https://console.groq.com/keys). They have a free tier with a bunch of LLMs (see screenshot below)!\n",
221 |     "- If you'd prefer to use OpenAI, you can do that and get [your API key here](https://platform.openai.com/api-keys).\n",
222 |     "- To use the models below as is, you'll need a [Replicate account](https://replicate.com/).
If you're using this notebook in a workshop, chances are Hugo is able to provision free Replicate credits for you, so ask him if he hasn't mentioned it.\n",
223 |     "- Many of these models [you can also find on HuggingFace](https://huggingface.co/models), if you'd prefer."
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "markdown",
228 |    "metadata": {},
229 |    "source": [
230 |     "![Alt text](img/multimodal_app_2.png)"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": 53,
236 |    "metadata": {},
237 |    "outputs": [
238 |     {
239 |      "name": "stdout",
240 |      "output_type": "stream",
241 |      "text": [
242 |       "Replicate API key captured successfully!\n",
243 |       "Groq API key captured successfully!\n"
244 |      ]
245 |     }
246 |    ],
247 |    "source": [
248 |     "import getpass\n",
249 |     "\n",
250 |     "\n",
251 |     "# Prompt for the Replicate API key\n",
252 |     "replicate_api_key = getpass.getpass(\"Please enter your Replicate API key: \")\n",
253 |     "print(\"Replicate API key captured successfully!\")\n",
254 |     "\n",
255 |     "# Prompt for the Groq API key\n",
256 |     "groq_api_key = getpass.getpass(\"Please enter your Groq API key: \")\n",
257 |     "print(\"Groq API key captured successfully!\")\n",
258 |     "\n",
259 |     "# # Prompt for the OpenAI API key\n",
260 |     "# openai_api_key = getpass.getpass(\"Please enter your OpenAI API key: \")\n",
261 |     "# print(\"OpenAI API key captured successfully!\")\n"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "markdown",
266 |    "metadata": {},
267 |    "source": [
268 |     "## F5-TTS: text to audio"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "markdown",
273 |    "metadata": {},
274 |    "source": [
275 |     "First up, we'll experiment with text-to-audio generation using the [F5-TTS](https://replicate.com/x-lance/f5-tts) voice-cloning model ([Suno Bark](https://github.com/suno-ai/bark) is a popular alternative):"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "code",
280 |    "execution_count": 54,
281 |    "metadata": {},
282 |    "outputs": [
283 |     {
284 |      "name": "stdout",
285 |      "output_type": "stream",
286 |      "text": [
287 |       "https://replicate.delivery/yhqm/wybNlPef3IvDRE4hiChxvdVS0n7snQZo55gs4bfdoXjyg6bnA/output.wav\n"
288 |      ]
289 |     }
290 |    ],
291 |    "source": [
292 |     "import replicate\n",
293 |     "\n",
294 |     "# Create a Replicate client instance with the API token\n",
295 |     "client = replicate.Client(api_token=replicate_api_key)\n",
296 |     "\n",
297 |     "output = client.run(\n",
298 |     "    \"x-lance/f5-tts:87faf6dd7a692dd82043f662e76369cab126a2cf1937e25a9d41e0b834fd230e\",\n",
299 |     "    input={\n",
300 |     "        \"gen_text\": \"captain hugo, on duty!\",\n",
301 |     "        \"ref_text\": \"never underestimate the power of the scout's code\",\n",
302 |     "        \"ref_audio\": \"https://replicate.delivery/pbxt/LnHEJTVWhjLcpGQJTBralyztLwl8diaLyHjP2a1KXJ8dxVWv/Teemo_Original_Taunt.ogg\",\n",
303 |     "        \"remove_silence\": True,\n",
304 |     "        \"custom_split_words\": \"\"\n",
305 |     "    }\n",
306 |     ")\n",
307 |     "print(output)"
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "markdown",
312 |    "metadata": {},
313 |    "source": [
314 |     "### LLM output --> F5-TTS"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "markdown",
319 |    "metadata": {},
320 |    "source": [
321 |     "But what if we want to pipe the output of an LLM into the text-to-audio model?"
322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 55, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "from groq import Groq\n", 331 | "\n", 332 | "def get_llm_response(user_input):\n", 333 | " client = Groq(\n", 334 | " api_key=groq_api_key)\n", 335 | "\n", 336 | " response = client.chat.completions.create(\n", 337 | " messages=[\n", 338 | " {\n", 339 | " \"role\": \"user\",\n", 340 | " \"content\": user_input,\n", 341 | " }\n", 342 | " ],\n", 343 | " model=\"llama3-8b-8192\",\n", 344 | " )\n", 345 | "\n", 346 | " return response.choices[0].message.content\n", 347 | "\n", 348 | "# from openai import OpenAI\n", 349 | "# import os\n", 350 | "\n", 351 | "\n", 352 | "# def get_llm_response(user_input):\n", 353 | "# client = OpenAI(api_key=openai_api_key)\n", 354 | " \n", 355 | "# response = client.chat.completions.create(\n", 356 | "# model=\"gpt-3.5-turbo-0613\",\n", 357 | "# messages=[\n", 358 | "# {\"role\": \"user\", \"content\": user_input}\n", 359 | "# ]\n", 360 | "# )\n", 361 | "# return response.choices[0].message.content" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 56, 367 | "metadata": {}, 368 | "outputs": [ 369 | { 370 | "name": "stdout", 371 | "output_type": "stream", 372 | "text": [ 373 | "Here is a short pirate sea shanty:\n", 374 | "\n", 375 | "(sung to the tune of \"What Shall We Do with a Drunken Sailor\")\n", 376 | "\n", 377 | "Oh, the captain stands on the quarterdeck high\n", 378 | "With his trusty compass and an eye on the sky\n", 379 | "He's searchin' for treasure, plunder and gold\n", 380 | "And a crew to sing along as we sail to old\n", 381 | "\n", 382 | "We'll hoist the Jolly Roger, let it wave in the breeze\n", 383 | "And sing and shout and drink our grog with ease\n", 384 | "We'll dance on the deck and play our sea shanty tune\n", 385 | "For we're pirates bold, on the ocean we've been known\n", 386 | "\n", 387 | "Oh, we'll ride the waves and sing as we go\n", 388 | "On our journey to the Caribbean from the Bosphorus, you know\n", 389 | "We'll drink and we'll jest and we'll tell our yarns\n", 390 | "For we're pirates, and the sea is our home, and we swear no barns!\n", 391 | "\n", 392 | "Yarrr!\n" 393 | ] 394 | } 395 | ], 396 | "source": [ 397 | "song = get_llm_response(\"a short pirates sea shanty\")\n", 398 | "print(song)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 57, 404 | "metadata": {}, 405 | "outputs": [ 406 | { 407 | "name": "stdout", 408 | "output_type": "stream", 409 | "text": [ 410 | "https://replicate.delivery/yhqm/vx3JskrK62KeDaDSbD3fgCuv04yMDS4XNq0j8fcvA3Slh6bnA/output.wav\n" 411 | ] 412 | } 413 | ], 414 | "source": [ 415 | "output = client.run(\n", 416 | " \"x-lance/f5-tts:87faf6dd7a692dd82043f662e76369cab126a2cf1937e25a9d41e0b834fd230e\",\n", 417 | " input={\n", 418 | " \"gen_text\": song,\n", 419 | " \"ref_text\": \"never underestimate the power of the scout's code\",\n", 420 | " \"ref_audio\": \"https://replicate.delivery/pbxt/LnHEJTVWhjLcpGQJTBralyztLwl8diaLyHjP2a1KXJ8dxVWv/Teemo_Original_Taunt.ogg\",\n", 421 | " \"remove_silence\": True,\n", 422 | " \"custom_split_words\": \"\"\n", 423 | " }\n", 424 | ")\n", 425 | "print(output)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "## Text to music w/ meta musicgen" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": {}, 438 | "source": [ 439 | "What if we wanted to create some music with text? 
Let's try Musicgen from Meta." 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 58, 445 | "metadata": {}, 446 | "outputs": [ 447 | { 448 | "name": "stdout", 449 | "output_type": "stream", 450 | "text": [ 451 | "https://replicate.delivery/yhqm/MBBfVjxfEgjgc0yXBDICx5xDQIZaD8jI2OHfS5qG0u2zi6bnA/out.mp3\n" 452 | ] 453 | } 454 | ], 455 | "source": [ 456 | "input = {\n", 457 | " \"prompt\": \"Horns and Drums. Edo25 major g melodies that sound triumphant and cinematic. Leading up to a crescendo that resolves in a 9th harmonic\",\n", 458 | " \"model_version\": \"stereo-large\",\n", 459 | " \"output_format\": \"mp3\",\n", 460 | " \"normalization_strategy\": \"peak\"\n", 461 | "}\n", 462 | "\n", 463 | "output = client.run(\n", 464 | " \"meta/musicgen:671ac645ce5e552cc63a54a2bbff63fcf798043055d2dac5fc9e36a837eedcfb\",\n", 465 | " input=input\n", 466 | ")\n", 467 | "print(output)\n", 468 | "#=> \"https://replicate.delivery/pbxt/OeLYIQiltdzMaCex1shlEFy6..." 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 59, 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "https://replicate.delivery/yhqm/xvXTf1MqtMTXUCmJKeECFHhgjaYqp6svezbHf0bCfon9TqvdC/out.mp3\n" 481 | ] 482 | } 483 | ], 484 | "source": [ 485 | "input = {\n", 486 | " \"prompt\": \"Ancient Trip Hop with Throat Singing\",\n", 487 | " \"model_version\": \"stereo-large\",\n", 488 | " \"output_format\": \"mp3\",\n", 489 | " \"normalization_strategy\": \"peak\",\n", 490 | " \"duration\": 30 \n", 491 | "}\n", 492 | "\n", 493 | "output = client.run(\n", 494 | " \"meta/musicgen:671ac645ce5e552cc63a54a2bbff63fcf798043055d2dac5fc9e36a837eedcfb\",\n", 495 | " input=input\n", 496 | ")\n", 497 | "print(output)\n", 498 | "#=> \"https://replicate.delivery/pbxt/OeLYIQiltdzMaCex1shlEFy6..." 
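,
    "\n",
    "# To keep a local copy of the generated track (hypothetical file name):\n",
    "# import urllib.request; urllib.request.urlretrieve(output, \"ancient_trip_hop.mp3\")"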
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Text to music with Riffusion"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There are lots of other models to experiment with, such as Riffusion:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'audio': 'https://replicate.delivery/czjl/KJn0JSKssfyflUaYNfHveLeeVEXwYQJc1edZGntKNhfeXl6bnA/gen_sound.wav', 'spectrogram': 'https://replicate.delivery/czjl/ykuCE1ZPVRL3JtnwGf3MoBW5e461pAvhDfh1uWUDEt3Xl6bnA/spectrogram.jpg'}\n"
     ]
    }
   ],
   "source": [
    "# Riffusion interpolates between two prompts; `alpha` sets the blend\n",
    "output = client.run(\n",
    "    \"riffusion/riffusion:8cf61ea6c56afd61d8f5b9ffd14d7c216c0a93844ce2d82ac1c9ecc9c7f24e05\",\n",
    "    input={\n",
    "        \"alpha\": 0.5,\n",
    "        \"prompt_a\": \"West African Desert Blues\",\n",
    "        \"prompt_b\": \"Throat Singing\",\n",
    "        \"denoising\": 0.75,\n",
    "        \"seed_image_id\": \"vibes\",\n",
    "        \"num_inference_steps\": 50\n",
    "    }\n",
    ")\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "___"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Experiment: One prompt to many models"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now what if we wanted to use a single prompt to create text, audio, images, and video?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "message = \"The Waffle House is really messing up the pancakes and bacon tonight HOLY MOLEY and there's anarchist jazz also!\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### text to image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['https://replicate.delivery/pbxt/LfupBRuofqjyp08abH0BgfbeJIWdqI323XqVyc8WUmxiN13OB/R8__00001_.webp']\n"
     ]
    }
   ],
   "source": [
    "input = {\n",
    "    \"prompt\": message\n",
    "}\n",
    "\n",
    "output = client.run(\n",
    "    \"fofr/epicrealismxl-lightning-hades:0ca10b1fd361c1c5568720736411eaa89d9684415eb61fd36875b4d3c20f605a\",\n",
    "    input=input\n",
    ")\n",
    "print(output)"
   ]
  },
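  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The model returns a list of image URLs. A quick sketch to preview the first one inline (assuming `output` is the list printed above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import Image, display\n",
    "\n",
    "# Preview the generated image inline (sketch; assumes `output` is the list\n",
    "# of URLs printed above).\n",
    "display(Image(url=output[0]))"
   ]
  },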
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### text to audio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"The Waffle House is really messing up the pancakes and bacon tonight HOLY MOLEY and there's anarchist jazz also!\""
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "message"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://replicate.delivery/yhqm/MOCU8DxsSmqzHZzoVffRremzP4Fnabl75tHc4nkcuUS0o6bnA/output.wav\n"
     ]
    }
   ],
   "source": [
    "output = client.run(\n",
    "    \"x-lance/f5-tts:87faf6dd7a692dd82043f662e76369cab126a2cf1937e25a9d41e0b834fd230e\",\n",
    "    input={\n",
    "        \"gen_text\": message,\n",
    "        \"ref_text\": \"never underestimate the power of the scout's code\",\n",
    "        \"ref_audio\": \"https://replicate.delivery/pbxt/LnHEJTVWhjLcpGQJTBralyztLwl8diaLyHjP2a1KXJ8dxVWv/Teemo_Original_Taunt.ogg\",\n",
    "        \"remove_silence\": True,\n",
    "        \"custom_split_words\": \"\"\n",
    "    }\n",
    ")\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### text to music"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://replicate.delivery/yhqm/fKPRqJ13XEy3PaiemzVftJhavz5bydhOLjvMjQzoHvrCr6bnA/out.mp3\n"
     ]
    }
   ],
   "source": [
    "input = {\n",
    "    \"prompt\": message,\n",
    "    \"model_version\": \"stereo-large\",\n",
    "    \"output_format\": \"mp3\",\n",
    "    \"normalization_strategy\": \"peak\",\n",
    "    \"duration\": 30\n",
    "}\n",
    "\n",
    "output = client.run(\n",
    "    \"meta/musicgen:671ac645ce5e552cc63a54a2bbff63fcf798043055d2dac5fc9e36a837eedcfb\",\n",
    "    input=input\n",
    ")\n",
    "print(output)"
   ]
  },
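  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`client.run` blocks until the model finishes, which adds up when one prompt fans out to several models. The Replicate client can also start a prediction and let you poll it. A sketch using the same `client`, `message`, and MusicGen version as above (the exact API surface can vary between `replicate` releases):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "# Start a prediction without blocking, then poll until it settles (sketch;\n",
    "# assumes `client` and `message` from the cells above).\n",
    "prediction = client.predictions.create(\n",
    "    version=\"671ac645ce5e552cc63a54a2bbff63fcf798043055d2dac5fc9e36a837eedcfb\",\n",
    "    input={\"prompt\": message, \"model_version\": \"stereo-large\"},\n",
    ")\n",
    "\n",
    "while prediction.status not in {\"succeeded\", \"failed\", \"canceled\"}:\n",
    "    time.sleep(2)\n",
    "    prediction.reload()\n",
    "\n",
    "print(prediction.status, prediction.output)"
   ]
  },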
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Many models at once"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's write some utility functions that wrap these models:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_epic_realism(prompt, api_token):\n",
    "    # Create a Replicate client instance with the API token\n",
    "    client = replicate.Client(api_token=api_token)\n",
    "\n",
    "    # Define the input parameters for the model\n",
    "    input_data = {\n",
    "        \"prompt\": prompt\n",
    "    }\n",
    "\n",
    "    # Run the model using the Replicate API\n",
    "    output = client.run(\n",
    "        \"fofr/epicrealismxl-lightning-hades:0ca10b1fd361c1c5568720736411eaa89d9684415eb61fd36875b4d3c20f605a\",\n",
    "        input=input_data\n",
    "    )\n",
    "\n",
    "    return output\n",
    "\n",
    "\n",
    "def generate_music_gen(prompt, api_token, duration=30, model_version=\"stereo-large\", output_format=\"mp3\", normalization_strategy=\"peak\"):\n",
    "    # Create a Replicate client instance with the API token\n",
    "    client = replicate.Client(api_token=api_token)\n",
    "\n",
    "    # Define the input parameters for the model\n",
    "    input_data = {\n",
    "        \"prompt\": prompt,\n",
    "        \"model_version\": model_version,\n",
    "        \"output_format\": output_format,\n",
    "        \"normalization_strategy\": normalization_strategy,\n",
    "        \"duration\": duration\n",
    "    }\n",
    "\n",
    "    # Run the model using the Replicate API\n",
    "    output = client.run(\n",
    "        \"meta/musicgen:671ac645ce5e552cc63a54a2bbff63fcf798043055d2dac5fc9e36a837eedcfb\",\n",
    "        input=input_data\n",
    "    )\n",
    "\n",
    "    return output\n",
    "\n",
    "\n",
    "def generate_suno_bark(prompt, api_token, text_temp=0.7, output_full=False, waveform_temp=0.7, history_prompt=\"announcer\"):\n",
    "    # Create a Replicate client instance with the API token\n",
    "    client = replicate.Client(api_token=api_token)\n",
    "\n",
    "    # Define the input parameters for the model\n",
    "    input_params = {\n",
    "        \"prompt\": prompt,\n",
    "        \"text_temp\": text_temp,\n",
    "        \"output_full\": output_full,\n",
    "        \"waveform_temp\": waveform_temp,\n",
    "        \"history_prompt\": history_prompt,\n",
    "    }\n",
    "\n",
    "    # Run the model using the Replicate API\n",
    "    try:\n",
    "        output = client.run(\n",
    "            \"suno-ai/bark:b76242b40d67c76ab6742e987628a2a9ac019e11d56ab96c4e91ce03b79b2787\",\n",
    "            input=input_params\n",
    "        )\n",
    "        return output\n",
    "    except Exception as e:\n",
    "        print(f\"Error: {e}\")\n",
    "        return None\n",
    "\n",
    "\n",
    "def generate_f5(prompt, api_token):\n",
    "    # Create a Replicate client instance with the API token\n",
    "    client = replicate.Client(api_token=api_token)\n",
    "    output = client.run(\n",
    "        \"x-lance/f5-tts:87faf6dd7a692dd82043f662e76369cab126a2cf1937e25a9d41e0b834fd230e\",\n",
    "        input={\n",
    "            \"gen_text\": prompt,\n",
    "            \"ref_text\": \"never underestimate the power of the scout's code\",\n",
    "            \"ref_audio\": \"https://replicate.delivery/pbxt/LnHEJTVWhjLcpGQJTBralyztLwl8diaLyHjP2a1KXJ8dxVWv/Teemo_Original_Taunt.ogg\",\n",
    "            \"remove_silence\": True,\n",
    "            \"custom_split_words\": \"\"\n",
    "        }\n",
    "    )\n",
    "    return output"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's test them out:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['https://replicate.delivery/pbxt/iPF40CjW9CIvNBhKCzO6uyfcU3HjjyDenf6d2iTWen4Jb13OB/R8__00001_.webp']\n",
      "https://replicate.delivery/yhqm/DeYnpvdxtkWpZ6xNIfXzfesg5WWJbiqviFS3HgOomKuzi13OB/output.wav\n",
      "https://replicate.delivery/yhqm/2QbO5lBMp3LlE1d3hBuF9EIYf4ETGW43EDI3i2wLg955setTA/out.mp3\n"
     ]
    }
   ],
   "source": [
    "message = \"crazy wild zombie party at the blaring symphony orchestra\"\n",
    "output = generate_epic_realism(message, replicate_api_key)\n",
    "print(output)\n",
    "\n",
    "output = generate_f5(message, replicate_api_key)\n",
    "print(output)\n",
    "\n",
    "output = generate_music_gen(message, replicate_api_key)\n",
    "print(output)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epic Realism Output:\n",
      "['https://replicate.delivery/pbxt/i04ppXTefyl3m05U5nizQkY8L1qE0wc3JIe5OyOTDztu26bnA/R8__00001_.webp']\n",
      "Meta MusicGen Output:\n",
      "https://replicate.delivery/yhqm/jOIw2FwqmDIROFcBsPgjKTiovMCeWXWUNCS975lG6MDUuetTA/out.mp3\n",
      "F5-TTS Output:\n",
      "https://replicate.delivery/yhqm/0s8HPb8LgmImGtIubO1qtlG5UVchfi0SHa7lAeiweE4X76bnA/output.wav\n"
     ]
    }
   ],
   "source": [
    "# One prompt, three models (reusing `replicate_api_key` from above)\n",
    "message = \"The Waffle House messing it up for real with the pancakes and bacon and punk abstract jazz, yo!\"\n",
    "\n",
    "# Run the Epic Realism model\n",
    "epicrealism_output = generate_epic_realism(message, replicate_api_key)\n",
    "print(\"Epic Realism Output:\")\n",
    "print(epicrealism_output)\n",
    "\n",
    "# Run the Meta MusicGen model\n",
    "musicgen_output = generate_music_gen(message, replicate_api_key)\n",
    "print(\"Meta MusicGen Output:\")\n",
    "print(musicgen_output)\n",
    "\n",
    "# Run the F5-TTS model\n",
    "f5_output = generate_f5(message, replicate_api_key)\n",
    "print(\"F5-TTS Output:\")\n",
    "print(f5_output)\n"
   ]
  },
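  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So far the calls still run one after another. Since they are network-bound, a thread pool is enough to launch them genuinely at once. A sketch using the utility functions, `message`, and `replicate_api_key` from above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from concurrent.futures import ThreadPoolExecutor\n",
    "\n",
    "# Fan one prompt out to all three models concurrently (sketch; assumes the\n",
    "# utility functions and `replicate_api_key` defined above).\n",
    "with ThreadPoolExecutor() as pool:\n",
    "    futures = {\n",
    "        \"image\": pool.submit(generate_epic_realism, message, replicate_api_key),\n",
    "        \"speech\": pool.submit(generate_f5, message, replicate_api_key),\n",
    "        \"music\": pool.submit(generate_music_gen, message, replicate_api_key),\n",
    "    }\n",
    "    for name, future in futures.items():\n",
    "        print(name, future.result())"
   ]
  },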
"print(musicgen_output)\n", 894 | "\n", 895 | "# Run the Suno Bark model\n", 896 | "bark_output = generate_f5(message, replicate_api_key)\n", 897 | "print(\"Suno Bark Output:\")\n", 898 | "print(bark_output)\n", 899 | "\n" 900 | ] 901 | }, 902 | { 903 | "cell_type": "markdown", 904 | "metadata": {}, 905 | "source": [ 906 | "### Experiment: text to video" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": 69, 912 | "metadata": {}, 913 | "outputs": [ 914 | { 915 | "name": "stdout", 916 | "output_type": "stream", 917 | "text": [ 918 | "https://replicate.delivery/yhqm/hOb98yie1egiX0nrtfm7pRIxjHGOKFDAtNykfdExfse6HYf2JA/out.mp4\n" 919 | ] 920 | } 921 | ], 922 | "source": [ 923 | "message = \"The Waffle House messing it up for real with the pancakes and bacon and punk abstract jazz, yo!\"\n", 924 | "\n", 925 | "input = {\n", 926 | " \"sampler\": \"klms\",\n", 927 | " \"max_frames\": 100,\n", 928 | " \"animation_prompts\": message\n", 929 | "}\n", 930 | "\n", 931 | "output = client.run(\n", 932 | " \"deforum/deforum_stable_diffusion:e22e77495f2fb83c34d5fae2ad8ab63c0a87b6b573b6208e1535b23b89ea66d6\",\n", 933 | " input=input\n", 934 | ")\n", 935 | "print(output)\n", 936 | "#=> \"https://replicate.delivery/mgxm/873a1cc7-0427-4e8d-ab3c-..." 937 | ] 938 | }, 939 | { 940 | "cell_type": "markdown", 941 | "metadata": {}, 942 | "source": [ 943 | "![Alt text](img/today.png)" 944 | ] 945 | }, 946 | { 947 | "cell_type": "markdown", 948 | "metadata": {}, 949 | "source": [ 950 | "![Alt text](img/explore.png)" 951 | ] 952 | }, 953 | { 954 | "cell_type": "markdown", 955 | "metadata": {}, 956 | "source": [] 957 | }, 958 | { 959 | "cell_type": "markdown", 960 | "metadata": {}, 961 | "source": [ 962 | "![Alt text](img/models.png)" 963 | ] 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": 70, 968 | "metadata": {}, 969 | "outputs": [ 970 | { 971 | "name": "stdout", 972 | "output_type": "stream", 973 | "text": [ 974 | "https://replicate.delivery/yhqm/l21N0GeER0z3KqcMwfxznap2QKF0oBD3mjVzLxIgDUUjh9tTA/output.wav\n" 975 | ] 976 | } 977 | ], 978 | "source": [ 979 | "# Create a Replicate client instance with the API token\n", 980 | "client = replicate.Client(api_token=replicate_api_key)\n", 981 | "\n", 982 | "output = client.run(\n", 983 | " \"x-lance/f5-tts:87faf6dd7a692dd82043f662e76369cab126a2cf1937e25a9d41e0b834fd230e\",\n", 984 | " input={\n", 985 | " \"gen_text\": \"captain hugo, on duty!\",\n", 986 | " \"ref_text\": \"never underestimate the power of the scout's code\",\n", 987 | " \"ref_audio\": \"https://replicate.delivery/pbxt/LnHEJTVWhjLcpGQJTBralyztLwl8diaLyHjP2a1KXJ8dxVWv/Teemo_Original_Taunt.ogg\",\n", 988 | " \"remove_silence\": True,\n", 989 | " \"custom_split_words\": \"\"\n", 990 | " }\n", 991 | ")\n", 992 | "print(output)" 993 | ] 994 | }, 995 | { 996 | "cell_type": "code", 997 | "execution_count": null, 998 | "metadata": {}, 999 | "outputs": [], 1000 | "source": [] 1001 | } 1002 | ], 1003 | "metadata": { 1004 | "kernelspec": { 1005 | "display_name": "experiments-in-ai-UifnU4Ym-py3.11", 1006 | "language": "python", 1007 | "name": "python3" 1008 | }, 1009 | "language_info": { 1010 | "codemirror_mode": { 1011 | "name": "ipython", 1012 | "version": 3 1013 | }, 1014 | "file_extension": ".py", 1015 | "mimetype": "text/x-python", 1016 | "name": "python", 1017 | "nbconvert_exporter": "python", 1018 | "pygments_lexer": "ipython3", 1019 | "version": "3.11.9" 1020 | } 1021 | }, 1022 | "nbformat": 4, 1023 | "nbformat_minor": 2 1024 | } 1025 | 
--------------------------------------------------------------------------------