├── notebooks
│   ├── img
│   │   ├── models.png
│   │   ├── today.png
│   │   ├── comp-nec.png
│   │   ├── explore.png
│   │   ├── unix-phil.png
│   │   ├── ai-evolution.png
│   │   ├── composability.png
│   │   ├── importance-comp.png
│   │   ├── karpathy-austen.png
│   │   ├── multimodal_app_1.png
│   │   ├── multimodal_app_2.png
│   │   └── apple-counterpoint.png
│   ├── vid
│   │   ├── karpathy-austen.mp4
│   │   └── Dream Advertising Marketing Campaign.mp4
│   └── models-everywhere.ipynb
├── multimodal-app
│   ├── .streamlit
│   │   └── secrets.toml
│   ├── README.md
│   ├── pages
│   │   ├── 00_Text_Generation_Evals.py
│   │   ├── 01_Audio_Generation_Evals.py
│   │   ├── 02_Image_Generation_Evals.py
│   │   ├── 03_Video_Generation_Evals.py
│   │   └── 04_Transcription_Evals.py
│   ├── constants.py
│   ├── utils.py
│   ├── main.py
│   ├── ui.py
│   └── api.py
├── pyproject.toml
├── LICENSE
├── .gitignore
└── README.md

/notebooks/img/models.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/models.png
--------------------------------------------------------------------------------
/notebooks/img/today.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/today.png
--------------------------------------------------------------------------------
/notebooks/img/comp-nec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/comp-nec.png
--------------------------------------------------------------------------------
/notebooks/img/explore.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/explore.png
--------------------------------------------------------------------------------
/notebooks/img/unix-phil.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/unix-phil.png
--------------------------------------------------------------------------------
/notebooks/img/ai-evolution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/ai-evolution.png
--------------------------------------------------------------------------------
/notebooks/img/composability.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/composability.png
--------------------------------------------------------------------------------
/multimodal-app/.streamlit/secrets.toml:
--------------------------------------------------------------------------------
1 | GROQ_API_KEY = 'XXX'
2 | OPENAI_API_KEY = 'XXX'
3 | REPLICATE_API_TOKEN = 'XXX'
4 | HF_API_KEY = 'XXX'
--------------------------------------------------------------------------------
/notebooks/img/importance-comp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/importance-comp.png
--------------------------------------------------------------------------------
/notebooks/img/karpathy-austen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/karpathy-austen.png
--------------------------------------------------------------------------------
/notebooks/img/multimodal_app_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/multimodal_app_1.png
--------------------------------------------------------------------------------
/notebooks/img/multimodal_app_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/multimodal_app_2.png
--------------------------------------------------------------------------------
/notebooks/vid/karpathy-austen.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/vid/karpathy-austen.mp4
--------------------------------------------------------------------------------
/notebooks/img/apple-counterpoint.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/img/apple-counterpoint.png
--------------------------------------------------------------------------------
/multimodal-app/README.md:
--------------------------------------------------------------------------------
1 | Before installing Python dependencies, you need to install the `portaudio` library. On macOS:
2 | 
3 | ```bash
4 | brew install portaudio
5 | ```
6 | 
7 | On Debian/Ubuntu (including GitHub Codespaces), the equivalent package is `portaudio19-dev`.
--------------------------------------------------------------------------------
/multimodal-app/pages/00_Text_Generation_Evals.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | 
3 | st.markdown("# Text Generation Evals")
4 | st.dataframe(st.session_state["text_gen_evals_df"])
5 | 
--------------------------------------------------------------------------------
/multimodal-app/pages/01_Audio_Generation_Evals.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | 
3 | st.markdown("# Audio Generation Evals")
4 | st.dataframe(st.session_state["audio_gen_evals_df"])
5 | 
--------------------------------------------------------------------------------
/multimodal-app/pages/02_Image_Generation_Evals.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | 
3 | st.markdown("# Image Generation Evals")
4 | st.dataframe(st.session_state["image_gen_evals_df"])
5 | 
--------------------------------------------------------------------------------
/multimodal-app/pages/03_Video_Generation_Evals.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | 
3 | st.markdown("# Video Generation Evals")
4 | st.dataframe(st.session_state["video_gen_evals_df"])
5 | 
--------------------------------------------------------------------------------
/multimodal-app/pages/04_Transcription_Evals.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | 
3 | st.markdown("# Transcription Evals")
4 | st.dataframe(st.session_state["transcription_evals_df"])
5 | 
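The five evals pages above all follow the same pattern: a header plus the matching session-state dataframe populated by `api.py`. Below is a sketch of an optional extension for one of them; the CSV download button is a hypothetical addition, not part of the current pages (`st.download_button` is a standard Streamlit API):

```python
import streamlit as st

st.markdown("# Transcription Evals")
df = st.session_state["transcription_evals_df"]
st.dataframe(df)

# Hypothetical addition: export the collected evals for offline analysis.
if not df.empty:
    st.download_button(
        label="Download as CSV",
        data=df.to_csv(index=False).encode("utf-8"),
        file_name="transcription_evals.csv",
        mime="text/csv",
    )
```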
-------------------------------------------------------------------------------- /notebooks/vid/Dream Advertising Marketing Campaign.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pietroppeter/first-multimodal-genAI-app/main/notebooks/vid/Dream Advertising Marketing Campaign.mp4 -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "experiments-in-ai" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["hugobowne "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.11" 10 | jupyter = "^1.0.0" 11 | pandas = "^2.2.2" 12 | matplotlib = "^3.9.0" 13 | seaborn = "^0.13.2" 14 | transformers = "^4.41.2" 15 | torch = "^2.3.1" 16 | scikit-learn = "^1.5.0" 17 | datasets = "^2.19.2" 18 | diffusers = "^0.28.2" 19 | openai = "^1.33.0" 20 | pip = "^24.0" 21 | ai21 = "^2.4.2" 22 | replicate = "^0.26.0" 23 | streamlit = "^1.35.0" 24 | huggingface-hub = "^0.23.3" 25 | audio-recorder-streamlit = "^0.0.10" 26 | scipy = "^1.14.0" 27 | groq = "^0.9.0" 28 | 29 | 30 | [build-system] 31 | requires = ["poetry-core"] 32 | build-backend = "poetry.core.masonry.api" 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Hugo Bowne-Anderson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /multimodal-app/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | DEFAULT_TEXT_GEN_SYSTEM_PROMPT = ( 4 | "You are a master storyteller, songwriter, and creator in a world where words shape reality. Your purpose is to generate responses that are imaginative, vivid, and captivating. Whether the user provides a simple prompt, a detailed scenario, or a fantastical idea, you will craft a response that brings their song to life in an entertaining and engaging way. Be creative, be descriptive, and always aim to surprise and delight with your short and rhythmic responses. Write a four line poem based on the user prompt, use adlibs, and make it fun and full of ♪ symbols to help downstream models know you are singing!" 
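    # The ♪ symbols double as a hint to the downstream text-to-audio models
    # that these lines are meant to be sung rather than spoken.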
5 | ) 6 | DEFAULT_IMAGE_GEN_NEGATIVE_PROMPT = "Sad, dark, and gloomy image." 7 | 8 | # Set up API URLs and headers 9 | # HF_BARK_ENDPOINT = "https://api-inference.huggingface.co/models/suno/bark" 10 | # bark_api_headers = {"Authorization": f"Bearer {os.environ['HF_API_KEY']}"} 11 | 12 | REPLICATE_IMAGE_MODEL_ID_LS = [ 13 | "black-forest-labs/flux-dev", 14 | "stability-ai/stable-diffusion-3", 15 | ] 16 | REPLICATE_VIDEO_MODEL_ID_LS = [ 17 | "lucataco/hotshot-xl:78b3a6257e16e4b241245d65c8b2b81ea2e1ff7ed4c55306b511509ddbfd327a", 18 | "deforum/deforum_stable_diffusion:e22e77495f2fb83c34d5fae2ad8ab63c0a87b6b573b6208e1535b23b89ea66d6", 19 | ] 20 | 21 | # Sinks 22 | AUDIO_DATA_SINK = os.path.join(os.path.dirname(__file__), "audio") -------------------------------------------------------------------------------- /multimodal-app/utils.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | from constants import DEFAULT_IMAGE_GEN_NEGATIVE_PROMPT 4 | 5 | def init_session_state(): 6 | if "text" not in st.session_state: 7 | st.session_state["text"] = None 8 | if "text_gen_stream_resp" not in st.session_state: 9 | st.session_state["text_gen_stream_resp"] = None 10 | if "text_gen_evals_df" not in st.session_state: 11 | st.session_state["text_gen_evals_df"] = pd.DataFrame() 12 | if "user_audio_bytes" not in st.session_state: 13 | st.session_state["user_audio_bytes"] = None 14 | if "llm_audio_bytes" not in st.session_state: 15 | st.session_state["llm_audio_bytes"] = None 16 | if "audio_gen_evals_df" not in st.session_state: 17 | st.session_state["audio_gen_evals_df"] = pd.DataFrame() 18 | if "user_image_url" not in st.session_state: 19 | st.session_state["user_image_url"] = None 20 | if "llm_image_url" not in st.session_state: 21 | st.session_state["llm_image_url"] = None 22 | if "image_gen_evals_df" not in st.session_state: 23 | st.session_state["image_gen_evals_df"] = pd.DataFrame() 24 | if 'negative_prompt' not in st.session_state: 25 | st.session_state['negative_prompt'] = DEFAULT_IMAGE_GEN_NEGATIVE_PROMPT 26 | if "user_video_url" not in st.session_state: 27 | st.session_state["user_video_url"] = None 28 | if "llm_video_url" not in st.session_state: 29 | st.session_state["llm_video_url"] = None 30 | if "video_gen_evals_df" not in st.session_state: 31 | st.session_state["video_gen_evals_df"] = pd.DataFrame() 32 | if "transcription_evals_df" not in st.session_state: 33 | st.session_state["transcription_evals_df"] = pd.DataFrame() 34 | if "tasks" not in st.session_state: 35 | st.session_state['tasks'] = [] 36 | st.session_state["running_text_job"] = False 37 | st.session_state["running_audio_job"] = False 38 | st.session_state["running_image_job"] = False 39 | st.session_state["running_video_job"] = False 40 | 41 | def show_quick_reset_option(col_handler): 42 | if col_handler.button("Reset session data"): 43 | st.session_state["text"] = None 44 | st.session_state["text_gen_stream_resp"] = None 45 | st.session_state["user_audio_bytes"] = None 46 | st.session_state["llm_audio_bytes"] = None 47 | st.session_state["user_image_url"] = None 48 | st.session_state["llm_image_url"] = None 49 | st.session_state['negative_prompt'] = DEFAULT_IMAGE_GEN_NEGATIVE_PROMPT 50 | st.session_state["user_video_url"] = None 51 | st.session_state["llm_video_url"] = None 52 | st.session_state["running_text_job"] = False 53 | st.session_state["running_audio_job"] = False 54 | st.session_state["running_image_job"] = False 55 | 
st.session_state["running_video_job"] = False 56 | st.rerun() 57 | 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | -------------------------------------------------------------------------------- /multimodal-app/main.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import os 3 | import asyncio 4 | import streamlit as st 5 | 6 | from constants import * 7 | from ui import ( 8 | set_users_initial_prompt, 9 | display_user_info, 10 | display_llm_info, 11 | display_audio_section, 12 | display_image_section, 13 | display_video_section, 14 | ) 15 | from api import ( 16 | generate_text, 17 | text_to_audio, 18 | text_to_image, 19 | text_to_video 20 | ) 21 | from utils import init_session_state, show_quick_reset_option 22 | 23 | st.set_page_config(page_title="Audio Chat", page_icon="🎤", layout="wide") 24 | 25 | async def background_tasks(placeholder): 26 | while True: 27 | _n_complete = sum([t.done() for t in st.session_state.tasks]) 28 | with placeholder: 29 | st.write(f'Completed `{_n_complete}` of `{len(st.session_state.tasks)}` generations. 
🚨 Starting other tasks will erase this queue 🚨') 30 | if _n_complete != len(st.session_state.tasks): 31 | await asyncio.sleep(1.2) 32 | else: 33 | st.rerun() 34 | break 35 | 36 | async def main(): 37 | 38 | if 'text' not in st.session_state: 39 | init_session_state() 40 | 41 | left_column, right_column = st.columns(2) 42 | left_column.title("User input") 43 | right_column.title("AI generations") 44 | placeholder = right_column.empty() 45 | 46 | if st.session_state["text"] is None: 47 | left_column.subheader("Text generation") 48 | openai_model_ls = ["openai:gpt-3.5-turbo", "openai:gpt-4o-mini", "openai:gpt-4o"] 49 | groq_model_ls = ["groq:llama-3.1-70b-versatile", "groq:mixtral-8x7b-32768", "groq:gemma2-9b-it"] 50 | model_choice = left_column.selectbox("Initial text reply model", groq_model_ls + openai_model_ls) 51 | model_choice_ls = model_choice.split(':') 52 | st.session_state.init_model_provider, st.session_state.init_model = model_choice_ls[0], model_choice_ls[1] 53 | st.session_state.text_gen_sys_prompt = left_column.text_area("System prompt", DEFAULT_TEXT_GEN_SYSTEM_PROMPT) 54 | set_users_initial_prompt(left_column) 55 | else: 56 | show_quick_reset_option(left_column) 57 | left_column.subheader("Pick your poison") 58 | st.session_state.image_model = left_column.selectbox("Image model", REPLICATE_IMAGE_MODEL_ID_LS) 59 | st.session_state.video_model = left_column.selectbox("Video model", REPLICATE_VIDEO_MODEL_ID_LS) 60 | 61 | _generate_text = partial(generate_text, st.session_state["text"], st.session_state.init_model) 62 | 63 | if st.session_state["text"]: 64 | display_user_info(right_column) 65 | display_llm_info(right_column, _generate_text) 66 | if left_column.button('Run all tasks concurrently'): 67 | st.session_state.tasks = [ 68 | asyncio.create_task(text_to_image(st.session_state["text"], st.session_state['negative_prompt'], src="user")), 69 | asyncio.create_task(text_to_image(st.session_state["text_gen_stream_resp"],st.session_state['negative_prompt'], src="llm")), 70 | asyncio.create_task(text_to_video(st.session_state["text"], src="user")), 71 | asyncio.create_task(text_to_video(st.session_state["text_gen_stream_resp"], src="llm")), 72 | asyncio.create_task(text_to_audio(st.session_state["text"], "user")), 73 | asyncio.create_task(text_to_audio(st.session_state["text_gen_stream_resp"], "llm")), 74 | ] 75 | await background_tasks(placeholder) 76 | st.rerun() 77 | else: 78 | audio_task = display_audio_section(left_column) 79 | image_task = display_image_section(left_column) 80 | video_task = display_video_section(left_column) 81 | st.session_state.tasks = list(filter(lambda x: x is not None, [audio_task, image_task, video_task])) 82 | if st.session_state.tasks: 83 | await background_tasks(placeholder) 84 | st.rerun() 85 | 86 | asyncio.run(main()) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Building Your First Multimodal Gen AI App 🚀 2 | 3 | ## Introduction 4 | 5 | Welcome to the tutorial on building your first multimodal generative AI (Gen AI) app! This repository contains all the resources and code you need to get started with creating an app that can generate text, audio, images, and videos using various AI models and APIs. 
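If you want a feel for what this kind of composition looks like before setting anything up, the sketch below chains an LLM reply straight into a text-to-audio model. It mirrors the calls made in `multimodal-app/api.py` and the workshop notebook, and assumes `GROQ_API_KEY` and `REPLICATE_API_TOKEN` are exported as environment variables:

```python
# Sketch: pipe an LLM reply into a text-to-audio model.
# Assumes GROQ_API_KEY and REPLICATE_API_TOKEN are set in the environment.
import replicate
from groq import Groq

song = Groq().chat.completions.create(
    model="llama3-8b-8192",
    messages=[{"role": "user", "content": "a short pirate sea shanty"}],
).choices[0].message.content

audio_url = replicate.run(
    "x-lance/f5-tts:87faf6dd7a692dd82043f662e76369cab126a2cf1937e25a9d41e0b834fd230e",
    input={
        "gen_text": song,
        "ref_text": "never underestimate the power of the scout's code",
        "ref_audio": "https://replicate.delivery/pbxt/LnHEJTVWhjLcpGQJTBralyztLwl8diaLyHjP2a1KXJ8dxVWv/Teemo_Original_Taunt.ogg",
        "remove_silence": True,
        "custom_split_words": "",
    },
)
print(audio_url)  # URL of the generated audio file on Replicate
```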
6 | 
7 | ## Prerequisites
8 | 
9 | Before you begin, make sure you have the following:
10 | 
11 | - A GitHub account
12 | - GitHub Codespaces enabled (comes with your GitHub account)
13 | - API keys for the following:
14 |   - [Groq](https://groq.com/) or [OpenAI](https://platform.openai.com/playground) (at least one is required; Groq has a free tier for all the models we need!)
15 |   - [Replicate](https://replicate.com/) (necessary for full functionality; Replicate has kindly provided credits for those taking this workshop at a conference.)
16 | - Basic knowledge of Python and Bash
17 | 
18 | **Note about GitHub Codespaces:**
19 | - GitHub Codespaces is included with every GitHub account.
20 | - There's a substantial monthly free tier for personal accounts (120 core hours/month as of 2024).
21 | - If you exceed the free tier, you may need to purchase additional usage.
22 | - For the latest information on GitHub Codespaces pricing and usage limits, please check the [official GitHub documentation](https://docs.github.com/en/billing/managing-billing-for-github-codespaces/about-billing-for-github-codespaces).
23 | 
24 | **Note for Workshop Participants:** If you are taking this workshop at a conference or other event, please check with your instructors or teachers to see if they are providing the API keys for you.
25 | 
26 | ## Setting Up the Environment
27 | 
28 | To get up and running, you can watch the video below and/or follow the instructions (the setup instructions start at around 1:50, after a motivating demo):
29 | 
30 | 
31 | 
32 | https://github.com/user-attachments/assets/342b5ee3-c067-4407-9d51-96fab6488863
33 | 
34 | 
35 | 
36 | 
37 | ### Creating a GitHub Codespace
38 | 
39 | 1. Open the repository in GitHub.
40 | 2. Click on the `Code` button and select `Create codespace on main`.
41 | 3. Wait for the Codespace to spin up (this should take about 2 minutes).
42 | 
43 | ### Adding API Keys
44 | 
45 | 1. In the Codespace, navigate to the `.streamlit` directory inside the `multimodal-app` folder.
46 | 2. Open the `secrets.toml` file.
47 | 3. Add your API keys as follows:
48 | ```toml
49 | OPENAI_API_KEY = "your_openai_api_key"
50 | GROQ_API_KEY = "your_groq_api_key"
51 | REPLICATE_API_TOKEN = "your_replicate_api_token"
52 | ```
53 | 4. Save the file and ensure these keys are kept private and secure.
54 | 
55 | ### API Keys
56 | 
57 | You'll need an API key for either OpenAI or Groq, plus a Replicate API token for full functionality.
58 | 
59 | ### Setting Up the Poetry Environment
60 | 
61 | 1. Once the Codespace finishes configuring, it will automatically install Poetry.
62 | 2. In the Codespace terminal, activate the Poetry environment:
63 | ```bash
64 | cd multimodal-app
65 | poetry shell
66 | ```
67 | 
68 | ## Running the Application
69 | 
70 | To run the Streamlit app:
71 | 
72 | 1. Ensure you're in the `multimodal-app` directory and have activated the Poetry shell.
73 | 2. Run the following command:
74 | ```bash
75 | streamlit run main.py
76 | ```
77 | 3. Click "Open in browser" when prompted to view the app.
78 | 
79 | ## Using the Application
80 | 
81 | The multimodal Gen AI app allows you to:
82 | 
83 | 1. Record speech or type text input.
84 | 2. Transcribe speech to text.
85 | 3. Generate text responses based on your input.
86 | 4. Create audio versions of the text.
87 | 5. Generate images based on the content.
88 | 6. Create videos incorporating the generated content.
89 | 
90 | To use the app:
91 | 
92 | 1. Click the record button to speak, or type your input.
93 | 2. Click "Transcribe" to convert speech to text (if applicable).
94 | 3. Choose to run all tasks concurrently or step-by-step.
95 | 4. Explore the generated text, audio, images, and videos.
96 | 
97 | ## Troubleshooting
98 | 
99 | If you encounter any issues:
100 | 
101 | - Ensure all API keys are correctly entered in the `secrets.toml` file.
102 | - Check that you're in the correct directory (`multimodal-app`) when running commands.
103 | - Verify that all dependencies are installed by running `poetry install` if needed.
104 | 
105 | ## Contributing
106 | 
107 | We welcome contributions to improve this project! Please feel free to submit issues or pull requests.
108 | 
109 | ---
110 | 
111 | Happy building! We hope you enjoy creating your first multimodal Gen AI app. If you have any questions or feedback, please don't hesitate to reach out.
112 | 
--------------------------------------------------------------------------------
/multimodal-app/ui.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import streamlit as st
3 | from audio_recorder_streamlit import audio_recorder
4 | from api import transcribe_audio, text_to_audio, text_to_image, text_to_video
5 | from constants import *
6 | 
7 | def set_users_initial_prompt(col_handler):
8 |     text = col_handler.chat_input("Say something")
9 |     st.session_state["text"] = text
10 |     st.session_state.user_init_audio_bytes = audio_recorder(
11 |         recording_color="#e8b62c",
12 |         neutral_color="#6aa36f",
13 |         icon_name="microphone",
14 |         icon_size="2x",
15 |     )
16 |     if "user_init_audio_bytes" in st.session_state and st.session_state.user_init_audio_bytes is not None:
17 |         col_handler.audio(data=st.session_state["user_init_audio_bytes"], format="audio/wav")
18 |         if col_handler.button("Transcribe"):
19 |             transcription = transcribe_audio(st.session_state["user_init_audio_bytes"])
20 |             st.session_state["text"] = transcription.text
21 | 
22 | def display_user_info(col_handler):
23 |     with col_handler.chat_message("user", avatar="👤"):
24 |         col_handler.write(st.session_state["text"])
25 |         if st.session_state["user_audio_bytes"] is not None:
26 |             col_handler.audio(st.session_state["user_audio_bytes"])
27 |         if st.session_state["user_image_url"] is not None:
28 |             col_handler.image(st.session_state["user_image_url"])
29 |         if st.session_state["user_video_url"] is not None:
30 |             col_handler.video(st.session_state["user_video_url"])
31 | 
32 | def display_llm_info(col_handler, f):
33 |     with col_handler.chat_message("ai", avatar="🤖"):
34 |         if st.session_state["text_gen_stream_resp"] is None:
35 |             st.session_state["text_gen_stream_resp"] = col_handler.write_stream(
36 |                 f
37 |             )
38 |             st.rerun()
39 |         else:
40 |             col_handler.write(st.session_state["text_gen_stream_resp"])
41 |         if st.session_state["llm_audio_bytes"] is not None:
42 |             col_handler.audio(st.session_state["llm_audio_bytes"])
43 |         if st.session_state["llm_image_url"] is not None:
44 |             col_handler.image(st.session_state["llm_image_url"])
45 |         if st.session_state["llm_video_url"] is not None:
46 |             col_handler.video(st.session_state["llm_video_url"])
47 | 
48 | def display_audio_section(col_handler):
49 |     col_handler.subheader("Text-to-audio generation")
50 |     col_handler.write("Click the buttons to generate audio using F5-TTS (via Replicate)")
51 |     if not (st.session_state['running_audio_job'] or st.session_state['running_image_job'] or st.session_state['running_video_job']):
52 |         if col_handler.button("Generate audio for user prompt"):
53 |             return asyncio.create_task(text_to_audio(st.session_state["text"], "user"))
54 |         if
col_handler.button("Generate audio for AI prompt"):
55 |             return asyncio.create_task(text_to_audio(st.session_state["text_gen_stream_resp"], "llm"))
56 |     else:
57 |         col_handler.write('Please wait for current job to complete.')
58 |     return None
59 | 
60 | def display_image_section(col_handler):
61 |     col_handler.subheader("Text-to-image generation")
62 |     col_handler.write(
63 |         "Click the buttons to generate an image using {}".format(st.session_state.image_model)
64 |     )
65 |     negative_prompt = col_handler.text_area("Negative prompt", st.session_state['negative_prompt'])
66 |     st.session_state['negative_prompt'] = negative_prompt
67 |     if not (st.session_state['running_audio_job'] or st.session_state['running_image_job'] or st.session_state['running_video_job']):
68 |         if col_handler.button("Generate image for user prompt"):
69 |             return asyncio.create_task(text_to_image(st.session_state["text"], st.session_state['negative_prompt'], src="user"))
70 |         if col_handler.button("Generate image for AI prompt"):
71 |             return asyncio.create_task(text_to_image(st.session_state["text_gen_stream_resp"], st.session_state['negative_prompt'], src="llm"))
72 |     else:
73 |         col_handler.write('Please wait for current job to complete.')
74 |     return None
75 | 
76 | def display_video_section(col_handler):
77 |     col_handler.subheader("Text-to-video generation")
78 |     col_handler.write(
79 |         "Click the buttons to generate a video using {}".format(st.session_state.video_model)
80 |     )
81 |     if not (st.session_state['running_audio_job'] or st.session_state['running_image_job'] or st.session_state['running_video_job']):
82 |         if col_handler.button("Generate video for user prompt"):
83 |             return asyncio.create_task(text_to_video(st.session_state["text"], src="user"))
84 |         if col_handler.button("Generate video for AI prompt"):
85 |             return asyncio.create_task(text_to_video(st.session_state["text_gen_stream_resp"], src="llm"))
86 |     else:
87 |         col_handler.write('Please wait for current job to complete.')
88 |     return None
--------------------------------------------------------------------------------
/multimodal-app/api.py:
--------------------------------------------------------------------------------
1 | import io
2 | import time
3 | import uuid
4 | 
5 | import asyncio
6 | import aiohttp
7 | import replicate
8 | import pandas as pd
9 | import streamlit as st
10 | from openai import OpenAI
11 | from groq import Groq
12 | 
13 | from constants import *  # note: also brings `os` into scope (imported in constants.py)
14 | 
15 | def generate_text(text: str, model: str) -> str:
16 |     text_gen_response = ""
17 | 
18 |     if st.session_state.init_model_provider == "groq":
19 |         client = Groq()
20 |     elif st.session_state.init_model_provider == 'openai':
21 |         client = OpenAI()
22 | 
23 |     t0 = time.time()
24 |     completion = client.chat.completions.create(
25 |         model=model,
26 |         messages=[
27 |             {"role": "system", "content": st.session_state.text_gen_sys_prompt},
28 |             {"role": "user", "content": text},
29 |         ],
30 |         stream=True
31 |     )
32 |     for chunk in completion:
33 |         if chunk.usage is None and chunk.choices[0].delta.content is not None:
34 |             print(f"[DEBUG] Regular chunk: {chunk}")
35 |             text_gen_response += chunk.choices[0].delta.content
36 |             yield chunk.choices[0].delta.content
37 |         else:
38 |             print(f"[DEBUG] Final chunk: {chunk}")
39 |             data = {
40 |                 "prompt": text,
41 |                 "system_prompt": st.session_state.text_gen_sys_prompt,
42 |                 "response": text_gen_response,
43 |                 "model": model,
44 |                 "client_time": time.time() - t0,
45 |                 "date": pd.Timestamp.now()
46 |             }
47 |             df = pd.DataFrame(data, index=[0])
48 | 
st.session_state["text_gen_evals_df"] = pd.concat( 49 | [st.session_state["text_gen_evals_df"], df], ignore_index=True 50 | ) 51 | 52 | 53 | # Access the API key from secrets 54 | REPLICATE_API_KEY = st.secrets["REPLICATE_API_TOKEN"] 55 | client = replicate.Client(api_token=REPLICATE_API_KEY) 56 | 57 | 58 | async def text_to_audio(text: str, src: str) -> bytes: 59 | st.session_state["running_audio_job"] = True 60 | st.session_state[f"{src}_audio_bytes"] = None 61 | 62 | print(f"[DEBUG] Generating audio...") 63 | t0 = time.time() 64 | 65 | # Define the input parameters for the model 66 | input_params = { 67 | "prompt": text, 68 | "text_temp": 0.7, 69 | "output_full": False, 70 | "waveform_temp": 0.7, 71 | "history_prompt": "announcer", 72 | # "duration": 30 # Uncomment if you want to set a specific duration 73 | } 74 | 75 | try: 76 | # Run the model using Replicate API 77 | # Create a Replicate client instance with the API token 78 | 79 | 80 | output = client.run( 81 | "x-lance/f5-tts:87faf6dd7a692dd82043f662e76369cab126a2cf1937e25a9d41e0b834fd230e", 82 | input={ 83 | "gen_text": text, 84 | "ref_text": "never underestimate the power of the scout's code", 85 | "ref_audio": "https://replicate.delivery/pbxt/LnHEJTVWhjLcpGQJTBralyztLwl8diaLyHjP2a1KXJ8dxVWv/Teemo_Original_Taunt.ogg", 86 | "remove_silence": True, 87 | "custom_split_words": "" 88 | } 89 | ) 90 | 91 | tf = time.time() 92 | print(f"[DEBUG] text_to_audio request took {tf - t0:.2f} seconds") 93 | print(f"[DEBUG] Replicate API output: {output}") 94 | 95 | # Fetch the audio from the returned URL 96 | audio_url = output 97 | async with aiohttp.ClientSession() as session: 98 | async with session.get(audio_url) as audio_response: 99 | audio_bytes = await audio_response.read() 100 | 101 | out_dir = os.path.join(AUDIO_DATA_SINK, src) 102 | if not os.path.exists(out_dir): 103 | os.makedirs(out_dir) 104 | out_path = os.path.join(out_dir, f"{uuid.uuid4().hex}_audio.wav") 105 | 106 | with open(out_path, "wb") as f: 107 | f.write(audio_bytes) 108 | 109 | data = { 110 | "text": text, 111 | "date": pd.Timestamp.now(), 112 | "model": "suno-ai/bark", 113 | "provider": "Replicate", 114 | "client_time": tf - t0, 115 | } 116 | df = pd.DataFrame(data, index=[0]) 117 | st.session_state["audio_gen_evals_df"] = pd.concat( 118 | [st.session_state["audio_gen_evals_df"], df], ignore_index=True 119 | ) 120 | 121 | if src == 'user': 122 | st.session_state["user_audio_bytes"] = audio_bytes 123 | elif src == 'llm': 124 | st.session_state["llm_audio_bytes"] = audio_bytes 125 | 126 | st.session_state["running_audio_job"] = False 127 | 128 | except Exception as e: 129 | st.session_state["running_audio_job"] = False 130 | raise Exception(f"Request failed: {e}") 131 | 132 | async def text_to_image(text: str, negative_prompt: str, src: str = "human") -> str: 133 | st.session_state["running_image_job"] = True 134 | if st.session_state.image_model.startswith('stability-ai/stable-diffusion-3'): 135 | input = { 136 | "seed": 42, 137 | "prompt": text, 138 | "aspect_ratio": "3:2", 139 | "output_quality": 79, 140 | "negative_prompt": negative_prompt, 141 | } 142 | elif st.session_state.image_model.startswith('black-forest-labs/flux-dev'): 143 | input = { 144 | "prompt": text, 145 | "guidance": 3.5, 146 | "num_outputs": 1, 147 | "aspect_ratio": "1:1", 148 | "output_format": "webp", 149 | "output_quality": 80, 150 | "prompt_strength": 0.8 151 | } 152 | else: 153 | raise ValueError(f'Unsupported video model/version type {st.session_state.image_model}.') 154 | 155 | 
print(f"[DEBUG] Generating image...") 156 | t0 = time.time() 157 | loop = asyncio.get_event_loop() 158 | output = await loop.run_in_executor(None, replicate.run, st.session_state.image_model, input) 159 | tf = time.time() 160 | print(f"[DEBUG] text_to_image request took {tf - t0:.2f} seconds") 161 | 162 | if output and isinstance(output, list) and len(output) > 0: 163 | image_url = output[0] 164 | data = { 165 | "text": text, 166 | "negative_prompt": negative_prompt, 167 | "image_url": image_url, 168 | "date": pd.Timestamp.now(), 169 | "model": st.session_state.image_model, 170 | "provider": "Replicate", 171 | "client_time": tf - t0 172 | } 173 | df = pd.DataFrame(data, index=[0]) 174 | st.session_state["image_gen_evals_df"] = pd.concat( 175 | [st.session_state["image_gen_evals_df"], df], ignore_index=True 176 | ) 177 | if src == 'user': 178 | st.session_state["user_image_url"] = image_url 179 | elif src == 'llm': 180 | st.session_state["llm_image_url"] = image_url 181 | st.session_state["running_image_job"] = False 182 | else: 183 | st.session_state["running_image_job"] = False 184 | raise Exception("Text-to-image model did not return a valid URL.") 185 | 186 | async def text_to_video(text: str, src: str = "user") -> str: 187 | st.session_state.running_video_job = True 188 | if st.session_state.video_model.startswith('lucataco/hotshot-xl'): 189 | input = { 190 | "prompt": text, 191 | "mp4": True 192 | } 193 | elif st.session_state.video_model.startswith('deforum/deforum_stable_diffusion'): 194 | input = { 195 | "animation_prompts": text, 196 | "sampler": "klms", 197 | "max_frames": 100, 198 | } 199 | else: 200 | raise ValueError(f'Unsupported video model/version type {st.session_state.video_model}.') 201 | t0 = time.time() 202 | print("[DEBUG] Generating video...") 203 | loop = asyncio.get_event_loop() 204 | output = await loop.run_in_executor(None, replicate.run, st.session_state.video_model, input) 205 | tf = time.time() 206 | print("[DEBUG] Video generation complete. 
%s" % output) 207 | if output and isinstance(output, list) and len(output) > 0: 208 | video_url = output[0] 209 | elif output and isinstance(output, str): 210 | video_url = output 211 | data = { 212 | "text": text, 213 | "video_url": video_url, 214 | "date": pd.Timestamp.now(), 215 | "model": st.session_state.video_model, 216 | "provider": "Replicate", 217 | "client_time": tf - t0, 218 | } 219 | df = pd.DataFrame(data, index=[0]) 220 | st.session_state["video_gen_evals_df"] = pd.concat( 221 | [st.session_state["video_gen_evals_df"], df], ignore_index=True 222 | ) 223 | print("[DEBUG] Video URL:", video_url) 224 | if src == 'user': 225 | st.session_state["user_video_url"] = video_url 226 | elif src == 'llm': 227 | st.session_state["llm_video_url"] = video_url 228 | st.session_state["running_video_job"] = False 229 | 230 | def transcribe_audio(audio_data): 231 | try: 232 | if st.session_state.init_model_provider == "groq": 233 | client = Groq() 234 | model_wh = "whisper-large-v3" 235 | print("[DEBUG] Transcribing audio with groq/whisper-1...") 236 | elif st.session_state.init_model_provider == 'openai': 237 | client = OpenAI() 238 | model_wh = "whisper-1" 239 | print("[DEBUG] Transcribing audio with openai/whisper-1...") 240 | file_like = io.BytesIO(audio_data) 241 | file_like.name = "audio.wav" 242 | file_like.seek(0) 243 | 244 | t0 = time.time() 245 | transcription = client.audio.transcriptions.create( 246 | model=model_wh, 247 | file=file_like, 248 | response_format="verbose_json" 249 | ) 250 | tf = time.time() 251 | print(f"[DEBUG] Transcription took {tf - t0:.2f} seconds") 252 | print("Transcription", transcription) 253 | data = { 254 | "transcription": transcription.text, 255 | "duration": transcription.duration, 256 | "date": pd.Timestamp.now(), 257 | "model": "openai/whisper-1", 258 | "provider": "OpenAI", 259 | "client_time": tf - t0, 260 | "language": transcription.language, 261 | } 262 | df = pd.DataFrame(data, index=[0]) 263 | st.session_state["transcription_evals_df"] = pd.concat( 264 | [st.session_state["transcription_evals_df"], df], ignore_index=True 265 | ) 266 | 267 | print("[DEBUG] Transcription type:", type(transcription)) 268 | print("[DEBUG] Transcription:", transcription) 269 | return transcription 270 | 271 | except Exception as e: 272 | print(f"[ERROR] An error occurred during transcription: {e}") 273 | return None -------------------------------------------------------------------------------- /notebooks/models-everywhere.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Building Your First Multimodal GenAI App" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Welcome to the Workshop on Building Composable AI Systems 🚀\n", 15 | "\n", 16 | "In this workshop, we're diving into how to create powerful and flexible AI applications using a code-first approach. The world of AI is evolving rapidly, and we now have access to a multitude of models for generating text, images, audio, and more. 
But to really leverage the power of these models, we need to think about **composability**: how we can connect and automate AI capabilities in ways that fit our particular needs.\n",
17 |     "\n",
18 |     "### Why Use Code Interfaces Over Pre-Built Tools?\n",
19 |     "\n",
20 |     "You might wonder, \"Why not just use simple, web-based tools for these tasks?\" The answer lies in the power of customization, automation, and scalability. Let's explore some real-world scenarios:\n",
21 |     "\n",
22 |     "---\n",
23 |     "\n",
24 |     "### Scenario 1: Rapid Image Generation for a Marketing Team 🎨\n",
25 |     "Imagine you're part of a marketing team launching a new product. You need to generate dozens of image concepts quickly to test different styles and messages. Using a code-based approach, you can:\n",
26 |     "\n",
27 |     "- **Prompt** a language model to come up with creative image descriptions.\n",
28 |     "- **Generate** images from those descriptions using an image model.\n",
29 |     "- **Filter** and select the best images based on specific criteria.\n",
30 |     "- **Send** the top images to your team’s Slack or email for feedback.\n",
31 |     "\n",
32 |     "This automated workflow saves hours of manual work and ensures your team has more creative options to consider.\n",
33 |     "\n",
34 |     "\n",
35 |     "\n",
36 |     "### Scenario 2: Creating Background Music for Content Creators 🎶\n",
37 |     "You're a content creator working on a podcast or video series. Each episode needs custom background music that matches the theme and mood. With this approach, you can:\n",
38 |     "\n",
39 |     "- **Generate** multiple music samples using a generative audio model.\n",
40 |     "- **Adjust** the style, tempo, or instrumentation on the fly.\n",
41 |     "- **Save** and organize the best samples for your project.\n",
42 |     "\n",
43 |     "This allows you to experiment freely and find the perfect sound without having to rely on pre-made music libraries.\n",
44 |     "\n",
45 |     "\n",
46 |     "\n",
47 |     "### Scenario 3: Interactive Storytelling or Live Events 🎭\n",
48 |     "Imagine hosting a live event where you want to engage the audience with real-time storytelling. You can build an app that:\n",
49 |     "\n",
50 |     "- **Listens** to audience suggestions through voice input.\n",
51 |     "- **Generates** narrative twists or visual art based on those suggestions.\n",
52 |     "- **Plays** custom soundscapes to match the unfolding story.\n",
53 |     "\n",
54 |     "\n",
55 |     "\n",
56 |     "### Scenario 4: Team Collaboration & Idea Generation 💡\n",
57 |     "Consider a product team brainstorming new features for an app. Instead of manually jotting down ideas and creating prototypes, you can build a pipeline that:\n",
58 |     "\n",
59 |     "- **Generates** feature ideas using an LLM.\n",
60 |     "- **Creates** visual prototypes from those ideas using an image model.\n",
61 |     "- **Filters** and prioritizes the best concepts automatically.\n",
62 |     "\n",
63 |     "This speeds up the ideation process and allows your team to explore more possibilities in less time.\n",
64 |     "\n",
65 |     "### Workshop Overview 🛠️\n",
66 |     "\n",
67 |     "In this workshop, you will learn how to:\n",
68 |     "- **Compose** AI models to build custom applications.\n",
69 |     "- **Automate** workflows using AI-generated content.\n",
70 |     "- **Experiment** with different models for text, audio, and images.\n",
71 |     "- **Deploy** your own composable AI systems for real-world use cases.\n",
72 |     "\n",
73 |     "We’ll start with a high-level look at composability, inspired by the Unix philosophy: simple, modular components that can be easily combined.
Then, we'll get hands-on, building systems that turn your ideas into reality with minimal effort.\n",
74 |     "\n",
75 |     "Let's get started! 🚀"
76 |    ]
77 |   },
78 |   {
79 |    "cell_type": "markdown",
80 |    "metadata": {},
81 |    "source": [
82 |     "First note that these types of multimodal approaches are already available in apps, such as ChatGPT:"
83 |    ]
84 |   },
85 |   {
86 |    "cell_type": "code",
87 |    "execution_count": 51,
88 |    "metadata": {},
89 |    "outputs": [
90 |     {
91 |      "data": {
92 |       "text/html": [
93 |        "<video src=\"vid/Dream Advertising Marketing Campaign.mp4\" controls>\n",
94 |        "      Your browser does not support the <code>video</code> element.\n",
95 |        "    </video>"
96 |       ],
97 |       "text/plain": [
98 |        "<IPython.core.display.Video object>"
99 |       ]
100 |      },
101 |      "execution_count": 51,
102 |      "metadata": {},
103 |      "output_type": "execute_result"
104 |     }
105 |    ],
106 |    "source": [
107 |     "from IPython.display import Video\n",
108 |     "\n",
109 |     "Video(\"vid/Dream Advertising Marketing Campaign.mp4\")\n"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "markdown",
114 |    "metadata": {},
115 |    "source": [
116 |     "But this approach ties you to particular models:"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "markdown",
121 |    "metadata": {},
122 |    "source": [
123 |     "![Alt text](img/multimodal_app_1.png)"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "markdown",
128 |    "metadata": {},
129 |    "source": [
130 |     "In this notebook, we'll explore a variety of SOTA GenAI models and get a sense of how to stitch them together!"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "markdown",
135 |    "metadata": {},
136 |    "source": [
137 |     "![Alt text](img/composability.png)"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "markdown",
142 |    "metadata": {},
143 |    "source": [
144 |     "![Alt text](img/karpathy-austen.png)"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": 52,
150 |    "metadata": {},
151 |    "outputs": [
152 |     {
153 |      "data": {
154 |       "text/html": [
155 |        "<video src=\"vid/karpathy-austen.mp4\" controls>\n",
156 |        "      Your browser does not support the <code>video</code> element.\n",
157 |        "    </video>"
158 |       ],
159 |       "text/plain": [
160 |        "<IPython.core.display.Video object>"
161 |       ]
162 |      },
163 |      "execution_count": 52,
164 |      "metadata": {},
165 |      "output_type": "execute_result"
166 |     }
167 |    ],
168 |    "source": [
169 |     "from IPython.display import Video\n",
170 |     "\n",
171 |     "Video(\"vid/karpathy-austen.mp4\")\n"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "markdown",
176 |    "metadata": {},
177 |    "source": [
178 |     "![Alt text](img/unix-phil.png)"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "markdown",
183 |    "metadata": {},
184 |    "source": [
185 |     "![Alt text](img/apple-counterpoint.png)"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "markdown",
190 |    "metadata": {},
191 |    "source": [
192 |     "![Alt text](img/ai-evolution.png)"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "metadata": {},
198 |    "source": [
199 |     "![Alt text](img/importance-comp.png)"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "markdown",
204 |    "metadata": {},
205 |    "source": [
206 |     "![Alt text](img/comp-nec.png)"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "markdown",
211 |    "metadata": {},
212 |    "source": [
213 |     "## Get our API Keys in our environment"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "markdown",
218 |    "metadata": {},
219 |    "source": [
220 |     "- Create a [Groq](https://groq.com/) account and navigate [here to get your API key](https://console.groq.com/keys). They have a free tier with a bunch of LLMs (see screenshot below)!\n",
221 |     "- If you'd prefer to use OpenAI, you can do that and get [your API key here](https://platform.openai.com/api-keys).\n",
222 |     "- To use the models below as is, you'll need a [Replicate account](https://replicate.com/).
If you're using this notebook in a workshop, chances are Hugo is able to provision free Replicate credits for you, so ask him if he hasn't mentioned it.\n",
223 |     "- Many of these models [you can also find on HuggingFace](https://huggingface.co/models), if you'd prefer."
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "markdown",
228 |    "metadata": {},
229 |    "source": [
230 |     "![Alt text](img/multimodal_app_2.png)"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": 53,
236 |    "metadata": {},
237 |    "outputs": [
238 |     {
239 |      "name": "stdout",
240 |      "output_type": "stream",
241 |      "text": [
242 |       "Replicate API key captured successfully!\n",
243 |       "Groq API key captured successfully!\n"
244 |      ]
245 |     }
246 |    ],
247 |    "source": [
248 |     "import getpass\n",
249 |     "\n",
250 |     "\n",
251 |     "# Prompt for the Replicate API key\n",
252 |     "replicate_api_key = getpass.getpass(\"Please enter your Replicate API key: \")\n",
253 |     "print(\"Replicate API key captured successfully!\")\n",
254 |     "\n",
255 |     "# Prompt for the Groq API key\n",
256 |     "groq_api_key = getpass.getpass(\"Please enter your Groq API key: \")\n",
257 |     "print(\"Groq API key captured successfully!\")\n",
258 |     "\n",
259 |     "# # Prompt for the OpenAI API key\n",
260 |     "# openai_api_key = getpass.getpass(\"Please enter your OpenAI API key: \")\n",
261 |     "# print(\"OpenAI API key captured successfully!\")\n"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "markdown",
266 |    "metadata": {},
267 |    "source": [
268 |     "## F5-TTS: text to audio"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "markdown",
273 |    "metadata": {},
274 |    "source": [
275 |     "First up, we'll experiment with text-to-audio generation using the [F5-TTS](https://replicate.com/x-lance/f5-tts) voice-cloning model ([Suno Bark](https://github.com/suno-ai/bark) is a popular alternative):"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "code",
280 |    "execution_count": 54,
281 |    "metadata": {},
282 |    "outputs": [
283 |     {
284 |      "name": "stdout",
285 |      "output_type": "stream",
286 |      "text": [
287 |       "https://replicate.delivery/yhqm/wybNlPef3IvDRE4hiChxvdVS0n7snQZo55gs4bfdoXjyg6bnA/output.wav\n"
288 |      ]
289 |     }
290 |    ],
291 |    "source": [
292 |     "import replicate\n",
293 |     "\n",
294 |     "# Create a Replicate client instance with the API token\n",
295 |     "client = replicate.Client(api_token=replicate_api_key)\n",
296 |     "\n",
297 |     "output = client.run(\n",
298 |     "    \"x-lance/f5-tts:87faf6dd7a692dd82043f662e76369cab126a2cf1937e25a9d41e0b834fd230e\",\n",
299 |     "    input={\n",
300 |     "        \"gen_text\": \"captain hugo, on duty!\",\n",
301 |     "        \"ref_text\": \"never underestimate the power of the scout's code\",\n",
302 |     "        \"ref_audio\": \"https://replicate.delivery/pbxt/LnHEJTVWhjLcpGQJTBralyztLwl8diaLyHjP2a1KXJ8dxVWv/Teemo_Original_Taunt.ogg\",\n",
303 |     "        \"remove_silence\": True,\n",
304 |     "        \"custom_split_words\": \"\"\n",
305 |     "    }\n",
306 |     ")\n",
307 |     "print(output)"
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "markdown",
312 |    "metadata": {},
313 |    "source": [
314 |     "### LLM output --> F5-TTS"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "markdown",
319 |    "metadata": {},
320 |    "source": [
321 |     "But what if we want to pipe the output of an LLM into the text-to-audio model?"
322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 55, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "from groq import Groq\n", 331 | "\n", 332 | "def get_llm_response(user_input):\n", 333 | " client = Groq(\n", 334 | " api_key=groq_api_key)\n", 335 | "\n", 336 | " response = client.chat.completions.create(\n", 337 | " messages=[\n", 338 | " {\n", 339 | " \"role\": \"user\",\n", 340 | " \"content\": user_input,\n", 341 | " }\n", 342 | " ],\n", 343 | " model=\"llama3-8b-8192\",\n", 344 | " )\n", 345 | "\n", 346 | " return response.choices[0].message.content\n", 347 | "\n", 348 | "# from openai import OpenAI\n", 349 | "# import os\n", 350 | "\n", 351 | "\n", 352 | "# def get_llm_response(user_input):\n", 353 | "# client = OpenAI(api_key=openai_api_key)\n", 354 | " \n", 355 | "# response = client.chat.completions.create(\n", 356 | "# model=\"gpt-3.5-turbo-0613\",\n", 357 | "# messages=[\n", 358 | "# {\"role\": \"user\", \"content\": user_input}\n", 359 | "# ]\n", 360 | "# )\n", 361 | "# return response.choices[0].message.content" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 56, 367 | "metadata": {}, 368 | "outputs": [ 369 | { 370 | "name": "stdout", 371 | "output_type": "stream", 372 | "text": [ 373 | "Here is a short pirate sea shanty:\n", 374 | "\n", 375 | "(sung to the tune of \"What Shall We Do with a Drunken Sailor\")\n", 376 | "\n", 377 | "Oh, the captain stands on the quarterdeck high\n", 378 | "With his trusty compass and an eye on the sky\n", 379 | "He's searchin' for treasure, plunder and gold\n", 380 | "And a crew to sing along as we sail to old\n", 381 | "\n", 382 | "We'll hoist the Jolly Roger, let it wave in the breeze\n", 383 | "And sing and shout and drink our grog with ease\n", 384 | "We'll dance on the deck and play our sea shanty tune\n", 385 | "For we're pirates bold, on the ocean we've been known\n", 386 | "\n", 387 | "Oh, we'll ride the waves and sing as we go\n", 388 | "On our journey to the Caribbean from the Bosphorus, you know\n", 389 | "We'll drink and we'll jest and we'll tell our yarns\n", 390 | "For we're pirates, and the sea is our home, and we swear no barns!\n", 391 | "\n", 392 | "Yarrr!\n" 393 | ] 394 | } 395 | ], 396 | "source": [ 397 | "song = get_llm_response(\"a short pirates sea shanty\")\n", 398 | "print(song)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 57, 404 | "metadata": {}, 405 | "outputs": [ 406 | { 407 | "name": "stdout", 408 | "output_type": "stream", 409 | "text": [ 410 | "https://replicate.delivery/yhqm/vx3JskrK62KeDaDSbD3fgCuv04yMDS4XNq0j8fcvA3Slh6bnA/output.wav\n" 411 | ] 412 | } 413 | ], 414 | "source": [ 415 | "output = client.run(\n", 416 | " \"x-lance/f5-tts:87faf6dd7a692dd82043f662e76369cab126a2cf1937e25a9d41e0b834fd230e\",\n", 417 | " input={\n", 418 | " \"gen_text\": song,\n", 419 | " \"ref_text\": \"never underestimate the power of the scout's code\",\n", 420 | " \"ref_audio\": \"https://replicate.delivery/pbxt/LnHEJTVWhjLcpGQJTBralyztLwl8diaLyHjP2a1KXJ8dxVWv/Teemo_Original_Taunt.ogg\",\n", 421 | " \"remove_silence\": True,\n", 422 | " \"custom_split_words\": \"\"\n", 423 | " }\n", 424 | ")\n", 425 | "print(output)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "## Text to music w/ meta musicgen" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": {}, 438 | "source": [ 439 | "What if we wanted to create some music with text? 
Let's try Musicgen from Meta." 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 58, 445 | "metadata": {}, 446 | "outputs": [ 447 | { 448 | "name": "stdout", 449 | "output_type": "stream", 450 | "text": [ 451 | "https://replicate.delivery/yhqm/MBBfVjxfEgjgc0yXBDICx5xDQIZaD8jI2OHfS5qG0u2zi6bnA/out.mp3\n" 452 | ] 453 | } 454 | ], 455 | "source": [ 456 | "input = {\n", 457 | " \"prompt\": \"Horns and Drums. Edo25 major g melodies that sound triumphant and cinematic. Leading up to a crescendo that resolves in a 9th harmonic\",\n", 458 | " \"model_version\": \"stereo-large\",\n", 459 | " \"output_format\": \"mp3\",\n", 460 | " \"normalization_strategy\": \"peak\"\n", 461 | "}\n", 462 | "\n", 463 | "output = client.run(\n", 464 | " \"meta/musicgen:671ac645ce5e552cc63a54a2bbff63fcf798043055d2dac5fc9e36a837eedcfb\",\n", 465 | " input=input\n", 466 | ")\n", 467 | "print(output)\n", 468 | "#=> \"https://replicate.delivery/pbxt/OeLYIQiltdzMaCex1shlEFy6..." 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 59, 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "https://replicate.delivery/yhqm/xvXTf1MqtMTXUCmJKeECFHhgjaYqp6svezbHf0bCfon9TqvdC/out.mp3\n" 481 | ] 482 | } 483 | ], 484 | "source": [ 485 | "input = {\n", 486 | " \"prompt\": \"Ancient Trip Hop with Throat Singing\",\n", 487 | " \"model_version\": \"stereo-large\",\n", 488 | " \"output_format\": \"mp3\",\n", 489 | " \"normalization_strategy\": \"peak\",\n", 490 | " \"duration\": 30 \n", 491 | "}\n", 492 | "\n", 493 | "output = client.run(\n", 494 | " \"meta/musicgen:671ac645ce5e552cc63a54a2bbff63fcf798043055d2dac5fc9e36a837eedcfb\",\n", 495 | " input=input\n", 496 | ")\n", 497 | "print(output)\n", 498 | "#=> \"https://replicate.delivery/pbxt/OeLYIQiltdzMaCex1shlEFy6..." 
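,
    "\n",
    "# To keep a local copy of the generated track (hypothetical file name):\n",
    "# import urllib.request; urllib.request.urlretrieve(output, \"ancient_trip_hop.mp3\")"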
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Text to music with Riffusion"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There are lots of other models to experiment with, such as Riffusion:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'audio': 'https://replicate.delivery/czjl/KJn0JSKssfyflUaYNfHveLeeVEXwYQJc1edZGntKNhfeXl6bnA/gen_sound.wav', 'spectrogram': 'https://replicate.delivery/czjl/ykuCE1ZPVRL3JtnwGf3MoBW5e461pAvhDfh1uWUDEt3Xl6bnA/spectrogram.jpg'}\n"
     ]
    }
   ],
   "source": [
    "# Riffusion interpolates between two prompts; `alpha` sets the blend\n",
    "output = client.run(\n",
    "    \"riffusion/riffusion:8cf61ea6c56afd61d8f5b9ffd14d7c216c0a93844ce2d82ac1c9ecc9c7f24e05\",\n",
    "    input={\n",
    "        \"alpha\": 0.5,\n",
    "        \"prompt_a\": \"West African Desert Blues\",\n",
    "        \"prompt_b\": \"Throat Singing\",\n",
    "        \"denoising\": 0.75,\n",
    "        \"seed_image_id\": \"vibes\",\n",
    "        \"num_inference_steps\": 50\n",
    "    }\n",
    ")\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "___"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Experiment: One prompt to many models"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now what if we wanted to use a single prompt to create text, audio, images, and video?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "message = \"The Waffle House is really messing up the pancakes and bacon tonight HOLY MOLEY and there's anarchist jazz also!\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### text to image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['https://replicate.delivery/pbxt/LfupBRuofqjyp08abH0BgfbeJIWdqI323XqVyc8WUmxiN13OB/R8__00001_.webp']\n"
     ]
    }
   ],
   "source": [
    "input = {\n",
    "    \"prompt\": message\n",
    "}\n",
    "\n",
    "output = client.run(\n",
    "    \"fofr/epicrealismxl-lightning-hades:0ca10b1fd361c1c5568720736411eaa89d9684415eb61fd36875b4d3c20f605a\",\n",
    "    input=input\n",
    ")\n",
    "print(output)"
   ]
  },
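  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The model returns a list of image URLs. A quick sketch to preview the first one inline (assuming `output` is the list printed above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import Image, display\n",
    "\n",
    "# Preview the generated image inline (sketch; assumes `output` is the list\n",
    "# of URLs printed above).\n",
    "display(Image(url=output[0]))"
   ]
  },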
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### text to audio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"The Waffle House is really messing up the pancakes and bacon tonight HOLY MOLEY and there's anarchist jazz also!\""
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "message"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://replicate.delivery/yhqm/MOCU8DxsSmqzHZzoVffRremzP4Fnabl75tHc4nkcuUS0o6bnA/output.wav\n"
     ]
    }
   ],
   "source": [
    "output = client.run(\n",
    "    \"x-lance/f5-tts:87faf6dd7a692dd82043f662e76369cab126a2cf1937e25a9d41e0b834fd230e\",\n",
    "    input={\n",
    "        \"gen_text\": message,\n",
    "        \"ref_text\": \"never underestimate the power of the scout's code\",\n",
    "        \"ref_audio\": \"https://replicate.delivery/pbxt/LnHEJTVWhjLcpGQJTBralyztLwl8diaLyHjP2a1KXJ8dxVWv/Teemo_Original_Taunt.ogg\",\n",
    "        \"remove_silence\": True,\n",
    "        \"custom_split_words\": \"\"\n",
    "    }\n",
    ")\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### text to music"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://replicate.delivery/yhqm/fKPRqJ13XEy3PaiemzVftJhavz5bydhOLjvMjQzoHvrCr6bnA/out.mp3\n"
     ]
    }
   ],
   "source": [
    "input = {\n",
    "    \"prompt\": message,\n",
    "    \"model_version\": \"stereo-large\",\n",
    "    \"output_format\": \"mp3\",\n",
    "    \"normalization_strategy\": \"peak\",\n",
    "    \"duration\": 30\n",
    "}\n",
    "\n",
    "output = client.run(\n",
    "    \"meta/musicgen:671ac645ce5e552cc63a54a2bbff63fcf798043055d2dac5fc9e36a837eedcfb\",\n",
    "    input=input\n",
    ")\n",
    "print(output)"
   ]
  },
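  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`client.run` blocks until the model finishes, which adds up when one prompt fans out to several models. The Replicate client can also start a prediction and let you poll it. A sketch using the same `client`, `message`, and MusicGen version as above (the exact API surface can vary between `replicate` releases):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "# Start a prediction without blocking, then poll until it settles (sketch;\n",
    "# assumes `client` and `message` from the cells above).\n",
    "prediction = client.predictions.create(\n",
    "    version=\"671ac645ce5e552cc63a54a2bbff63fcf798043055d2dac5fc9e36a837eedcfb\",\n",
    "    input={\"prompt\": message, \"model_version\": \"stereo-large\"},\n",
    ")\n",
    "\n",
    "while prediction.status not in {\"succeeded\", \"failed\", \"canceled\"}:\n",
    "    time.sleep(2)\n",
    "    prediction.reload()\n",
    "\n",
    "print(prediction.status, prediction.output)"
   ]
  },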
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Many models at once"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's write some utility functions that wrap these models:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_epic_realism(prompt, api_token):\n",
    "    # Create a Replicate client instance with the API token\n",
    "    client = replicate.Client(api_token=api_token)\n",
    "\n",
    "    # Define the input parameters for the model\n",
    "    input_data = {\n",
    "        \"prompt\": prompt\n",
    "    }\n",
    "\n",
    "    # Run the model using the Replicate API\n",
    "    output = client.run(\n",
    "        \"fofr/epicrealismxl-lightning-hades:0ca10b1fd361c1c5568720736411eaa89d9684415eb61fd36875b4d3c20f605a\",\n",
    "        input=input_data\n",
    "    )\n",
    "\n",
    "    return output\n",
    "\n",
    "\n",
    "def generate_music_gen(prompt, api_token, duration=30, model_version=\"stereo-large\", output_format=\"mp3\", normalization_strategy=\"peak\"):\n",
    "    # Create a Replicate client instance with the API token\n",
    "    client = replicate.Client(api_token=api_token)\n",
    "\n",
    "    # Define the input parameters for the model\n",
    "    input_data = {\n",
    "        \"prompt\": prompt,\n",
    "        \"model_version\": model_version,\n",
    "        \"output_format\": output_format,\n",
    "        \"normalization_strategy\": normalization_strategy,\n",
    "        \"duration\": duration\n",
    "    }\n",
    "\n",
    "    # Run the model using the Replicate API\n",
    "    output = client.run(\n",
    "        \"meta/musicgen:671ac645ce5e552cc63a54a2bbff63fcf798043055d2dac5fc9e36a837eedcfb\",\n",
    "        input=input_data\n",
    "    )\n",
    "\n",
    "    return output\n",
    "\n",
    "\n",
    "def generate_suno_bark(prompt, api_token, text_temp=0.7, output_full=False, waveform_temp=0.7, history_prompt=\"announcer\"):\n",
    "    # Create a Replicate client instance with the API token\n",
    "    client = replicate.Client(api_token=api_token)\n",
    "\n",
    "    # Define the input parameters for the model\n",
    "    input_params = {\n",
    "        \"prompt\": prompt,\n",
    "        \"text_temp\": text_temp,\n",
    "        \"output_full\": output_full,\n",
    "        \"waveform_temp\": waveform_temp,\n",
    "        \"history_prompt\": history_prompt,\n",
    "    }\n",
    "\n",
    "    # Run the model using the Replicate API\n",
    "    try:\n",
    "        output = client.run(\n",
    "            \"suno-ai/bark:b76242b40d67c76ab6742e987628a2a9ac019e11d56ab96c4e91ce03b79b2787\",\n",
    "            input=input_params\n",
    "        )\n",
    "        return output\n",
    "    except Exception as e:\n",
    "        print(f\"Error: {e}\")\n",
    "        return None\n",
    "\n",
    "\n",
    "def generate_f5(prompt, api_token):\n",
    "    # Create a Replicate client instance with the API token\n",
    "    client = replicate.Client(api_token=api_token)\n",
    "    output = client.run(\n",
    "        \"x-lance/f5-tts:87faf6dd7a692dd82043f662e76369cab126a2cf1937e25a9d41e0b834fd230e\",\n",
    "        input={\n",
    "            \"gen_text\": prompt,\n",
    "            \"ref_text\": \"never underestimate the power of the scout's code\",\n",
    "            \"ref_audio\": \"https://replicate.delivery/pbxt/LnHEJTVWhjLcpGQJTBralyztLwl8diaLyHjP2a1KXJ8dxVWv/Teemo_Original_Taunt.ogg\",\n",
    "            \"remove_silence\": True,\n",
    "            \"custom_split_words\": \"\"\n",
    "        }\n",
    "    )\n",
    "    return output"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's test them out:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['https://replicate.delivery/pbxt/iPF40CjW9CIvNBhKCzO6uyfcU3HjjyDenf6d2iTWen4Jb13OB/R8__00001_.webp']\n",
      "https://replicate.delivery/yhqm/DeYnpvdxtkWpZ6xNIfXzfesg5WWJbiqviFS3HgOomKuzi13OB/output.wav\n",
      "https://replicate.delivery/yhqm/2QbO5lBMp3LlE1d3hBuF9EIYf4ETGW43EDI3i2wLg955setTA/out.mp3\n"
     ]
    }
   ],
   "source": [
    "message = \"crazy wild zombie party at the blaring symphony orchestra\"\n",
    "output = generate_epic_realism(message, replicate_api_key)\n",
    "print(output)\n",
    "\n",
    "output = generate_f5(message, replicate_api_key)\n",
    "print(output)\n",
    "\n",
    "output = generate_music_gen(message, replicate_api_key)\n",
    "print(output)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epic Realism Output:\n",
      "['https://replicate.delivery/pbxt/i04ppXTefyl3m05U5nizQkY8L1qE0wc3JIe5OyOTDztu26bnA/R8__00001_.webp']\n",
      "Meta MusicGen Output:\n",
      "https://replicate.delivery/yhqm/jOIw2FwqmDIROFcBsPgjKTiovMCeWXWUNCS975lG6MDUuetTA/out.mp3\n",
      "F5-TTS Output:\n",
      "https://replicate.delivery/yhqm/0s8HPb8LgmImGtIubO1qtlG5UVchfi0SHa7lAeiweE4X76bnA/output.wav\n"
     ]
    }
   ],
   "source": [
    "# One prompt, three models (reusing `replicate_api_key` from above)\n",
    "message = \"The Waffle House messing it up for real with the pancakes and bacon and punk abstract jazz, yo!\"\n",
    "\n",
    "# Run the Epic Realism model\n",
    "epicrealism_output = generate_epic_realism(message, replicate_api_key)\n",
    "print(\"Epic Realism Output:\")\n",
    "print(epicrealism_output)\n",
    "\n",
    "# Run the Meta MusicGen model\n",
    "musicgen_output = generate_music_gen(message, replicate_api_key)\n",
    "print(\"Meta MusicGen Output:\")\n",
    "print(musicgen_output)\n",
    "\n",
    "# Run the F5-TTS model\n",
    "f5_output = generate_f5(message, replicate_api_key)\n",
    "print(\"F5-TTS Output:\")\n",
    "print(f5_output)\n"
   ]
  },
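  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So far the calls still run one after another. Since they are network-bound, a thread pool is enough to launch them genuinely at once. A sketch using the utility functions, `message`, and `replicate_api_key` from above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from concurrent.futures import ThreadPoolExecutor\n",
    "\n",
    "# Fan one prompt out to all three models concurrently (sketch; assumes the\n",
    "# utility functions and `replicate_api_key` defined above).\n",
    "with ThreadPoolExecutor() as pool:\n",
    "    futures = {\n",
    "        \"image\": pool.submit(generate_epic_realism, message, replicate_api_key),\n",
    "        \"speech\": pool.submit(generate_f5, message, replicate_api_key),\n",
    "        \"music\": pool.submit(generate_music_gen, message, replicate_api_key),\n",
    "    }\n",
    "    for name, future in futures.items():\n",
    "        print(name, future.result())"
   ]
  },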
"print(musicgen_output)\n", 894 | "\n", 895 | "# Run the Suno Bark model\n", 896 | "bark_output = generate_f5(message, replicate_api_key)\n", 897 | "print(\"Suno Bark Output:\")\n", 898 | "print(bark_output)\n", 899 | "\n" 900 | ] 901 | }, 902 | { 903 | "cell_type": "markdown", 904 | "metadata": {}, 905 | "source": [ 906 | "### Experiment: text to video" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": 69, 912 | "metadata": {}, 913 | "outputs": [ 914 | { 915 | "name": "stdout", 916 | "output_type": "stream", 917 | "text": [ 918 | "https://replicate.delivery/yhqm/hOb98yie1egiX0nrtfm7pRIxjHGOKFDAtNykfdExfse6HYf2JA/out.mp4\n" 919 | ] 920 | } 921 | ], 922 | "source": [ 923 | "message = \"The Waffle House messing it up for real with the pancakes and bacon and punk abstract jazz, yo!\"\n", 924 | "\n", 925 | "input = {\n", 926 | " \"sampler\": \"klms\",\n", 927 | " \"max_frames\": 100,\n", 928 | " \"animation_prompts\": message\n", 929 | "}\n", 930 | "\n", 931 | "output = client.run(\n", 932 | " \"deforum/deforum_stable_diffusion:e22e77495f2fb83c34d5fae2ad8ab63c0a87b6b573b6208e1535b23b89ea66d6\",\n", 933 | " input=input\n", 934 | ")\n", 935 | "print(output)\n", 936 | "#=> \"https://replicate.delivery/mgxm/873a1cc7-0427-4e8d-ab3c-..." 937 | ] 938 | }, 939 | { 940 | "cell_type": "markdown", 941 | "metadata": {}, 942 | "source": [ 943 | "![Alt text](img/today.png)" 944 | ] 945 | }, 946 | { 947 | "cell_type": "markdown", 948 | "metadata": {}, 949 | "source": [ 950 | "![Alt text](img/explore.png)" 951 | ] 952 | }, 953 | { 954 | "cell_type": "markdown", 955 | "metadata": {}, 956 | "source": [] 957 | }, 958 | { 959 | "cell_type": "markdown", 960 | "metadata": {}, 961 | "source": [ 962 | "![Alt text](img/models.png)" 963 | ] 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": 70, 968 | "metadata": {}, 969 | "outputs": [ 970 | { 971 | "name": "stdout", 972 | "output_type": "stream", 973 | "text": [ 974 | "https://replicate.delivery/yhqm/l21N0GeER0z3KqcMwfxznap2QKF0oBD3mjVzLxIgDUUjh9tTA/output.wav\n" 975 | ] 976 | } 977 | ], 978 | "source": [ 979 | "# Create a Replicate client instance with the API token\n", 980 | "client = replicate.Client(api_token=replicate_api_key)\n", 981 | "\n", 982 | "output = client.run(\n", 983 | " \"x-lance/f5-tts:87faf6dd7a692dd82043f662e76369cab126a2cf1937e25a9d41e0b834fd230e\",\n", 984 | " input={\n", 985 | " \"gen_text\": \"captain hugo, on duty!\",\n", 986 | " \"ref_text\": \"never underestimate the power of the scout's code\",\n", 987 | " \"ref_audio\": \"https://replicate.delivery/pbxt/LnHEJTVWhjLcpGQJTBralyztLwl8diaLyHjP2a1KXJ8dxVWv/Teemo_Original_Taunt.ogg\",\n", 988 | " \"remove_silence\": True,\n", 989 | " \"custom_split_words\": \"\"\n", 990 | " }\n", 991 | ")\n", 992 | "print(output)" 993 | ] 994 | }, 995 | { 996 | "cell_type": "code", 997 | "execution_count": null, 998 | "metadata": {}, 999 | "outputs": [], 1000 | "source": [] 1001 | } 1002 | ], 1003 | "metadata": { 1004 | "kernelspec": { 1005 | "display_name": "experiments-in-ai-UifnU4Ym-py3.11", 1006 | "language": "python", 1007 | "name": "python3" 1008 | }, 1009 | "language_info": { 1010 | "codemirror_mode": { 1011 | "name": "ipython", 1012 | "version": 3 1013 | }, 1014 | "file_extension": ".py", 1015 | "mimetype": "text/x-python", 1016 | "name": "python", 1017 | "nbconvert_exporter": "python", 1018 | "pygments_lexer": "ipython3", 1019 | "version": "3.11.9" 1020 | } 1021 | }, 1022 | "nbformat": 4, 1023 | "nbformat_minor": 2 1024 | } 1025 | 
--------------------------------------------------------------------------------