├── README.md ├── LICENSE ├── .gitignore ├── app.py └── ai_voiceover_app.py /README.md: -------------------------------------------------------------------------------- 1 | # GPT4-turbo-with-vision-demo 2 | Building Apps with [OpenAI](https://platform.openai.com/docs/overview) GPT-4-turbo with vision API and [Databutton](https://databutton.com/login?utm_source=github&utm_medium=avra&utm_article=gptvision) 3 | 4 | - Live app - [here](https://databutton.com/v/now2nem0) 5 | - Video tutorial and discussion - [here](https://youtu.be/rnXK2rMlqGo) 6 | - *Blog post soon ...* 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Avra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | # Author -> Avratanu Biswas 2 | # Youtube video -> 3 | # Blog -> 4 | 5 | import streamlit as st 6 | import base64 7 | import databutton as db 8 | 9 | from openai import OpenAI 10 | 11 | # Function to encode the image to base64 12 | def encode_image(image_file): 13 | return base64.b64encode(image_file.getvalue()).decode("utf-8") 14 | 15 | 16 | st.set_page_config(page_title="Scientific Image Analyst", layout="centered", initial_sidebar_state="collapsed") 17 | # Streamlit page setup 18 | st.title("🧪 Scientific Image Analyst: `GPT-4 Turbo with Vision` 👀") 19 | 20 | 21 | # Retrieve the OpenAI API Key from secrets 22 | api_key = db.secrets.get(name="OPENAI_API_KEY") 23 | 24 | # Initialize the OpenAI client with the API key 25 | client = OpenAI(api_key=api_key) 26 | 27 | # File uploader allows user to add their own image 28 | uploaded_file = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"]) 29 | 30 | if uploaded_file: 31 | # Display the uploaded image 32 | with st.expander("Image", expanded = True): 33 | st.image(uploaded_file, caption=uploaded_file.name, use_column_width=True) 34 | 35 | # Toggle for showing additional details input 36 | show_details = st.toggle("Add details about the image", value=False) 37 | 38 | if show_details: 39 | # Text input for additional details about the image, shown only if toggle is True 40 | additional_details = st.text_area( 41 | "Add any additional details or context about the image here:", 42 | disabled=not show_details 43 | ) 44 | 45 | # Button to trigger the analysis 46 | analyze_button = st.button("Analyse the Scientific Image", type="secondary") 47 | 48 | # Check if an image has been uploaded, if the API key is available, and if the button has been pressed 49 | if uploaded_file is not None and api_key and analyze_button: 50 | 51 | with st.spinner("Analysing the image ..."): 52 | # Encode the image 53 | base64_image = encode_image(uploaded_file) 54 | 55 | # Optimized prompt for additional clarity and detail 56 | prompt_text = ( 57 | "You are a highly knowledgeable scientific image analysis expert. " 58 | "Your task is to examine the following image in detail. " 59 | "Provide a comprehensive, factual, and scientifically accurate explanation of what the image depicts. " 60 | "Highlight key elements and their significance, and present your analysis in clear, well-structured markdown format. " 61 | "If applicable, include any relevant scientific terminology to enhance the explanation. " 62 | "Assume the reader has a basic understanding of scientific concepts." 63 | "Create a detailed image caption in bold explaining in short." 64 | ) 65 | 66 | if show_details and additional_details: 67 | prompt_text += ( 68 | f"\n\nAdditional Context Provided by the User:\n{additional_details}" 69 | ) 70 | 71 | # Create the payload for the completion request 72 | messages = [ 73 | { 74 | "role": "user", 75 | "content": [ 76 | {"type": "text", "text": prompt_text}, 77 | { 78 | "type": "image_url", 79 | "image_url": f"data:image/jpeg;base64,{base64_image}", 80 | }, 81 | ], 82 | } 83 | ] 84 | 85 | # Make the request to the OpenAI API 86 | try: 87 | # Without Stream 88 | 89 | # response = client.chat.completions.create( 90 | # model="gpt-4-vision-preview", messages=messages, max_tokens=500, stream=False 91 | # ) 92 | 93 | # Stream the response 94 | full_response = "" 95 | message_placeholder = st.empty() 96 | for completion in client.chat.completions.create( 97 | model="gpt-4-vision-preview", messages=messages, 98 | max_tokens=1200, stream=True 99 | ): 100 | # Check if there is content to display 101 | if completion.choices[0].delta.content is not None: 102 | full_response += completion.choices[0].delta.content 103 | message_placeholder.markdown(full_response + "▌") 104 | # Final update to placeholder after the stream ends 105 | message_placeholder.markdown(full_response) 106 | 107 | # Display the response in the app 108 | # st.write(response.choices[0].message.content) 109 | except Exception as e: 110 | st.error(f"An error occurred: {e}") 111 | else: 112 | # Warnings for user action required 113 | if not uploaded_file and analyze_button: 114 | st.warning("Please upload an image.") 115 | if not api_key: 116 | st.warning("Please enter your OpenAI API key.") 117 | -------------------------------------------------------------------------------- /ai_voiceover_app.py: -------------------------------------------------------------------------------- 1 | import databutton as db 2 | import streamlit as st 3 | import cv2 # pip install opencv-python 4 | import base64 5 | import tempfile 6 | from openai import OpenAI 7 | import os 8 | import requests 9 | 10 | # Retrieve the OpenAI API Key from secrets 11 | api_key = db.secrets.get(name="OPENAI_API_KEY") 12 | 13 | # Initialize the OpenAI client with the API key 14 | client = OpenAI(api_key=api_key) 15 | 16 | 17 | @st.cache_data 18 | def video_to_base64_frames(video_buffer): 19 | """Convert video to a series of base64 encoded frames""" 20 | base64_frames = [] 21 | # Read the file's bytes 22 | video_bytes = video_buffer.read() 23 | # Create a temporary file for the video 24 | with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video: 25 | temp_video.write(video_bytes) 26 | temp_video_name = temp_video.name 27 | # Load the video from the temporary file 28 | video = cv2.VideoCapture(temp_video_name) 29 | # Read each frame from the video and encode it as base64 30 | while video.isOpened(): 31 | success, frame = video.read() 32 | if not success: 33 | break 34 | _, buffer = cv2.imencode(".jpg", frame) 35 | base64_frames.append(base64.b64encode(buffer).decode("utf-8")) 36 | video.release() 37 | # Clean up the temporary file 38 | try: 39 | os.remove(temp_video_name) 40 | except Exception as e: 41 | st.error(f"Error removing temporary file: {e}") 42 | return base64_frames 43 | 44 | 45 | # Initialize Streamlit app 46 | st.title("Turning Videos into Voiceovers using OpenAI models") 47 | st.markdown( 48 | "#### [GPT-4 Vision](https://platform.openai.com/docs/guides/vision) and [TTS](https://platform.openai.com/docs/models/tts) APIs" 49 | ) 50 | 51 | 52 | # Initialize session state variables 53 | if "base64_frames" not in st.session_state: 54 | st.session_state.base64_frames = None 55 | if "script" not in st.session_state: 56 | st.session_state.script = "" 57 | 58 | # File uploader for video files 59 | uploaded_video = st.file_uploader("Upload a video file", type=["mp4"]) 60 | if uploaded_video: 61 | with st.expander("Watch video", expanded=False): 62 | st.video(uploaded_video) 63 | # Process video and generate script 64 | if uploaded_video is not None and api_key: 65 | if st.button("Convert Video to Frames"): 66 | with st.spinner("Converting Video to Frames..."): 67 | # Convert video to base64 frames and store in session state 68 | st.session_state.base64_frames = video_to_base64_frames(uploaded_video) 69 | st.success(f"{len(st.session_state.base64_frames)} frames read.") 70 | # Display a sample frame from the video 71 | with st.expander("A Sample Frame", expanded=False): 72 | st.image( 73 | base64.b64decode(st.session_state.base64_frames[0].encode("utf-8")), 74 | caption="Sample Frame", 75 | ) 76 | 77 | # Button to generate script 78 | if st.session_state.base64_frames and st.button("Generate Script"): 79 | PROMPT_MESSAGES = [ 80 | { 81 | "role": "user", 82 | "content": [ 83 | "These are frames from a cooking show video. Generate a brief voiceover script in the style of a famous narrator, capturing the excitement and passion of holiday cooking. Only include the narration.", 84 | *map( 85 | lambda x: {"image": x, "resize": 768}, 86 | st.session_state.base64_frames[0::50], 87 | ), 88 | ], 89 | }, 90 | ] 91 | with st.spinner("Generating script..."): 92 | full_response = "" 93 | message_placeholder = st.empty() 94 | # Call OpenAI API to generate script based on the video frames 95 | for completion in client.chat.completions.create( 96 | model="gpt-4-vision-preview", 97 | messages=PROMPT_MESSAGES, 98 | max_tokens=500, 99 | stream=True, 100 | ): 101 | # Check if there is content to display 102 | if completion.choices[0].delta.content is not None: 103 | full_response += completion.choices[0].delta.content 104 | message_placeholder.markdown(full_response + "▌") 105 | st.session_state.script = full_response 106 | with st.expander("Edit Generated Script:", expanded=False): 107 | st.text_area("Generated Script", st.session_state.script, height=250) 108 | 109 | # Button to generate audio 110 | if st.session_state.script and st.toggle("Generate Audio"): 111 | with st.spinner("Generating audio..."): 112 | response = requests.post( 113 | "https://api.openai.com/v1/audio/speech", 114 | headers={"Authorization": f"Bearer {api_key}"}, 115 | json={"model": "tts-1", "input": st.session_state.script, "voice": "fable"}, 116 | ) 117 | # Check the response status and handle audio generation 118 | if response.status_code == 200: 119 | audio_bytes = response.content 120 | if len(audio_bytes) > 0: 121 | # Temporary file creation for the audio 122 | with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp: 123 | fp.write(audio_bytes) 124 | fp.seek(0) 125 | st.audio(fp.name, format="audio/mp3") 126 | with st.expander("Script", expanded=True): 127 | st.write(st.session_state.script) 128 | # Reset file pointer for download 129 | fp.seek(0) 130 | # Create a download button for the audio file 131 | st.download_button( 132 | label="Download audio", 133 | data=fp.read(), 134 | file_name="narration.mp3", 135 | mime="audio/mp3", 136 | ) 137 | os.unlink(fp.name) # Clean up the temporary file 138 | --------------------------------------------------------------------------------