├── README.md
├── LICENSE
├── .gitignore
├── app.py
└── ai_voiceover_app.py


/README.md:
--------------------------------------------------------------------------------
 1 | # GPT4-turbo-with-vision-demo
 2 | Building Apps with [OpenAI](https://platform.openai.com/docs/overview) GPT-4-turbo with vision API and [Databutton](https://databutton.com/login?utm_source=github&utm_medium=avra&utm_article=gptvision)
 3 | 
 4 | - Live app - [here](https://databutton.com/v/now2nem0)
 5 | - Video tutorial and discussion - [here](https://youtu.be/rnXK2rMlqGo)
 6 | - *Blog post coming soon ...*
 7 | 
 8 |    
 9 | 
10 | 
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Avra
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
  1 | # Author -> Avratanu Biswas 
  2 | # Youtube video ->
  3 | # Blog -> 
  4 | 
  5 | import streamlit as st
  6 | import base64
  7 | import databutton as db
  8 | 
  9 | from openai import OpenAI
 10 | 
 11 | # Function to encode the image to base64
 12 | def encode_image(image_file):
 13 |     return base64.b64encode(image_file.getvalue()).decode("utf-8")
 14 | 
 15 | 
 16 | st.set_page_config(page_title="Scientific Image Analyst", layout="centered", initial_sidebar_state="collapsed")
 17 | # Streamlit page setup
 18 | st.title("🧪 Scientific Image Analyst: `GPT-4 Turbo with Vision` 👀")
 19 | 
 20 | 
 21 | # Retrieve the OpenAI API Key from secrets
 22 | api_key = db.secrets.get(name="OPENAI_API_KEY")
 23 | 
 24 | # Initialize the OpenAI client with the API key
 25 | client = OpenAI(api_key=api_key)
 26 | 
 27 | # File uploader allows user to add their own image
 28 | uploaded_file = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])
 29 | 
 30 | if uploaded_file:
 31 |     # Display the uploaded image
 32 |     with st.expander("Image", expanded = True):
 33 |         st.image(uploaded_file, caption=uploaded_file.name, use_column_width=True)
 34 | 
 35 | # Toggle for showing additional details input
 36 | show_details = st.toggle("Add details about the image", value=False)
 37 | 
 38 | if show_details:
 39 |     # Text input for additional details about the image, shown only if toggle is True
 40 |     additional_details = st.text_area(
 41 |         "Add any additional details or context about the image here:",
 42 |         disabled=not show_details
 43 |     )
 44 | 
 45 | # Button to trigger the analysis
 46 | analyze_button = st.button("Analyse the Scientific Image", type="secondary")
 47 | 
 48 | # Check if an image has been uploaded, if the API key is available, and if the button has been pressed
 49 | if uploaded_file is not None and api_key and analyze_button:
 50 | 
 51 |     with st.spinner("Analysing the image ..."):
 52 |         # Encode the image
 53 |         base64_image = encode_image(uploaded_file)
 54 |     
 55 |         # Optimized prompt for additional clarity and detail
 56 |         prompt_text = (
 57 |             "You are a highly knowledgeable scientific image analysis expert. "
 58 |             "Your task is to examine the following image in detail. "
 59 |             "Provide a comprehensive, factual, and scientifically accurate explanation of what the image depicts. "
 60 |             "Highlight key elements and their significance, and present your analysis in clear, well-structured markdown format. "
 61 |             "If applicable, include any relevant scientific terminology to enhance the explanation. "
 62 |             "Assume the reader has a basic understanding of scientific concepts."
 63 |             "Create a detailed image caption in bold explaining in short."
 64 |         )
 65 |     
 66 |         if show_details and additional_details:
 67 |             prompt_text += (
 68 |                 f"\n\nAdditional Context Provided by the User:\n{additional_details}"
 69 |             )
 70 |     
 71 |         # Create the payload for the completion request
 72 |         messages = [
 73 |             {
 74 |                 "role": "user",
 75 |                 "content": [
 76 |                     {"type": "text", "text": prompt_text},
 77 |                     {
 78 |                         "type": "image_url",
 79 |                         "image_url": f"data:image/jpeg;base64,{base64_image}",
 80 |                     },
 81 |                 ],
 82 |             }
 83 |         ]
 84 |     
 85 |         # Make the request to the OpenAI API
 86 |         try:
 87 |             # Without Stream
 88 |             
 89 |             # response = client.chat.completions.create(
 90 |             #     model="gpt-4-vision-preview", messages=messages, max_tokens=500, stream=False
 91 |             # )
 92 |     
 93 |             # Stream the response
 94 |             full_response = ""
 95 |             message_placeholder = st.empty()
 96 |             for completion in client.chat.completions.create(
 97 |                 model="gpt-4-vision-preview", messages=messages, 
 98 |                 max_tokens=1200, stream=True
 99 |             ):
100 |                 # Check if there is content to display
101 |                 if completion.choices[0].delta.content is not None:
102 |                     full_response += completion.choices[0].delta.content
103 |                     message_placeholder.markdown(full_response + "▌")
104 |             # Final update to placeholder after the stream ends
105 |             message_placeholder.markdown(full_response)
106 |     
107 |             # Display the response in the app
108 |             # st.write(response.choices[0].message.content)
109 |         except Exception as e:
110 |             st.error(f"An error occurred: {e}")
111 | else:
112 |     # Warnings for user action required
113 |     if not uploaded_file and analyze_button:
114 |         st.warning("Please upload an image.")
115 |     if not api_key:
116 |         st.warning("Please enter your OpenAI API key.")
117 | 


--------------------------------------------------------------------------------
/ai_voiceover_app.py:
--------------------------------------------------------------------------------
  1 | import databutton as db
  2 | import streamlit as st
  3 | import cv2  # pip install opencv-python
  4 | import base64
  5 | import tempfile
  6 | from openai import OpenAI
  7 | import os
  8 | import requests
  9 | 
 10 | # Retrieve the OpenAI API Key from secrets
 11 | api_key = db.secrets.get(name="OPENAI_API_KEY")
 12 | 
 13 | # Initialize the OpenAI client with the API key
 14 | client = OpenAI(api_key=api_key)
 15 | 
 16 | 
 17 | @st.cache_data
 18 | def video_to_base64_frames(video_buffer):
 19 |     """Convert video to a series of base64 encoded frames"""
 20 |     base64_frames = []
 21 |     # Read the file's bytes
 22 |     video_bytes = video_buffer.read()
 23 |     # Create a temporary file for the video
 24 |     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
 25 |         temp_video.write(video_bytes)
 26 |         temp_video_name = temp_video.name
 27 |     # Load the video from the temporary file
 28 |     video = cv2.VideoCapture(temp_video_name)
 29 |     # Read each frame from the video and encode it as base64
 30 |     while video.isOpened():
 31 |         success, frame = video.read()
 32 |         if not success:
 33 |             break
 34 |         _, buffer = cv2.imencode(".jpg", frame)
 35 |         base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
 36 |     video.release()
 37 |     # Clean up the temporary file
 38 |     try:
 39 |         os.remove(temp_video_name)
 40 |     except Exception as e:
 41 |         st.error(f"Error removing temporary file: {e}")
 42 |     return base64_frames
 43 | 
 44 | 
 45 | # Initialize Streamlit app
 46 | st.title("Turning Videos into Voiceovers using OpenAI models")
 47 | st.markdown(
 48 |     "#### [GPT-4 Vision](https://platform.openai.com/docs/guides/vision) and [TTS](https://platform.openai.com/docs/models/tts) APIs"
 49 | )
 50 | 
 51 | 
 52 | # Initialize session state variables
 53 | if "base64_frames" not in st.session_state:
 54 |     st.session_state.base64_frames = None
 55 | if "script" not in st.session_state:
 56 |     st.session_state.script = ""
 57 | 
 58 | # File uploader for video files
 59 | uploaded_video = st.file_uploader("Upload a video file", type=["mp4"])
 60 | if uploaded_video:
 61 |     with st.expander("Watch video", expanded=False):
 62 |         st.video(uploaded_video)
 63 | # Process video and generate script
 64 | if uploaded_video is not None and api_key:
 65 |     if st.button("Convert Video to Frames"):
 66 |         with st.spinner("Converting Video to Frames..."):
 67 |             # Convert video to base64 frames and store in session state
 68 |             st.session_state.base64_frames = video_to_base64_frames(uploaded_video)
 69 |             st.success(f"{len(st.session_state.base64_frames)} frames read.")
 70 |         # Display a sample frame from the video
 71 |         with st.expander("A Sample Frame", expanded=False):
 72 |             st.image(
 73 |                 base64.b64decode(st.session_state.base64_frames[0].encode("utf-8")),
 74 |                 caption="Sample Frame",
 75 |             )
 76 | 
 77 | # Button to generate script
 78 | if st.session_state.base64_frames and st.button("Generate Script"):
 79 |     PROMPT_MESSAGES = [
 80 |         {
 81 |             "role": "user",
 82 |             "content": [
 83 |                 "These are frames from a cooking show video. Generate a brief voiceover script in the style of a famous narrator, capturing the excitement and passion of holiday cooking. Only include the narration.",
 84 |                 *map(
 85 |                     lambda x: {"image": x, "resize": 768},
 86 |                     st.session_state.base64_frames[0::50],
 87 |                 ),
 88 |             ],
 89 |         },
 90 |     ]
 91 |     with st.spinner("Generating script..."):
 92 |         full_response = ""
 93 |         message_placeholder = st.empty()
 94 |         # Call OpenAI API to generate script based on the video frames
 95 |         for completion in client.chat.completions.create(
 96 |             model="gpt-4-vision-preview",
 97 |             messages=PROMPT_MESSAGES,
 98 |             max_tokens=500,
 99 |             stream=True,
100 |         ):
101 |             # Check if there is content to display
102 |             if completion.choices[0].delta.content is not None:
103 |                 full_response += completion.choices[0].delta.content
104 |                 message_placeholder.markdown(full_response + "▌")
105 |             st.session_state.script = full_response
106 |         with st.expander("Edit Generated Script:", expanded=False):
107 |             st.text_area("Generated Script", st.session_state.script, height=250)
108 | 
109 | # Button to generate audio
110 | if st.session_state.script and st.toggle("Generate Audio"):
111 |     with st.spinner("Generating audio..."):
112 |         response = requests.post(
113 |             "https://api.openai.com/v1/audio/speech",
114 |             headers={"Authorization": f"Bearer {api_key}"},
115 |             json={"model": "tts-1", "input": st.session_state.script, "voice": "fable"},
116 |         )
117 |         # Check the response status and handle audio generation
118 |         if response.status_code == 200:
119 |             audio_bytes = response.content
120 |             if len(audio_bytes) > 0:
121 |                 # Temporary file creation for the audio
122 |                 with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
123 |                     fp.write(audio_bytes)
124 |                     fp.seek(0)
125 |                     st.audio(fp.name, format="audio/mp3")
126 |                     with st.expander("Script", expanded=True):
127 |                         st.write(st.session_state.script)
128 |                     # Reset file pointer for download
129 |                     fp.seek(0)
130 |                     # Create a download button for the audio file
131 |                     st.download_button(
132 |                         label="Download audio",
133 |                         data=fp.read(),
134 |                         file_name="narration.mp3",
135 |                         mime="audio/mp3",
136 |                     )
137 |                 os.unlink(fp.name)  # Clean up the temporary file
138 | 


--------------------------------------------------------------------------------