├── .env.sample ├── .gitignore ├── README.md ├── examine_image.py ├── extract_newsletter_pdf.py ├── extract_podcast.py ├── extract_podcast_split.py ├── extract_youtube.py ├── files ├── citrini_24_trades.pdf └── ipo_pulse.png ├── requirements.txt ├── split_audio.sh └── youtube_guru.py /.env.sample: -------------------------------------------------------------------------------- 1 | GOOGLE_API_KEY=your_google_api_key -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 
115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | 173 | 174 | *.mp3 175 | *.mkv 176 | .DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multimodal Structured Extraction with Gemini 2.0 Flash and Google GenAI Python SDK 1.0 3 | 4 | In this tutorial we extract structured data from a variety of sources, including an investment newsletter PDF, a podcast, and a YouTube video. 
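Each script in this repo follows the same basic pattern: define a Pydantic model describing the fields you want, then pass it to `generate_content` as a `response_schema` so Gemini returns JSON in that shape. A minimal sketch of the pattern (the prompt text and the `Company` fields here are illustrative only, not taken from any specific script below):

```python
# minimal structured-extraction sketch; prompt and Company fields are illustrative
from google import genai
from google.genai import types
from pydantic import BaseModel

from dotenv import load_dotenv
load_dotenv()  # reads GOOGLE_API_KEY from the .env file described below

client = genai.Client()

class Company(BaseModel):
    name: str
    symbol: str

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents="Name two large US technology companies and their ticker symbols.",
    config=types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=list[Company],
    ),
)

print(response.text)  # JSON array matching the Company schema
```

The scripts in this repo apply this same pattern to a PDF, an audio file, and a video.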
5 | 6 | ## Setup 7 | ``` 8 | python3 -m venv venv 9 | source venv/bin/activate 10 | pip install -r requirements.txt 11 | ``` 12 | 13 | ## Create a .env file in the root directory with your Gemini API key 14 | ``` 15 | GOOGLE_API_KEY=your_google_api_key 16 | ``` 17 | 18 | ## Handy Commands 19 | 20 | ### Download YouTube video 21 | ``` 22 | yt-dlp https://www.youtube.com/youryoutubeurl 23 | ``` 24 | 25 | ### Download YouTube audio 26 | ``` 27 | yt-dlp -x --audio-format mp3 https://www.youtube.com/youryoutubeurl 28 | ``` 29 | 30 | ### Split audio 31 | ``` 32 | ffmpeg -i input.mp3 -f segment -segment_time 10 -c copy output_%03d.mp3 33 | ``` 34 | 35 | ### Or use the shell script split_audio.sh 36 | ``` 37 | ./split_audio.sh input.mp3 38 | ``` 39 | 40 | ### Extract audio from video 41 | ``` 42 | ffmpeg -i input.mp4 -q:a 0 -map a output.mp3 43 | ``` 44 | 45 | -------------------------------------------------------------------------------- /examine_image.py: -------------------------------------------------------------------------------- 1 | from google import genai 2 | import PIL.Image 3 | 4 | from dotenv import load_dotenv 5 | load_dotenv() 6 | 7 | # set up gemini 8 | client = genai.Client() 9 | model = "gemini-2.0-flash" 10 | 11 | # first test: text-only prompt 12 | # response = client.models.generate_content(model=model, contents="What is the stock symbol for Apple?") 13 | 14 | # print(response.text) 15 | 16 | # second test: prompt with an image 17 | image = PIL.Image.open('files/ipo_pulse.png') 18 | 19 | response = client.models.generate_content(model=model, contents=["According to this chart, when did the IPO market peak, when did it bottom out, and what does it look like for 2025?", image]) 20 | 21 | print(response.text) -------------------------------------------------------------------------------- /extract_newsletter_pdf.py: -------------------------------------------------------------------------------- 1 | import json 2 | from google import genai 3 | 4 | from google.genai import types 5 | from pydantic import BaseModel 6 | from typing import Optional 7 | 8 | # load environment variables 9 | from dotenv import load_dotenv 10 | load_dotenv() 11 | 12 | # set up gemini 13 | client = genai.Client() 14 | model = "gemini-2.0-flash" 15 | 16 | # set up pydantic models for companies and themes 17 | class Company(BaseModel): 18 | name: str 19 | public: bool 20 | symbol: Optional[str] 21 | long: Optional[bool] 22 | 23 | class Theme(BaseModel): 24 | name: str 25 | 26 | 27 | # upload the pdf file to gemini 28 | file_ref = client.files.upload(file="files/citrini_24_trades.pdf") 29 | 30 | # prepare the prompt 31 | extract_themes_prompt = """ 32 | Attached is a list of thematic trade ideas for 2024. Analyze the attached document and extract all of the theme names discussed. 
33 | """ 34 | 35 | # count the tokens in the prompt and file 36 | print(client.models.count_tokens(model=model, contents=[extract_themes_prompt, file_ref])) 37 | 38 | # send the prompt and file to gemini 39 | result = client.models.generate_content( 40 | model=model, 41 | contents=[file_ref, extract_themes_prompt], 42 | config=types.GenerateContentConfig( 43 | response_mime_type="application/json", 44 | response_schema=list[Theme] 45 | ), 46 | ) 47 | 48 | # # debug print for the raw result object 49 | print(result) 50 | 51 | # # load and show the json structure of the result 52 | themes = json.loads(result.text) 53 | 54 | print(themes) 55 | 56 | # # loop through each theme and build a prompt to extract the companies mentioned in the theme 57 | for theme in themes: 58 | print(theme["name"]) 59 | 60 | extract_companies_prompt = f""" 61 | Attached a a list of thematic trade ideas for 2025. I am only interested in the companies mentioned in the theme: {theme["name"]}. 62 | Extract all of the companies mentioned in the theme, including the company name, whether they are publicly traded or not, the ticker symbol associated with each company (if it is publicly traded), and whether the company was recommended to go long or short, True if long, False if short. 63 | """ 64 | 65 | result = client.models.generate_content( 66 | model=model, 67 | contents=[file_ref, extract_companies_prompt], 68 | config=types.GenerateContentConfig( 69 | response_mime_type="application/json", 70 | response_schema=list[Company] 71 | ), 72 | ) 73 | 74 | companies= json.loads(result.text) 75 | for company in companies: 76 | print(company) 77 | -------------------------------------------------------------------------------- /extract_podcast.py: -------------------------------------------------------------------------------- 1 | import json, time 2 | from google import genai 3 | 4 | from google.genai import types 5 | from pydantic import BaseModel 6 | from typing import Optional 7 | 8 | # load environment variables 9 | from dotenv import load_dotenv 10 | load_dotenv() 11 | 12 | # set up gemini 13 | client = genai.Client() 14 | model = "gemini-2.0-flash" 15 | 16 | podcast_file = client.files.upload(file='files/lex_fridman_podcast_dylan_patel.mp3') 17 | 18 | print(podcast_file.name) 19 | 20 | while podcast_file.state.name == "PROCESSING": 21 | print("processing video...") 22 | time.sleep(5) 23 | print("podcast file name:") 24 | print(podcast_file.name) 25 | podcast_file = client.files.get(name=podcast_file.name) 26 | 27 | # podcast_file = genai.get_file("files/m1bayt1bic9m") 28 | 29 | class Prediction(BaseModel): 30 | prediction: str 31 | timeframe: str 32 | 33 | 34 | prompt = """ 35 | I have attached the audio of a podcast. Give me a list of predictions made by the interviewee and the timeframe of the prediction. 
36 | """ 37 | 38 | # count the tokens in the prompt and file 39 | print(client.models.count_tokens(model=model, contents=[podcast_file, prompt])) 40 | 41 | # send the prompt and file to gemini 42 | result = client.models.generate_content( 43 | model=model, 44 | contents=[podcast_file, prompt], 45 | config=types.GenerateContentConfig( 46 | response_mime_type="application/json", response_schema=list[Prediction] 47 | ) 48 | ) 49 | 50 | #print(result) 51 | response = json.loads(result.text) 52 | 53 | print(json.dumps(response, indent=4)) -------------------------------------------------------------------------------- /extract_podcast_split.py: -------------------------------------------------------------------------------- 1 | import json, time, os 2 | from google import genai 3 | 4 | from google.genai import types 5 | from pydantic import BaseModel 6 | from typing import Optional 7 | 8 | # load environment variables 9 | from dotenv import load_dotenv 10 | load_dotenv() 11 | 12 | # set up gemini 13 | client = genai.Client() 14 | model = "gemini-2.0-flash" 15 | 16 | # get the list of files in the split directory 17 | files = os.listdir('lex_fridman_podcast_dylan_patel_split') 18 | 19 | # sort the files by name 20 | files.sort() 21 | 22 | for file in files: 23 | podcast_file = client.files.upload(file=f'lex_fridman_podcast_dylan_patel_split/{file}') 24 | 25 | print(podcast_file.name) 26 | 27 | while podcast_file.state.name == "PROCESSING": 28 | print("processing video...") 29 | time.sleep(5) 30 | print("podcast file name:") 31 | print(podcast_file.name) 32 | podcast_file = client.files.get(name=podcast_file.name) 33 | 34 | # podcast_file = genai.get_file("files/m1bayt1bic9m") 35 | 36 | class Prediction(BaseModel): 37 | prediction: str 38 | timeframe: str 39 | 40 | 41 | prompt = """ 42 | I have attached the audio of a podcast. Give me a list of predictions made by the interviewee and the timeframe of the prediction. 
43 | """ 44 | 45 | # count the tokens in the prompt and file 46 | print(client.models.count_tokens(model=model, contents=[podcast_file, prompt])) 47 | 48 | # send the prompt and file to gemini 49 | result = client.models.generate_content( 50 | model=model, 51 | contents=[podcast_file, prompt], 52 | config=types.GenerateContentConfig( 53 | response_mime_type="application/json", response_schema=list[Prediction] 54 | ) 55 | ) 56 | 57 | #print(result) 58 | response = json.loads(result.text) 59 | 60 | print(json.dumps(response, indent=4)) -------------------------------------------------------------------------------- /extract_youtube.py: -------------------------------------------------------------------------------- 1 | import json, time, os 2 | from google import genai 3 | 4 | from google.genai import types 5 | from pydantic import BaseModel 6 | from typing import Optional 7 | 8 | # load environment variables 9 | from dotenv import load_dotenv 10 | load_dotenv() 11 | 12 | # set up gemini 13 | client = genai.Client() 14 | model = "gemini-2.0-flash" 15 | 16 | video_file = client.files.upload(file='files/parttimelarry_youtube.mkv') 17 | 18 | while video_file.state.name == "PROCESSING": 19 | print("processing video...") 20 | time.sleep(5) 21 | print("video file name:") 22 | print(video_file.name) 23 | video_file = client.files.get(name=video_file.name) 24 | 25 | #video_file = client.files.get(name="files/file123") 26 | 27 | # set up pydantic models for companies and themes 28 | class Company(BaseModel): 29 | name: str 30 | bullish_or_bearish: str 31 | why: str 32 | 33 | 34 | extract_ideas_from_video_prompt = """ 35 | I have attached a YouTube video. Listen to the video and give me a list of stocks the YouTuber mentioned in the video, whether they were bullish or bearish on the stock, and why. 
36 | """ 37 | 38 | # count the tokens in the prompt and file 39 | print(client.models.count_tokens(model=model, contents=[video_file, extract_ideas_from_video_prompt])) 40 | 41 | # send the prompt and file to gemini 42 | result = client.models.generate_content( 43 | model=model, 44 | contents=[video_file, extract_ideas_from_video_prompt], 45 | config=types.GenerateContentConfig( 46 | response_mime_type="application/json", response_schema=list[Company] 47 | ) 48 | ) 49 | 50 | print(json.loads(result.text)) 51 | -------------------------------------------------------------------------------- /files/citrini_24_trades.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackingthemarkets/gemini-multimodal-structured-extraction/d870ce8a2ccd0d1cdaa9e6437fa5076fd625eaff/files/citrini_24_trades.pdf -------------------------------------------------------------------------------- /files/ipo_pulse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackingthemarkets/gemini-multimodal-structured-extraction/d870ce8a2ccd0d1cdaa9e6437fa5076fd625eaff/files/ipo_pulse.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | google-genai 2 | python-dotenv 3 | yt-dlp 4 | pillow 5 | streamlit 6 | yfinance 7 | pandas 8 | altair -------------------------------------------------------------------------------- /split_audio.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if a filename is provided 4 | if [ -z "$1" ]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Get input file name 10 | INPUT_FILE="$1" 11 | 12 | # Extract the file name without the extension 13 | BASENAME=$(basename "$INPUT_FILE" .mp3) 14 | 15 | # Create an output directory to store the split files 16 | OUTPUT_DIR="${BASENAME}_split" 17 | mkdir -p "$OUTPUT_DIR" 18 | 19 | # Run ffmpeg to split the file 20 | ffmpeg -i "$INPUT_FILE" -f segment -segment_time 900 -c copy "$OUTPUT_DIR/out%03d.mp3" 21 | 22 | echo "Splitting complete. Files saved in $OUTPUT_DIR/" 23 | -------------------------------------------------------------------------------- /youtube_guru.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime, timedelta 3 | import yt_dlp 4 | import pandas as pd 5 | import yfinance as yf 6 | import streamlit as st 7 | import altair as alt 8 | 9 | # import gemini libraries and tools for extracting structured data 10 | from google import genai 11 | from google.genai import types 12 | from pydantic import BaseModel 13 | 14 | # load environment variables 15 | from dotenv import load_dotenv 16 | load_dotenv() 17 | 18 | # set up gemini 19 | client = genai.Client() 20 | model = "gemini-1.5-pro" 21 | 22 | 23 | class Prediction(BaseModel): 24 | who: str 25 | company_or_asset_class: str 26 | symbol: str 27 | timestamp: str 28 | prediction: str 29 | 30 | 31 | class VideoData(BaseModel): 32 | who: str 33 | background: str 34 | predictions: list[Prediction] 35 | 36 | prompt = """ 37 | Analyze this video. 38 | 39 | Extract who is making predictions in the video. Summarize their background. 40 | 41 | Extract stock picks and predictions. 
Focus on: 42 | 43 | Price targets for specific assets (e.g., company stock, indexes, crypto) with predicted value and timeframe. 44 | Macro predictions (e.g., interest rates, recessions) with event and timeline. 45 | Bullish/bearish sentiment on companies, sectors, or asset classes. 46 | General market calls (e.g., index movements, bull/bear runs). 47 | Event-driven forecasts (e.g., earnings, policy changes). 48 | Risky or contrarian bets (e.g., high-volatility assets, against-consensus calls). 49 | 50 | For each prediction: 51 | 52 | Quote/summarize the prediction and the reason for the prediction, the associated company, stock or asset symbol if possible, and the timestamp of the prediction. 53 | """ 54 | 55 | st.title("YouTube Guru Analyzer") 56 | 57 | youtube_url = st.text_input("Enter the YouTube URL") 58 | 59 | if youtube_url: 60 | ydl_opts = {} 61 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 62 | info = ydl.extract_info(youtube_url, download=False) 63 | # makes the info json-serializable 64 | video_details = ydl.sanitize_info(info) 65 | 66 | video_date = datetime.strptime(video_details['upload_date'], "%Y%m%d") 67 | end_date = video_date + timedelta(days=365) 68 | start_date_str = video_date.strftime("%Y-%m-%d") 69 | end_date_str = end_date.strftime("%Y-%m-%d") 70 | 71 | st.subheader(video_details['title']) 72 | st.subheader(start_date_str) 73 | st.image(video_details['thumbnail']) 74 | 75 | 76 | with st.spinner("Analyzing predictions...", show_time=True): 77 | 78 | response = client.models.generate_content( 79 | model=model, 80 | contents=types.Content( 81 | parts=[ 82 | types.Part(text=prompt), 83 | types.Part( 84 | file_data=types.FileData(file_uri=youtube_url) 85 | ) 86 | ] 87 | ), 88 | config=types.GenerateContentConfig( 89 | response_mime_type="application/json", response_schema=VideoData 90 | ) 91 | ) 92 | 93 | 94 | st.subheader(response.parsed.who) 95 | st.write(response.parsed.background) 96 | 97 | for prediction in response.parsed.predictions: 98 | st.subheader(f"{prediction.company_or_asset_class} - {prediction.symbol}") 99 | st.write(f"Discussed at: {prediction.timestamp}") 100 | st.write(prediction.prediction) 101 | 102 | if prediction.symbol: 103 | stock = yf.Ticker(prediction.symbol) 104 | 105 | history = stock.history(start=start_date_str, end=end_date_str, interval="1d") 106 | 107 | 108 | # Prepare data for chart 109 | chart_data = pd.DataFrame(history["Close"]).reset_index() 110 | chart_data.columns = ['Date', 'Close'] # Rename columns for clarity 111 | 112 | start_price = chart_data["Close"].iloc[0] # First price 113 | end_price = chart_data["Close"].iloc[-1] # Last price 114 | total_return = ((end_price / start_price) - 1) * 100 115 | 116 | # display metrics in two columns 117 | col1, col2 = st.columns(2) 118 | 119 | with col1: 120 | st.metric(label="Start Price", value=round(start_price, 2)) 121 | 122 | with col2: 123 | st.metric(label="1 Year Later", value=round(end_price, 2), delta=f"{total_return:.2f}%") 124 | 125 | # Create Altair chart 126 | chart = alt.Chart(chart_data).mark_line().encode( 127 | x=alt.X('Date:T', axis=alt.Axis(format='%Y-%m-%d')), # Full date format 128 | y='Close:Q' 129 | ).properties( 130 | width=600, 131 | height=400 132 | ) 133 | 134 | # Display in Streamlit 135 | st.altair_chart(chart, use_container_width=True) --------------------------------------------------------------------------------
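youtube_guru.py is a Streamlit app rather than a plain script; assuming the setup steps from the README (virtual environment, requirements, and the .env file) are in place, it can be launched with:

```
streamlit run youtube_guru.py
```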