├── .env.sample ├── .gitignore ├── README.md ├── examine_image.py ├── extract_newsletter_pdf.py ├── extract_podcast.py ├── extract_podcast_split.py ├── extract_youtube.py ├── files ├── citrini_24_trades.pdf └── ipo_pulse.png ├── requirements.txt ├── split_audio.sh └── youtube_guru.py /.env.sample: -------------------------------------------------------------------------------- 1 | GOOGLE_API_KEY=your_google_api_key -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 
115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | 173 | 174 | *.mp3 175 | *.mkv 176 | .DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multimodal Structured Extraction with Gemini 2.0 Flash and Google GenAI Python SDK 1.0 3 | 4 | In this tutorial we extract structured data from a variety of sources, including an investment newsletter PDF, a podcast, and a YouTube video. 
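Each script in this repo follows the same basic pattern: define a Pydantic model describing the fields you want, then pass it to `generate_content` as a `response_schema` so Gemini returns JSON in that shape. A minimal sketch of the pattern (the prompt text and the `Company` fields here are illustrative only, not taken from any specific script below):

```python
# minimal structured-extraction sketch; prompt and Company fields are illustrative
from google import genai
from google.genai import types
from pydantic import BaseModel

from dotenv import load_dotenv
load_dotenv()  # reads GOOGLE_API_KEY from the .env file described below

client = genai.Client()

class Company(BaseModel):
    name: str
    symbol: str

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents="Name two large US technology companies and their ticker symbols.",
    config=types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=list[Company],
    ),
)

print(response.text)  # JSON array matching the Company schema
```

The scripts in this repo apply this same pattern to a PDF, an audio file, and a video.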
5 | 6 | ## Setup 7 | ``` 8 | python3 -m venv venv 9 | source venv/bin/activate 10 | pip install -r requirements.txt 11 | ``` 12 | 13 | ## Create a .env file in the root directory with your Gemini API key 14 | ``` 15 | GOOGLE_API_KEY=your_google_api_key 16 | ``` 17 | 18 | ## Handy Commands 19 | 20 | ### Download YouTube video 21 | ``` 22 | yt-dlp https://www.youtube.com/youryoutubeurl 23 | ``` 24 | 25 | ### Download YouTube audio 26 | ``` 27 | yt-dlp -x --audio-format mp3 https://www.youtube.com/youryoutubeurl 28 | ``` 29 | 30 | ### Split audio 31 | ``` 32 | ffmpeg -i input.mp3 -f segment -segment_time 10 -c copy output_%03d.mp3 33 | ``` 34 | 35 | ### Or use the shell script split_audio.sh 36 | ``` 37 | ./split_audio.sh input.mp3 38 | ``` 39 | 40 | ### Extract audio from video 41 | ``` 42 | ffmpeg -i input.mp4 -q:a 0 -map a output.mp3 43 | ``` 44 | 45 | -------------------------------------------------------------------------------- /examine_image.py: -------------------------------------------------------------------------------- 1 | from google import genai 2 | import PIL.Image 3 | 4 | from dotenv import load_dotenv 5 | load_dotenv() 6 | 7 | # set up gemini 8 | client = genai.Client() 9 | model = "gemini-2.0-flash" 10 | 11 | # first test: text-only prompt 12 | # response = client.models.generate_content(model=model, contents="What is the stock symbol for Apple?") 13 | 14 | # print(response.text) 15 | 16 | # second test: prompt with an image 17 | image = PIL.Image.open('files/ipo_pulse.png') 18 | 19 | response = client.models.generate_content(model=model, contents=["According to this chart, when did the IPO market peak, when did it bottom out, and what does it look like for 2025?", image]) 20 | 21 | print(response.text) -------------------------------------------------------------------------------- /extract_newsletter_pdf.py: -------------------------------------------------------------------------------- 1 | import json 2 | from google import genai 3 | 4 | from google.genai import types 5 | from pydantic import BaseModel 6 | from typing import Optional 7 | 8 | # load environment variables 9 | from dotenv import load_dotenv 10 | load_dotenv() 11 | 12 | # set up gemini 13 | client = genai.Client() 14 | model = "gemini-2.0-flash" 15 | 16 | # set up pydantic models for companies and themes 17 | class Company(BaseModel): 18 | name: str 19 | public: bool 20 | symbol: Optional[str] 21 | long: Optional[bool] 22 | 23 | class Theme(BaseModel): 24 | name: str 25 | 26 | 27 | # upload the pdf file to gemini 28 | file_ref = client.files.upload(file="files/citrini_24_trades.pdf") 29 | 30 | # prepare the prompt 31 | extract_themes_prompt = """ 32 | Attached is a list of thematic trade ideas for 2024. Analyze the attached document and extract all of the theme names discussed. 
33 | """ 34 | 35 | # count the tokens in the prompt and file 36 | print(client.models.count_tokens(model=model, contents=[extract_themes_prompt, file_ref])) 37 | 38 | # send the prompt and file to gemini 39 | result = client.models.generate_content( 40 | model=model, 41 | contents=[file_ref, extract_themes_prompt], 42 | config=types.GenerateContentConfig( 43 | response_mime_type="application/json", 44 | response_schema=list[Theme] 45 | ), 46 | ) 47 | 48 | # # debug print for the raw result object 49 | print(result) 50 | 51 | # # load and show the json structure of the result 52 | themes = json.loads(result.text) 53 | 54 | print(themes) 55 | 56 | # # loop through each theme and build a prompt to extract the companies mentioned in the theme 57 | for theme in themes: 58 | print(theme["name"]) 59 | 60 | extract_companies_prompt = f""" 61 | Attached a a list of thematic trade ideas for 2025. I am only interested in the companies mentioned in the theme: {theme["name"]}. 62 | Extract all of the companies mentioned in the theme, including the company name, whether they are publicly traded or not, the ticker symbol associated with each company (if it is publicly traded), and whether the company was recommended to go long or short, True if long, False if short. 63 | """ 64 | 65 | result = client.models.generate_content( 66 | model=model, 67 | contents=[file_ref, extract_companies_prompt], 68 | config=types.GenerateContentConfig( 69 | response_mime_type="application/json", 70 | response_schema=list[Company] 71 | ), 72 | ) 73 | 74 | companies= json.loads(result.text) 75 | for company in companies: 76 | print(company) 77 | -------------------------------------------------------------------------------- /extract_podcast.py: -------------------------------------------------------------------------------- 1 | import json, time 2 | from google import genai 3 | 4 | from google.genai import types 5 | from pydantic import BaseModel 6 | from typing import Optional 7 | 8 | # load environment variables 9 | from dotenv import load_dotenv 10 | load_dotenv() 11 | 12 | # set up gemini 13 | client = genai.Client() 14 | model = "gemini-2.0-flash" 15 | 16 | podcast_file = client.files.upload(file='files/lex_fridman_podcast_dylan_patel.mp3') 17 | 18 | print(podcast_file.name) 19 | 20 | while podcast_file.state.name == "PROCESSING": 21 | print("processing video...") 22 | time.sleep(5) 23 | print("podcast file name:") 24 | print(podcast_file.name) 25 | podcast_file = client.files.get(name=podcast_file.name) 26 | 27 | # podcast_file = genai.get_file("files/m1bayt1bic9m") 28 | 29 | class Prediction(BaseModel): 30 | prediction: str 31 | timeframe: str 32 | 33 | 34 | prompt = """ 35 | I have attached the audio of a podcast. Give me a list of predictions made by the interviewee and the timeframe of the prediction. 
36 | """ 37 | 38 | # count the tokens in the prompt and file 39 | print(client.models.count_tokens(model=model, contents=[podcast_file, prompt])) 40 | 41 | # send the prompt and file to gemini 42 | result = client.models.generate_content( 43 | model=model, 44 | contents=[podcast_file, prompt], 45 | config=types.GenerateContentConfig( 46 | response_mime_type="application/json", response_schema=list[Prediction] 47 | ) 48 | ) 49 | 50 | #print(result) 51 | response = json.loads(result.text) 52 | 53 | print(json.dumps(response, indent=4)) -------------------------------------------------------------------------------- /extract_podcast_split.py: -------------------------------------------------------------------------------- 1 | import json, time, os 2 | from google import genai 3 | 4 | from google.genai import types 5 | from pydantic import BaseModel 6 | from typing import Optional 7 | 8 | # load environment variables 9 | from dotenv import load_dotenv 10 | load_dotenv() 11 | 12 | # set up gemini 13 | client = genai.Client() 14 | model = "gemini-2.0-flash" 15 | 16 | # get the list of files in the split directory 17 | files = os.listdir('lex_fridman_podcast_dylan_patel_split') 18 | 19 | # sort the files by name 20 | files.sort() 21 | 22 | for file in files: 23 | podcast_file = client.files.upload(file=f'lex_fridman_podcast_dylan_patel_split/{file}') 24 | 25 | print(podcast_file.name) 26 | 27 | while podcast_file.state.name == "PROCESSING": 28 | print("processing video...") 29 | time.sleep(5) 30 | print("podcast file name:") 31 | print(podcast_file.name) 32 | podcast_file = client.files.get(name=podcast_file.name) 33 | 34 | # podcast_file = genai.get_file("files/m1bayt1bic9m") 35 | 36 | class Prediction(BaseModel): 37 | prediction: str 38 | timeframe: str 39 | 40 | 41 | prompt = """ 42 | I have attached the audio of a podcast. Give me a list of predictions made by the interviewee and the timeframe of the prediction. 
43 | """ 44 | 45 | # count the tokens in the prompt and file 46 | print(client.models.count_tokens(model=model, contents=[podcast_file, prompt])) 47 | 48 | # send the prompt and file to gemini 49 | result = client.models.generate_content( 50 | model=model, 51 | contents=[podcast_file, prompt], 52 | config=types.GenerateContentConfig( 53 | response_mime_type="application/json", response_schema=list[Prediction] 54 | ) 55 | ) 56 | 57 | #print(result) 58 | response = json.loads(result.text) 59 | 60 | print(json.dumps(response, indent=4)) -------------------------------------------------------------------------------- /extract_youtube.py: -------------------------------------------------------------------------------- 1 | import json, time, os 2 | from google import genai 3 | 4 | from google.genai import types 5 | from pydantic import BaseModel 6 | from typing import Optional 7 | 8 | # load environment variables 9 | from dotenv import load_dotenv 10 | load_dotenv() 11 | 12 | # set up gemini 13 | client = genai.Client() 14 | model = "gemini-2.0-flash" 15 | 16 | video_file = client.files.upload(file='files/parttimelarry_youtube.mkv') 17 | 18 | while video_file.state.name == "PROCESSING": 19 | print("processing video...") 20 | time.sleep(5) 21 | print("video file name:") 22 | print(video_file.name) 23 | video_file = client.files.get(name=video_file.name) 24 | 25 | #video_file = client.files.get(name="files/file123") 26 | 27 | # set up pydantic models for companies and themes 28 | class Company(BaseModel): 29 | name: str 30 | bullish_or_bearish: str 31 | why: str 32 | 33 | 34 | extract_ideas_from_video_prompt = """ 35 | I have attached a YouTube video. Listen to the video and give me a list of stocks the YouTuber mentioned in the video, whether they were bullish or bearish on the stock, and why. 
36 | """ 37 | 38 | # count the tokens in the prompt and file 39 | print(client.models.count_tokens(model=model, contents=[video_file, extract_ideas_from_video_prompt])) 40 | 41 | # send the prompt and file to gemini 42 | result = client.models.generate_content( 43 | model=model, 44 | contents=[video_file, extract_ideas_from_video_prompt], 45 | config=types.GenerateContentConfig( 46 | response_mime_type="application/json", response_schema=list[Company] 47 | ) 48 | ) 49 | 50 | print(json.loads(result.text)) 51 | -------------------------------------------------------------------------------- /files/citrini_24_trades.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackingthemarkets/gemini-multimodal-structured-extraction/d870ce8a2ccd0d1cdaa9e6437fa5076fd625eaff/files/citrini_24_trades.pdf -------------------------------------------------------------------------------- /files/ipo_pulse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hackingthemarkets/gemini-multimodal-structured-extraction/d870ce8a2ccd0d1cdaa9e6437fa5076fd625eaff/files/ipo_pulse.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | google-genai 2 | python-dotenv 3 | yt-dlp 4 | pillow 5 | streamlit 6 | yfinance 7 | pandas 8 | altair -------------------------------------------------------------------------------- /split_audio.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if a filename is provided 4 | if [ -z "$1" ]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Get input file name 10 | INPUT_FILE="$1" 11 | 12 | # Extract the file name without the extension 13 | BASENAME=$(basename "$INPUT_FILE" .mp3) 14 | 15 | # Create an output directory to store the split files 16 | OUTPUT_DIR="${BASENAME}_split" 17 | mkdir -p "$OUTPUT_DIR" 18 | 19 | # Run ffmpeg to split the file 20 | ffmpeg -i "$INPUT_FILE" -f segment -segment_time 900 -c copy "$OUTPUT_DIR/out%03d.mp3" 21 | 22 | echo "Splitting complete. Files saved in $OUTPUT_DIR/" 23 | -------------------------------------------------------------------------------- /youtube_guru.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime, timedelta 3 | import yt_dlp 4 | import pandas as pd 5 | import yfinance as yf 6 | import streamlit as st 7 | import altair as alt 8 | 9 | # import gemini libraries and tools for extracting structured data 10 | from google import genai 11 | from google.genai import types 12 | from pydantic import BaseModel 13 | 14 | # load environment variables 15 | from dotenv import load_dotenv 16 | load_dotenv() 17 | 18 | # set up gemini 19 | client = genai.Client() 20 | model = "gemini-1.5-pro" 21 | 22 | 23 | class Prediction(BaseModel): 24 | who: str 25 | company_or_asset_class: str 26 | symbol: str 27 | timestamp: str 28 | prediction: str 29 | 30 | 31 | class VideoData(BaseModel): 32 | who: str 33 | background: str 34 | predictions: list[Prediction] 35 | 36 | prompt = """ 37 | Analyze this video. 38 | 39 | Extract who is making predictions in the video. Summarize their background. 40 | 41 | Extract stock picks and predictions. 
Focus on: 42 | 43 | Price targets for specific assets (e.g., company stock, indexes, crypto) with predicted value and timeframe. 44 | Macro predictions (e.g., interest rates, recessions) with event and timeline. 45 | Bullish/bearish sentiment on companies, sectors, or asset classes. 46 | General market calls (e.g., index movements, bull/bear runs). 47 | Event-driven forecasts (e.g., earnings, policy changes). 48 | Risky or contrarian bets (e.g., high-volatility assets, against-consensus calls). 49 | 50 | For each prediction: 51 | 52 | Quote/summarize the prediction and the reason for the prediction, the associated company, stock or asset symbol if possible, and the timestamp of the prediction. 53 | """ 54 | 55 | st.title("YouTube Guru Analyzer") 56 | 57 | youtube_url = st.text_input("Enter the YouTube URL") 58 | 59 | if youtube_url: 60 | ydl_opts = {} 61 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 62 | info = ydl.extract_info(youtube_url, download=False) 63 | # makes the info json-serializable 64 | video_details = ydl.sanitize_info(info) 65 | 66 | video_date = datetime.strptime(video_details['upload_date'], "%Y%m%d") 67 | end_date = video_date + timedelta(days=365) 68 | start_date_str = video_date.strftime("%Y-%m-%d") 69 | end_date_str = end_date.strftime("%Y-%m-%d") 70 | 71 | st.subheader(video_details['title']) 72 | st.subheader(start_date_str) 73 | st.image(video_details['thumbnail']) 74 | 75 | 76 | with st.spinner("Analyzing predictions...", show_time=True): 77 | 78 | response = client.models.generate_content( 79 | model=model, 80 | contents=types.Content( 81 | parts=[ 82 | types.Part(text=prompt), 83 | types.Part( 84 | file_data=types.FileData(file_uri=youtube_url) 85 | ) 86 | ] 87 | ), 88 | config=types.GenerateContentConfig( 89 | response_mime_type="application/json", response_schema=VideoData 90 | ) 91 | ) 92 | 93 | 94 | st.subheader(response.parsed.who) 95 | st.write(response.parsed.background) 96 | 97 | for prediction in response.parsed.predictions: 98 | st.subheader(f"{prediction.company_or_asset_class} - {prediction.symbol}") 99 | st.write(f"Discussed at: {prediction.timestamp}") 100 | st.write(prediction.prediction) 101 | 102 | if prediction.symbol: 103 | stock = yf.Ticker(prediction.symbol) 104 | 105 | history = stock.history(start=start_date_str, end=end_date_str, interval="1d") 106 | 107 | 108 | # Prepare data for chart 109 | chart_data = pd.DataFrame(history["Close"]).reset_index() 110 | chart_data.columns = ['Date', 'Close'] # Rename columns for clarity 111 | 112 | start_price = chart_data["Close"].iloc[0] # First price 113 | end_price = chart_data["Close"].iloc[-1] # Last price 114 | total_return = ((end_price / start_price) - 1) * 100 115 | 116 | # display metrics in two columns 117 | col1, col2 = st.columns(2) 118 | 119 | with col1: 120 | st.metric(label="Start Price", value=round(start_price, 2)) 121 | 122 | with col2: 123 | st.metric(label="1 Year Later", value=round(end_price, 2), delta=f"{total_return:.2f}%") 124 | 125 | # Create Altair chart 126 | chart = alt.Chart(chart_data).mark_line().encode( 127 | x=alt.X('Date:T', axis=alt.Axis(format='%Y-%m-%d')), # Full date format 128 | y='Close:Q' 129 | ).properties( 130 | width=600, 131 | height=400 132 | ) 133 | 134 | # Display in Streamlit 135 | st.altair_chart(chart, use_container_width=True) --------------------------------------------------------------------------------
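youtube_guru.py is a Streamlit app rather than a plain script; assuming the setup steps from the README (virtual environment, requirements, and the .env file) are in place, it can be launched with:

```
streamlit run youtube_guru.py
```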