├── .streamlit
│   └── config.toml
├── Gifs
│   ├── arrow_small_new.gif
│   ├── blue_grey_arrow.gif
│   └── boat_new.gif
├── LICENSE
├── README.md
├── openai.png
├── requirements.txt
├── streamlit_app.py
└── utils.py
/.streamlit/config.toml:
--------------------------------------------------------------------------------
1 | [theme]
2 | base="light"
3 | #old
4 | #primaryColor="#18447c"
5 | #new
6 | primaryColor="#2BB5E8"
--------------------------------------------------------------------------------
/Gifs/arrow_small_new.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tylerjrichards/GPT3-Dataset-Generator-V2/565b6a9104a2d92215971784a5307c6a050f8994/Gifs/arrow_small_new.gif
--------------------------------------------------------------------------------
/Gifs/blue_grey_arrow.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tylerjrichards/GPT3-Dataset-Generator-V2/565b6a9104a2d92215971784a5307c6a050f8994/Gifs/blue_grey_arrow.gif
--------------------------------------------------------------------------------
/Gifs/boat_new.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tylerjrichards/GPT3-Dataset-Generator-V2/565b6a9104a2d92215971784a5307c6a050f8994/Gifs/boat_new.gif
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🤖 GPT3 Dataset Generator
2 |
3 | ## About the app
4 |
5 | [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://gpt3-dataset-generator.streamlit.app/)
6 |
7 | This app generates datasets using GPT3. It was developed by [Charly Wargnier](https://twitter.com/DataChaz) and [Tony Kipkemboi](https://twitter.com/_townee), as part of the ❄️ [Snowflake](https://www.snowflake.com/en/) Snowvation Hackathon.
8 |
9 |
15 |
16 | ## What is GPT-3?
17 |
18 | [GPT-3](https://en.wikipedia.org/wiki/GPT-3) is a large language model developed by [OpenAI](https://openai.com/) that can generate human-like text. It has 175 billion parameters and was trained on a vast dataset of internet text. It can be used for tasks such as language translation, chatbot responses, and content generation.
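
For illustration, here is a minimal sketch of the kind of completion call this app makes behind the scenes (the prompt, row count, and `max_tokens` here are placeholders; the app builds its own prompt from your inputs):

```python
import openai

openai.api_key = "YOUR_OPENAI_API_KEY"  # placeholder: supply your own key

# Ask one of the instruct engines for a small pipe-separated table
response = openai.Completion.create(
    model="davinci-instruct-beta-v3",  # the "Davinci" option in the app
    prompt=(
        "Please provide a list of the top 5 sci-fi movies in a "
        "three-column spreadsheet: Title | Year | PG rating"
    ),
    temperature=0.5,
    max_tokens=512,
)
print(response["choices"][0]["text"])
```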
19 |
20 | ## 🎈 What is Streamlit?
21 |
22 | [Streamlit](https://streamlit.io) is an open-source Python library that allows users to create interactive, web-based data visualization and machine learning applications without the need for extensive web development knowledge.
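
As a minimal illustration (not part of this repo), a complete Streamlit app can be just a few lines of Python:

```python
# save as hello.py and run with: streamlit run hello.py
import streamlit as st

st.title("Hello, Streamlit")
name = st.text_input("Your name", value="world")
st.write(f"Hello, {name}!")
```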
23 |
24 | ## 📖 Resources
25 |
26 | - OpenAI
27 | - [OpenAI Playground](https://beta.openai.com/playground)
28 | - [OpenAI Documentation](https://beta.openai.com/docs)
29 | - Streamlit
30 | - [Documentation](https://docs.streamlit.io/)
31 | - [Gallery](https://streamlit.io/gallery)
32 | - [Cheat sheet](https://docs.streamlit.io/library/cheatsheet)
33 | - [Book](https://www.amazon.com/dp/180056550X) (Getting Started with Streamlit for Data Science)
34 | - Deploy your apps using [Streamlit Community Cloud](https://streamlit.io/cloud) in just a few clicks
35 |
--------------------------------------------------------------------------------
/openai.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tylerjrichards/GPT3-Dataset-Generator-V2/565b6a9104a2d92215971784a5307c6a050f8994/openai.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit
2 | streamlit-pills
3 | openai
4 |
5 | streamlit-aggrid==0.2.2-2
6 |
7 | snowflake-connector-python==2.7.7
8 | snowflake-sqlalchemy
9 | sqlalchemy
10 |
11 | # For Postgres
12 | psycopg2-binary
--------------------------------------------------------------------------------
/streamlit_app.py:
--------------------------------------------------------------------------------
1 | # ----------------------Importing libraries----------------------
2 |
3 | import streamlit as st
4 | from streamlit_pills import pills
5 | import pandas as pd
6 | import openai
7 |
8 | # Imports for AgGrid
9 | from st_aggrid import AgGrid, GridUpdateMode, JsCode
10 | from st_aggrid.grid_options_builder import GridOptionsBuilder
11 |
12 | # ----------------------Importing utils.py----------------------
13 |
14 | # For Snowflake (from Tony's utils.py)
15 | import io
16 | from utils import (
17 | connect_to_snowflake,
18 | load_data_to_snowflake,
19 | load_data_to_postgres,
20 | connect_to_postgres,
21 | )
22 |
23 | # ----------------------Page config--------------------------------------
24 |
25 | st.set_page_config(page_title="GPT3 Dataset Generator", page_icon="🤖")
26 |
27 | # ----------------------Sidebar section--------------------------------
28 |
29 | # st.image(
30 | # "Gifs/header.gif",
31 | # )
32 |
33 | st.image("Gifs/boat_new.gif")
34 |
35 | c30, c31, c32 = st.columns([0.2, 0.1, 3])
36 |
37 | with c30:
38 |
39 | st.caption("")
40 |
41 | st.image("openai.png", width=60)
42 |
43 | with c32:
44 |
45 | st.title("GPT3 Dataset Generator")
46 |
47 | st.write(
48 | "This app generates datasets using GPT3. It was created for the ❄️ Snowflake Snowvation Hackathon"
49 | )
50 |
51 | tabMain, tabInfo, tabTo_dos = st.tabs(["Main", "Info", "To-do's"])
52 |
53 | with tabInfo:
54 | st.write("")
55 | st.write("")
56 |
57 | st.subheader("🤖 What is GPT-3?")
58 | st.markdown(
59 |         "[GPT-3](https://en.wikipedia.org/wiki/GPT-3) is a large language model developed by [OpenAI](https://openai.com/) that can generate human-like text. It has 175 billion parameters and was trained on a vast dataset of internet text. It can be used for tasks such as language translation, chatbot responses, and content generation."
60 | )
61 |
62 | st.subheader("🎈 What is Streamlit?")
63 | st.markdown(
64 | "[Streamlit](https://streamlit.io) is an open-source Python library that allows users to create interactive, web-based data visualization and machine learning applications without the need for extensive web development knowledge"
65 | )
66 |
67 | st.write("---")
68 |
69 | st.subheader("📖 Resources")
70 | st.markdown(
71 | """
72 | - OpenAI
73 | - [OpenAI Playground](https://beta.openai.com/playground)
74 | - [OpenAI Documentation](https://beta.openai.com/docs)
75 | - Streamlit
76 | - [Documentation](https://docs.streamlit.io/)
77 | - [Gallery](https://streamlit.io/gallery)
78 | - [Cheat sheet](https://docs.streamlit.io/library/cheatsheet)
79 | - [Book](https://www.amazon.com/dp/180056550X) (Getting Started with Streamlit for Data Science)
80 | - Deploy your apps using [Streamlit Community Cloud](https://streamlit.io/cloud) in just a few clicks
81 | """
82 | )
83 |
84 | with tabTo_dos:
85 |
86 | with st.expander("To-do", expanded=True):
87 | st.write(
88 | """
89 | - [p2] Currently, the results are displayed even if the submit button isn't pressed.
90 | - [p2] There is still an issue with the index where the first element from the JSON is not being displayed.
91 |         - [Post Hackathon] To limit the number of API calls and costs, let's cap the maximum number of results at 5. Alternatively, we can consider removing the free API key.
92 |
93 | """
94 | )
95 | st.write("")
96 |
97 | with st.expander("Done", expanded=True):
98 | st.write(
99 | """
100 |         - [p2] Check if the JSON file is working
101 | - [p2] On Github, remove any unused images and GIFs.
102 |         - [p1] Add a note that Postgres requires localhost
103 | - [p2] Rename the CSV and JSON as per the st-pills variable
104 | - [p2] Change the color of the small arrow
105 | - [p1] Adjust the size of the Gifs
106 | - Add a streamlit badge in the `ReadMe` file
107 | - Add the message "Please enter your API key or choose the `Free Key` option."
108 | - Include a `ReadMe` file
109 | - Add a section for the Snowflake credentials
110 | - Remove password from the Python file
111 | - Add screenshots to the `ReadMe` file
112 | - Include forms in the snowflake postgres section
113 | - Remove the hashed code in the Python file
114 | - Include additional information in the 'info' tab
115 |         - [p1] Fix the download issue by sorting it via session state
116 | - [p1] Make the dataframe from this app editable
117 | - Add more gifs to the app
118 | - Change the color scheme to Snowflake Blue
119 | - Include a section for Snowflake credentials
120 | - Change the colors of the arrows, using this tool (https://lottiefiles.com/lottie-to-gif/convert)
121 | - Try new prompts and implement the best ones
122 | - Add a config file for the color scheme
123 | - Include an option menu using this tool (https://github.com/victoryhb/streamlit-option-menu)
124 | - Display a message when the API key is not provided
125 | - Fix the arrow and rearrange the layout for the API key message
126 | - Check and improve the quality of the prompt output
127 | - Send the app to Tony and upload it to GitHub
128 | - Re-arrange the data on the sidebar
129 | - Change the colors of both gifs to match the overall color scheme
130 |         - Add context about the app being part of the Snowvation project
131 | - Add a button to convert the data to JSON format
132 | - Include the Snowflake logo
133 | - Add a submit button to block API calls unless pressed
134 | - Add a tab with additional information
135 | - Resize the columns in the st.form section
136 | - Add the ability to add the dataset to Snowflake
137 | - Create a section with pills, showcasing examples
138 | - Change the main emoji
139 | - Change the emoji in the tab (page_icon)
140 | - [INFO] Sort out the issue with credits
141 |
142 |
143 |
144 | """
145 | )
146 | st.write("")
147 |
148 | with st.expander("Not needed", expanded=True):
149 | st.write(
150 | """
151 | - Check index issue in readcsv (not an issue as I've changed the script)
152 | - Add the mouse gif (doesn't fit)
153 | - Ask Lukas - automatically resize the columns of a DataFrame
154 | """
155 | )
156 | st.write("")
157 |
158 | st.write("")
159 | st.write("")
160 | st.write("")
161 |
162 |
163 | with tabMain:
164 |
165 | key_choice = st.sidebar.radio(
166 | "",
167 | (
168 | "Your Key",
169 | "Free Key (capped)",
170 | ),
171 | horizontal=True,
172 | )
173 |
174 | if key_choice == "Your Key":
175 |
176 | API_Key = st.sidebar.text_input(
177 | "First, enter your OpenAI API key", type="password"
178 | )
179 |
180 | elif key_choice == "Free Key (capped)":
181 |
182 | API_Key = st.secrets["API_KEY"]
183 |
184 | image_arrow = st.sidebar.image(
185 | "Gifs/blue_grey_arrow.gif",
186 | )
187 |
188 | if key_choice == "Free Key (capped)":
189 |
190 | image_arrow.empty()
191 |
192 | else:
193 |
194 | st.write("")
195 |
196 | st.sidebar.caption(
197 | "No OpenAI API key? Get yours [here!](https://openai.com/blog/api-no-waitlist/)"
198 | )
199 | pass
200 |
201 | st.write("")
202 |
203 | c30, c31, c32 = st.columns([0.2, 0.1, 3])
204 |
205 | st.subheader("① Build your dataset")
206 |
207 | example = pills(
208 | "",
209 | [
210 | "Sci-fi Movies",
211 | "Animals",
212 | "Pop Songs",
213 | "POTUS's Twitter",
214 | "Blank",
215 | ],
216 | [
217 | "🍿",
218 | "🐎",
219 | "🎵",
220 | "🇺🇸",
221 | "👻",
222 | ],
223 | label_visibility="collapsed",
224 | )
225 |
226 | if "counter" not in st.session_state:
227 | st.session_state.counter = 0
228 |
229 | def increment():
230 | st.session_state.counter += 1
231 |
232 | if example == "Sci-fi Movies":
233 |
234 | with st.form("my_form"):
235 |
236 | text_input = st.text_input(
237 | "What is the topic of your dataset?", value="Sci-fi movies"
238 | )
239 |
240 | col1, col2, col3 = st.columns(3, gap="small")
241 |
242 | with col1:
243 | column_01 = st.text_input("1st column", value="Title")
244 |
245 | with col2:
246 | column_02 = st.text_input("2nd column", value="Year")
247 |
248 | with col3:
249 | column_03 = st.text_input("3rd column", value="PG rating")
250 |
251 | col1, col2 = st.columns(2, gap="medium")
252 |
253 | with col1:
254 | number = st.number_input(
255 | "How many rows do you want?",
256 | value=5,
257 | min_value=1,
258 | max_value=20,
259 | step=5,
260 | help="The maximum number of rows is 20.",
261 | )
262 |
263 | with col2:
264 | engine = st.radio(
265 | "GPT3 engine",
266 | (
267 | "Davinci",
268 | "Curie",
269 | "Babbage",
270 | ),
271 | horizontal=True,
272 |                 help="Davinci is the most capable engine, but it's also the slowest. Babbage is the fastest, but it's also the least capable. Curie is somewhere in the middle.",
273 | )
274 |
275 | if engine == "Davinci":
276 | engine = "davinci-instruct-beta-v3"
277 | elif engine == "Curie":
278 | engine = "curie-instruct-beta-v2"
279 | elif engine == "Babbage":
280 | engine = "babbage-instruct-beta"
281 |
282 | st.write("")
283 |
284 | submitted = st.form_submit_button("Build my dataset! ✨", on_click=increment)
285 |
286 | elif example == "Animals":
287 |
288 | with st.form("my_form"):
289 |
290 | text_input = st.text_input(
291 | "What is the topic of your dataset?", value="Fastest animals on earth"
292 | )
293 |
294 | col1, col2, col3 = st.columns(3, gap="small")
295 |
296 | with col1:
297 | column_01 = st.text_input("1st column", value="Animal")
298 |
299 | with col2:
300 | column_02 = st.text_input("2nd column", value="Speed")
301 |
302 | with col3:
303 | column_03 = st.text_input("3rd column", value="Weight")
304 |
305 | col1, col2 = st.columns(2, gap="medium")
306 |
307 | with col1:
308 | number = st.number_input(
309 | "How many rows do you want?",
310 | value=5,
311 | min_value=1,
312 | max_value=20,
313 | step=5,
314 |                 help="The maximum number of rows is 20.",
315 | )
316 |
317 | with col2:
318 | engine = st.radio(
319 | "GPT3 engine",
320 | (
321 | "Davinci",
322 | "Curie",
323 | "Babbage",
324 | ),
325 | horizontal=True,
326 |                 help="Davinci is the most capable engine, but it's also the slowest. Babbage is the fastest, but it's also the least capable. Curie is somewhere in the middle.",
327 | )
328 |
329 | if engine == "Davinci":
330 | engine = "davinci-instruct-beta-v3"
331 | elif engine == "Curie":
332 | engine = "curie-instruct-beta-v2"
333 | elif engine == "Babbage":
334 | engine = "babbage-instruct-beta"
335 |
336 | st.write("")
337 |
338 | submitted = st.form_submit_button("Build my dataset! ✨", on_click=increment)
339 |
340 |     elif example == "Stocks":  # note: "Stocks" is not among the pill options above, so this branch is currently unreachable
341 |
342 | with st.form("my_form"):
343 |
344 | text_input = st.text_input(
345 | "What is the topic of your dataset?", value="Stocks"
346 | )
347 |
348 | col1, col2, col3 = st.columns(3, gap="small")
349 |
350 | with col1:
351 | column_01 = st.text_input("1st column", value="Ticker")
352 |
353 | with col2:
354 | column_02 = st.text_input("2nd column", value="Price")
355 |
356 | with col3:
357 | column_03 = st.text_input("3rd column", value="Exchange")
358 |
359 | col1, col2 = st.columns(2, gap="medium")
360 |
361 | with col1:
362 | number = st.number_input(
363 | "How many rows do you want?",
364 | value=5,
365 | min_value=1,
366 | max_value=20,
367 | step=5,
368 |                 help="The maximum number of rows is 20.",
369 | )
370 |
371 | with col2:
372 | engine = st.radio(
373 | "GPT3 engine",
374 | (
375 | "Davinci",
376 | "Curie",
377 | "Babbage",
378 | ),
379 | horizontal=True,
380 |                 help="Davinci is the most capable engine, but it's also the slowest. Babbage is the fastest, but it's also the least capable. Curie is somewhere in the middle.",
381 | )
382 |
383 | if engine == "Davinci":
384 | engine = "davinci-instruct-beta-v3"
385 | elif engine == "Curie":
386 | engine = "curie-instruct-beta-v2"
387 | elif engine == "Babbage":
388 | engine = "babbage-instruct-beta"
389 |
390 | st.write("")
391 |
392 | submitted = st.form_submit_button("Build my dataset! ✨", on_click=increment)
393 |
394 | elif example == "POTUS's Twitter":
395 |
396 | with st.form("my_form"):
397 |
398 | text_input = st.text_input(
399 | "What is the topic of your dataset?", value="POTUS's Twitter accounts"
400 | )
401 |
402 | col1, col2, col3 = st.columns(3, gap="small")
403 |
404 | with col1:
405 | column_01 = st.text_input("1st column", value="Name")
406 |
407 | with col2:
408 | column_02 = st.text_input("2nd column", value="Twitter handle")
409 |
410 | with col3:
411 | column_03 = st.text_input("3rd column", value="# of followers")
412 |
413 | col1, col2 = st.columns(2, gap="medium")
414 |
415 | with col1:
416 | number = st.number_input(
417 | "How many rows do you want?",
418 | value=5,
419 | min_value=1,
420 | max_value=20,
421 | step=5,
422 |                 help="The maximum number of rows is 20.",
423 | )
424 |
425 | with col2:
426 | engine = st.radio(
427 | "GPT3 engine",
428 | (
429 | "Davinci",
430 | "Curie",
431 | "Babbage",
432 | ),
433 | horizontal=True,
434 |                 help="Davinci is the most capable engine, but it's also the slowest. Babbage is the fastest, but it's also the least capable. Curie is somewhere in the middle.",
435 | )
436 |
437 | if engine == "Davinci":
438 | engine = "davinci-instruct-beta-v3"
439 | elif engine == "Curie":
440 | engine = "curie-instruct-beta-v2"
441 | elif engine == "Babbage":
442 | engine = "babbage-instruct-beta"
443 |
444 | st.write("")
445 |
446 | submitted = st.form_submit_button("Build my dataset! ✨")
447 |
448 | elif example == "Pop Songs":
449 |
450 | with st.form("my_form"):
451 |
452 | text_input = st.text_input(
453 | "What is the topic of your dataset?",
454 | value="Most famous songs of all time",
455 | )
456 |
457 | col1, col2, col3 = st.columns(3, gap="small")
458 |
459 | with col1:
460 | column_01 = st.text_input("1st column", value="Song")
461 |
462 | with col2:
463 | column_02 = st.text_input("2nd column", value="Artist")
464 |
465 | with col3:
466 | column_03 = st.text_input("3rd column", value="Genre")
467 |
468 | col1, col2 = st.columns(2, gap="medium")
469 |
470 | with col1:
471 | number = st.number_input(
472 | "How many rows do you want?",
473 | value=5,
474 | min_value=1,
475 | max_value=20,
476 | step=5,
477 |                 help="The maximum number of rows is 20.",
478 | )
479 |
480 | with col2:
481 | engine = st.radio(
482 | "GPT3 engine",
483 | (
484 | "Davinci",
485 | "Curie",
486 | "Babbage",
487 | ),
488 | horizontal=True,
489 |                 help="Davinci is the most capable engine, but it's also the slowest. Babbage is the fastest, but it's also the least capable. Curie is somewhere in the middle.",
490 | )
491 |
492 | if engine == "Davinci":
493 | engine = "davinci-instruct-beta-v3"
494 | elif engine == "Curie":
495 | engine = "curie-instruct-beta-v2"
496 | elif engine == "Babbage":
497 | engine = "babbage-instruct-beta"
498 |
499 | st.write("")
500 |
501 | submitted = st.form_submit_button("Build my dataset! ✨")
502 |
503 | elif example == "Blank":
504 |
505 | with st.form("my_form"):
506 |
507 | text_input = st.text_input("What is the topic of your dataset?", value="")
508 |
509 | col1, col2, col3 = st.columns(3, gap="small")
510 |
511 | with col1:
512 | column_01 = st.text_input("1st column", value="")
513 |
514 | with col2:
515 | column_02 = st.text_input("2nd column", value="")
516 |
517 | with col3:
518 | column_03 = st.text_input("3rd column", value="")
519 |
520 | col1, col2 = st.columns(2, gap="medium")
521 |
522 | with col1:
523 | number = st.number_input(
524 | "How many rows do you want?",
525 | value=5,
526 | min_value=1,
527 | max_value=20,
528 | step=5,
529 |                 help="The maximum number of rows is 20.",
530 | )
531 |
532 | with col2:
533 | engine = st.radio(
534 | "GPT3 engine",
535 | (
536 | "Davinci",
537 | "Curie",
538 | "Babbage",
539 | ),
540 | horizontal=True,
541 |                 help="Davinci is the most capable engine, but it's also the slowest. Babbage is the fastest, but it's also the least capable. Curie is somewhere in the middle.",
542 | )
543 |
544 | if engine == "Davinci":
545 | engine = "davinci-instruct-beta-v3"
546 | elif engine == "Curie":
547 | engine = "curie-instruct-beta-v2"
548 | elif engine == "Babbage":
549 | engine = "babbage-instruct-beta"
550 |
551 | st.write("")
552 |
553 | submitted = st.form_submit_button("Build my dataset! ✨")
554 |
555 | # ----------------------API key section----------------------------------
556 |
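# Request one extra row: the first line the model returns is typically a header row, which read_csv consumes as column names when parsing the output below.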
557 | number = number + 1
558 |
559 | if not API_Key and not submitted:
560 |
561 | st.stop()
562 |
563 | if not API_Key and submitted:
564 |
565 | st.info("Please enter your API key or choose the `Free Key` option.")
566 | st.stop()
567 |
568 | if st.session_state.counter >= 100:
569 |
570 |     pass  # no-op placeholder; a cap on free-key usage could be enforced here
571 |
572 | # ----------------------API key section----------------------------------
573 |
574 | if not submitted and st.session_state.counter == 0:
575 |
576 | c30, c31, c32 = st.columns([1, 0.01, 4])
577 |
578 | with c30:
579 |
580 | st.image("Gifs/arrow_small_new.gif")
581 | st.caption("")
582 |
583 | with c32:
584 |
585 | st.caption("")
586 | st.caption("")
587 |
588 | st.info(
589 | "Enter your dataset's criteria and click the button to generate it."
590 | )
591 |
592 | st.stop()
593 |
594 | elif st.session_state.counter > 0:
595 |
596 | c30, c31, c32 = st.columns([1, 0.9, 3])
597 |
598 | openai.api_key = API_Key
599 |
600 | # ----------------------API call section----------------------------------
601 |
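    # Ask GPT-3 for a pipe-delimited table built from the user's topic and column labels; the text is parsed into a DataFrame below.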
602 | response = openai.Completion.create(
603 | model=engine,
604 | prompt=f"Please provide a list of the top {number} {text_input} along with the following information in a three-column spreadsheet: {column_01}, {column_02}, and {column_03}. The columns should be labeled as follows: {column_01} | {column_02} | {column_03}",
605 | temperature=0.5,
606 | max_tokens=1707,
607 | top_p=1,
608 | best_of=2,
609 | frequency_penalty=0,
610 | presence_penalty=0,
611 | )
612 |
613 | st.write("___")
614 |
615 | st.subheader("② Check the results")
616 |
617 | with st.expander("See the API Json output"):
618 | response
619 |
620 | output_code = response["choices"][0]["text"]
621 |
622 | # ----------------------Dataframe section----------------------------------
623 |
624 | # create pandas DataFrame from string
625 | df = pd.read_csv(io.StringIO(output_code), sep="|")
626 | # get the number of columns in the dataframe
627 | num_columns = len(df.columns)
628 |
629 | # create a list of column names
630 | column_names = ["Column {}".format(i) for i in range(1, num_columns + 1)]
631 |
632 | # add the header to the dataframe
633 | df.columns = column_names
634 |
635 | # specify the mapping of old column names to new column names
636 | column_mapping = {
637 | "Column 1": column_01,
638 | "Column 2": column_02,
639 | "Column 3": column_03,
640 | }
641 |
642 | # rename the columns of the dataframe
643 | df = df.rename(columns=column_mapping)
644 |
645 | st.write("")
646 |
647 | # ----------------------AgGrid section----------------------------------
648 |
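    # Configure the interactive grid: pagination, editable and groupable columns, and multi-row selection.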
649 | gd = GridOptionsBuilder.from_dataframe(df)
650 | gd.configure_pagination(enabled=True)
651 | gd.configure_default_column(editable=True, groupable=True)
652 | gd.configure_selection(selection_mode="multiple")
653 | gridoptions = gd.build()
654 | grid_table = AgGrid(
655 | df,
656 | gridOptions=gridoptions,
657 | update_mode=GridUpdateMode.SELECTION_CHANGED,
658 | theme="material",
659 | )
660 |
661 | # df
662 |
663 | # ----------------------Download section--------------------------------------
664 |
665 | c30, c31, c32, c33 = st.columns([1, 0.01, 1, 2.5])
666 |
667 | with c30:
668 |
669 | @st.cache
670 | def convert_df(df):
671 | return df.to_csv().encode("utf-8")
672 |
673 | csv = convert_df(df)
674 |
675 | st.download_button(
676 | label="Download CSV",
677 | data=csv,
678 |                 file_name=f"{example} dataset.csv",
679 | mime="text/csv",
680 | )
681 |
682 | with c32:
683 |
684 | json_string = df.to_json(orient="records")
685 |
686 | st.download_button(
687 | label="Download JSON",
688 | data=json_string,
689 | file_name="data_set_sample.json",
690 |                 mime="application/json",
691 | )
692 |
693 | st.write("___")
694 |
695 | st.subheader("③ Load data to Databases")
696 |
697 | # Data to load to database(s)
698 | # df = pd.read_csv("philox-testset-1.csv")
699 |
700 | # Get user input for data storage option
701 | storage_option = st.radio(
702 | "Select data storage option:",
703 | (
704 | "Snowflake",
705 | "PostgreSQL",
706 | ),
707 | horizontal=True,
708 | )
709 |
710 | # Get user input for data storage option
711 | # Snowflake = st.selectbox(
712 | # "Select data storage option:", ["Snowflake", "Snowflake"]
713 | # )
714 |
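    # NOTE: this helper only assigns local variables, so calling it does not actually clear the credential widgets.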
715 | @st.cache(allow_output_mutation=True)
716 | def reset_form_fields():
717 | user = ""
718 | password = ""
719 | account = ""
720 | warehouse = ""
721 | database = ""
722 | schema = ""
723 | table = ""
724 | host = ""
725 | port = ""
726 |
727 | if storage_option == "Snowflake":
728 | st.subheader("`Enter Snowflake Credentials`👇")
729 | # Get user input for Snowflake credentials
730 |
731 | with st.form("my_form_db"):
732 |
733 | col1, col2 = st.columns(2, gap="small")
734 |
735 | with col1:
736 | user = st.text_input("Username:", value="TONY")
737 | with col2:
738 | password = st.text_input("Password:", type="password")
739 |
740 | with col1:
741 | account = st.text_input("Account:", value="jn27194.us-east4.gcp")
742 | with col2:
743 | warehouse = st.text_input("Warehouse:", value="NAH")
744 |
745 | with col1:
746 | database = st.text_input("Database:", value="SNOWVATION")
747 | with col2:
748 | schema = st.text_input("Schema:", value="PUBLIC")
749 |
750 | table = st.text_input("Table:")
751 |
752 | st.write("")
753 |
754 | submitted = st.form_submit_button("Load to Snowflake")
755 |
756 | # Load the data to Snowflake
757 | if submitted:
758 | # if st.button("Load data to Snowflake"):
759 | if (
760 | user
761 | and password
762 | and account
763 | and warehouse
764 | and database
765 | and schema
766 | and table
767 | ):
768 | conn = connect_to_snowflake(
769 | username=user,
770 | password=password,
771 | account=account,
772 | warehouse=warehouse,
773 | database=database,
774 | schema=schema,
775 | )
776 | if conn:
777 | load_data_to_snowflake(df, conn, table)
778 | else:
779 | st.warning("Please enter all Snowflake credentials")
780 |
781 | elif storage_option == "PostgreSQL":
782 | st.subheader("`Enter PostgreSQL Credentials`👇")
783 | st.error("Localhost only")
784 | # Get user input for PostgreSQL credentials
785 |
786 | with st.form("my_form_db"):
787 |
788 | col1, col2 = st.columns(2, gap="small")
789 |
790 | with col1:
791 | user = st.text_input("Username:", value="postgres")
792 | with col2:
793 | password = st.text_input("Password:", type="password")
794 | with col1:
795 | host = st.selectbox("Host:", ["localhost", "other"])
796 | if host == "other":
797 | host = st.text_input("Enter host:")
798 | with col2:
799 | port = st.text_input("Port:", value="5432")
800 | with col1:
801 | database = st.text_input("Database:", value="snowvation")
802 | with col2:
803 | table = st.text_input("Table:")
804 |
805 | st.write("")
806 |
807 | submitted = st.form_submit_button("Load to PostgreSQL")
808 |
809 | # Load the data to PostgreSQL
810 | # if st.button("Load data to PostgreSQL"):
811 | if submitted:
812 | if user and password and host and port and database and table:
813 | conn = connect_to_postgres(
814 | username=user,
815 | password=password,
816 | host=host,
817 | port=port,
818 | database=database,
819 | )
820 | if conn:
821 | load_data_to_postgres(df, conn, table)
822 | else:
823 | st.warning("Please enter all PostgreSQL credentials and table name")
824 |
825 | # Reset form fields when storage_option changes
826 | reset_form_fields()
827 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sqlalchemy import create_engine, exc, engine
3 | from snowflake.sqlalchemy import URL
4 | import streamlit as st
5 |
6 |
7 | def connect_to_snowflake(
8 | username: str,
9 | password: str,
10 | account: str,
11 | warehouse: str,
12 | database: str,
13 | schema: str,
14 | ) -> engine.Engine:
15 | """
16 | Connect to Snowflake using the specified credentials.
17 | Parameters:
18 | - username (str): Snowflake username
19 | - password (str): Snowflake password
20 | - account (str): Snowflake account name
21 | - warehouse (str): Snowflake warehouse name
22 | - database (str): Snowflake database name
23 | - schema (str): Snowflake schema name
24 | Returns:
25 | - Engine: SQLAlchemy Engine object for the connection
26 | """
27 |
28 | try:
29 | conn = create_engine(
30 | URL(
31 | user=username,
32 | password=password,
33 | account=account,
34 | warehouse=warehouse,
35 | database=database,
36 | schema=schema,
37 | )
38 | )
39 | return conn
40 | except exc.SQLAlchemyError as err:
41 | st.error(f"Error connecting to Snowflake: {err}")
42 | return None
43 |
44 |
45 | def load_data_to_snowflake(df: pd.DataFrame, conn: engine.Engine, table: str) -> None:
46 | """
47 |     Load a pandas DataFrame into a table in Snowflake.
48 |     Parameters:
49 |     - df (pd.DataFrame): DataFrame containing the data to load
50 |     - conn (Engine): SQLAlchemy Engine object for the connection
51 | - table (str): Snowflake table name
52 | Returns:
53 | - None
54 | """
55 | try:
56 | # Load data to Snowflake
57 | df.to_sql(table, conn, if_exists="replace", index=False)
58 | st.success("Data loaded to Snowflake successfully")
59 | st.snow()
60 | except Exception as err:
61 |         st.error(f"Error loading data to Snowflake: {err}")
62 |
63 |
64 | def connect_to_postgres(
65 | username: str, password: str, host: str, port: str, database: str
66 | ) -> engine.Engine:
67 | """
68 | Connect to PostgreSQL using the specified credentials.
69 | Parameters:
70 | - username (str): PostgreSQL username
71 | - password (str): PostgreSQL password
72 | - host (str): PostgreSQL host name
73 | - port (str): PostgreSQL port
74 | - database (str): PostgreSQL database name
75 | Returns:
76 | - Engine: SQLAlchemy Engine object for the connection
77 | """
78 | try:
79 | conn = create_engine(
80 | f"postgresql://{username}:{password}@{host}:{port}/{database}"
81 | )
82 | return conn
83 | except exc.SQLAlchemyError as err:
84 | st.error(f"Error connecting to PostgreSQL: {err}")
85 | return None
86 |
87 |
88 | def load_data_to_postgres(df: pd.DataFrame, conn: engine.Engine, table: str) -> None:
89 | """
90 |     Load a pandas DataFrame into a table in PostgreSQL.
91 | Parameters:
92 | - df (pd.DataFrame): DataFrame containing the data to load
93 | - conn (engine): SQLAlchemy Engine object for the connection
94 | - table (str): PostgreSQL table name
95 | Returns:
96 | - None
97 | """
98 | try:
99 | # Load data to PostgreSQL
100 | df.to_sql(table, conn, if_exists="replace", index=False)
101 | st.success("Data loaded to PostgreSQL successfully")
102 | st.balloons()
103 | except Exception as err:
104 | st.error(f"Error loading data to PostgreSQL: {err}")
105 |
106 |
107 | def main():
108 | st.title("Load Data to Databases")
109 |
110 | # Data to load to database(s)
111 | df = pd.read_csv("philox-testset-1.csv")
112 |
113 | # Get user input for data storage option
114 | storage_option = st.selectbox(
115 | "Select data storage option:", ["Snowflake", "PostgreSQL"]
116 | )
117 |
118 | @st.cache(allow_output_mutation=True)
119 | def reset_form_fields():
120 | user = ""
121 | password = ""
122 | account = ""
123 | warehouse = ""
124 | database = ""
125 | schema = ""
126 | table = ""
127 | host = ""
128 | port = ""
129 |
130 | if storage_option == "Snowflake":
131 | st.subheader("Enter Snowflake Credentials")
132 | # Get user input for Snowflake credentials
133 | user = st.text_input("Username:", value="TONY")
134 | password = st.text_input("Password:", type="password")
135 | account = st.text_input("Account:", value="jn27194.us-east4.gcp")
136 | warehouse = st.text_input("Warehouse:", value="NAH")
137 | database = st.text_input("Database:", value="SNOWVATION")
138 | schema = st.text_input("Schema:", value="PUBLIC")
139 | table = st.text_input("Table:")
140 |
141 | # Load the data to Snowflake
142 | if st.button("Load data to Snowflake"):
143 | if (
144 | user
145 | and password
146 | and account
147 | and warehouse
148 | and database
149 | and schema
150 | and table
151 | ):
152 | conn = connect_to_snowflake(
153 | username=user,
154 | password=password,
155 | account=account,
156 | warehouse=warehouse,
157 | database=database,
158 | schema=schema,
159 | )
160 | if conn:
161 | load_data_to_snowflake(df, conn, table)
162 | else:
163 | st.warning("Please enter all Snowflake credentials")
164 |
165 | elif storage_option == "PostgreSQL":
166 | st.subheader("Enter PostgreSQL Credentials")
167 | # Get user input for PostgreSQL credentials
168 | user = st.text_input("Username:", value="postgres")
169 | password = st.text_input("Password:", type="password")
170 | host = st.selectbox("Host:", ["localhost", "other"])
171 | if host == "other":
172 | host = st.text_input("Enter host:")
173 | port = st.text_input("Port:", value="5432")
174 | database = st.text_input("Database:", value="snowvation")
175 | table = st.text_input("Table:")
176 |
177 | # Load the data to PostgreSQL
178 | if st.button("Load data to PostgreSQL"):
179 | if user and password and host and port and database and table:
180 | conn = connect_to_postgres(
181 | username=user,
182 | password=password,
183 | host=host,
184 | port=port,
185 | database=database,
186 | )
187 | if conn:
188 | load_data_to_postgres(df, conn, table)
189 | else:
190 | st.warning("Please enter all PostgreSQL credentials and table name")
191 |
192 | # Reset form fields when storage_option changes
193 | reset_form_fields()
194 |
195 |
196 | if __name__ == "__main__":
197 | main()
198 |
--------------------------------------------------------------------------------