├── .gitignore
├── .python-version
├── README.md
├── __init__.py
├── data
    ├── halo_data_dump.dump
    └── zachs_posts.csv
├── pyproject.toml
├── schemas
    ├── actor_films.sql
    ├── game_details.sql
    ├── games.sql
    ├── player_seasons.sql
    ├── players.sql
    └── players_scd_table.sql
└── src
    ├── __init__.py
    ├── generate_dag_script.py
    ├── generate_documentation_script.py
    ├── generate_sql_script.py
    ├── langchain
        ├── __init__.py
        ├── database_query_example.py
        └── write_a_post_like_zach.py
    └── util.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | credentials.json
  2 | token.json
  3 | output
  4 | __pycache__
  5 | src/__pycache__
  6 | .idea/
  7 | # Byte-compiled / optimized / DLL files
  8 | __pycache__/
  9 | *.py[cod]
 10 | *$py.class
 11 | 
 12 | # C extensions
 13 | *.so
 14 | 
 15 | # Distribution / packaging
 16 | .Python
 17 | build/
 18 | develop-eggs/
 19 | dist/
 20 | downloads/
 21 | eggs/
 22 | .eggs/
 23 | lib/
 24 | lib64/
 25 | parts/
 26 | sdist/
 27 | var/
 28 | wheels/
 29 | pip-wheel-metadata/
 30 | share/python-wheels/
 31 | *.egg-info/
 32 | .installed.cfg
 33 | *.egg
 34 | MANIFEST
 35 | 
 36 | # PyInstaller
 37 | #  Usually these files are written by a python script from a template
 38 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 39 | *.manifest
 40 | *.spec
 41 | 
 42 | # Installer logs
 43 | pip-log.txt
 44 | pip-delete-this-directory.txt
 45 | 
 46 | # Unit test / coverage reports
 47 | htmlcov/
 48 | .tox/
 49 | .nox/
 50 | .coverage
 51 | .coverage.*
 52 | .cache
 53 | nosetests.xml
 54 | coverage.xml
 55 | *.cover
 56 | *.py,cover
 57 | .hypothesis/
 58 | .pytest_cache/
 59 | 
 60 | # Translations
 61 | *.mo
 62 | *.pot
 63 | 
 64 | # Django stuff:
 65 | *.log
 66 | local_settings.py
 67 | db.sqlite3
 68 | db.sqlite3-journal
 69 | 
 70 | # Flask stuff:
 71 | instance/
 72 | .webassets-cache
 73 | 
 74 | # Scrapy stuff:
 75 | .scrapy
 76 | 
 77 | # Sphinx documentation
 78 | docs/_build/
 79 | 
 80 | # PyBuilder
 81 | target/
 82 | 
 83 | # Jupyter Notebook
 84 | .ipynb_checkpoints
 85 | 
 86 | # IPython
 87 | profile_default/
 88 | ipython_config.py
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 98 | __pypackages__/
 99 | 
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 | 
104 | # SageMath parsed files
105 | *.sage.py
106 | 
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 | 
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 | 
120 | # Rope project settings
121 | .ropeproject
122 | 
123 | # mkdocs documentation
124 | /site
125 | 
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 | 
131 | # Pyre type checker
132 | .pyre/
133 | 
134 | dump.sql
135 | data.dump
136 | 
137 | # Personal workspace files
138 | .idea/*
139 | .vscode/*
140 | .DS_Store
141 | 
142 | # uv
143 | uv.lock
144 | 


--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.12
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # LLM-driven Data Engineering
 2 | 
 3 | ## Accounts to Follow
 4 | 
 5 | People
 6 | - [Li Yin](https://www.linkedin.com/in/li-yin-ai)
 7 | - [Chip Huyen](https://www.linkedin.com/in/chiphuyen/)
 8 | - [Zach Wilson](https://www.linkedin.com/in/eczachly)
 9 | 
10 | Libraries
11 | - [AdalFlow](https://github.com/SylphAI-Inc/AdalFlow)
12 | - [LangChain](https://github.com/langchain-ai/langchain)
13 | - [LlamaIndex](https://github.com/run-llama/llama_index)
14 | 
15 | ## Getting Started
16 | 
17 | Make an OpenAI account [here](https://platform.openai.com/) and then generate an API Key.
18 | For Day 4, you'll need a [Pinecone](https://www.pinecone.io) account and API key. 
19 | 
20 | - Day 1 (LLM-driven data engineering
21 |   - Lecture Video is [here](https://www.dataexpert.io/lesson/large-language-models-day-1-lecture)
22 |   - Lab video is [here](https://www.dataexpert.io/lesson/large-language-models-day-1-lab)
23 | - Day 2 (LLM dev with LangChain)
24 |   - Lecture Video is [here](https://www.dataexpert.io/lesson/large-language-models-day-2-lecture)
25 |   - Lab Video is [here](https://www.dataexpert.io/lesson/large-language-models-day-2-lab)
26 | - Day 3 (Using LLM to provide business value)
27 |   - Auto Feedback Repo [here](https://github.com/DataExpert-io/auto-feedback-example)
28 |   - Lecture Video is [here](https://www.dataexpert.io/lesson/machine-learning-day-1-lecture-v4)
29 |   - Lab Video is [here](https://www.dataexpert.io/lesson/machine-learning-day-1-lab-v4)
30 | - Day 4 (Creating ZachGPT with RAG)
31 |   - Vector Database Repo [here](https://github.com/DataExpert-io/vector-database-example)
32 |   - Lecture Video is [here](https://www.dataexpert.io/lesson/machine-learning-day-2-lecture-v4)
33 |   - Lab Video is [here](https://www.dataexpert.io/lesson/machine-learning-day-2-lab-v4)
34 | 
35 | ## Setup
36 | 
37 | This project use [PostgreSQL](https://www.postgresql.org/download/macosx/)
38 | 
39 | Store the API key as an environment variable like:
40 | `export OPENAI_API_KEY=<your_api_key>`
41 | Or set it in Windows
42 | 
43 | The easiest way to install the dependencies is uv. [Install](https://docs.astral.sh/uv/getting-started/installation/) it.
44 | 
45 | Run the command `uv sync` to install the python environment and all of the libraries under `.venv` folder.
46 | 
47 | You should configure your IDE to select the interpreter under the .venv folder, or activate it through the command on your terminal:
48 | ```sh
49 | source .venv/bin/activate
50 | ```
51 | 
52 | PS: If you don't want to use uv, run
53 | ```sh
54 | pip install .
55 | ```
56 | 
57 | ## Day 1 Lab
58 | 
59 | We'll be using the schemas from Dimensional Data Modeling Week 1 and generating the queries from the homework and labs except this time we'll do it via LLMs
60 | 
61 | ## Day 2 Lab
62 | 
63 | We'll be using Langchain to auto generate SQL queries for us based on tables and writing LinkedIn posts in Zach Wilson's voice
64 | ### Setup
65 | 
66 | If you are watching live, you will be given a cloud database URL to use.
67 | `export LANGCHAIN_DATABASE_URL=<value zach gives in Zoom>`
68 | 
69 | If you aren't watching live, you'll need to use the `halo_data_dump.dump` file located in the `data` folder
70 | 
71 | Running `pg_restore` with your local database should get you up and running pretty quickly. 
72 | 
73 | - example command, assuming you got Postgres up and running either via Homebrew or Docker:
74 |  - `pg_restore -h localhost -p 5432  -d postgres -U <your laptop username> halo_data_dump.dump`
75 | 
76 | ## Day 3 Lab
77 | 
78 | This lab leverages this [repo](https://github.com/DataExpert-io/auto-feedback-example)
79 | 
80 | ## Day 4 Lab
81 | This lab leverages this [repo](https://github.com/DataExpert-io/vector-database-example)
82 | 
83 | Add it to the environment `export PINECONE_API_KEY=<your pinecone API key>`
84 | 
85 | 
86 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataExpert-io/llm-driven-data-engineering/be4dc8ef785eef8f4b0213c5057d850874ba0e1b/__init__.py


--------------------------------------------------------------------------------
/data/halo_data_dump.dump:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataExpert-io/llm-driven-data-engineering/be4dc8ef785eef8f4b0213c5057d850874ba0e1b/data/halo_data_dump.dump


--------------------------------------------------------------------------------
/data/zachs_posts.csv:
--------------------------------------------------------------------------------
  1 | Date,ShareLink,ShareCommentary,SharedURL,MediaURL,Visibility
  2 | 2023-09-18 23:46:14,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7109684072821489665,"Seventy of my boot camp graduates qualified for three or six mentorship sessions by doing the work to get certified! ""
  3 | """"
  4 | ""I've been picky about the group of mentors because I want the best of the best to mentor my students. ""
  5 | """"
  6 | ""This is the list of rockstar mentors that we have so far. (I'll be getting one or two more since I didn't expect so many of my boot camp attendees to get certified!)""
  7 | """"
  8 | ""- Stephanie Murphy, senior data engineer at Tesla, she attended the boot camp ""
  9 | ""- Rimzim Thube, data engineer at Amazon, with over 10 years of experience""
 10 | ""- Bhargavi Reddy Dokuru, senior data engineer at Netflix""
 11 | ""- Ankit Biradar, data-focused software engineer at Uber ""
 12 | ""- Lakshmi Srivalli Kristam, senior data engineer at Grubhub, she was a boot camp mentor too""
 13 | ""- Lakshmi Malladi, data engineer at Salesforce, also ex-Meta, she attended the boot camp""
 14 | ""- Venkatesh Selvaraj, senior data engineer at Meta""
 15 | ""- Lenny A, data engineering leader with many years of experience at Meta and Amazon ""
 16 | ""- Francesco Quaratino, senior data engineer at Unite with over 11 years of experience (for our Europe/Africa attendees)""
 17 | ""- Ankit Shrivastava, senior software engineer at Uber (for our Asia attendees)""
 18 | """"
 19 | ""#dataengineering ",,,MEMBER_NETWORK
 20 | 2023-09-18 23:31:30,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7109680364901564416,Miguel was an amazing student! #dataengineering,,,MEMBER_NETWORK
 21 | 2023-09-18 18:45:52,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7109608483347468288,"I'm excited to announce that I hired my first full-time employee! ""
 22 | """"
 23 | ""JulieAnn is joining EcZachly Inc as a software engineer and business admin! ""
 24 | """"
 25 | ""JulieAnn was one of my students in the first boot camp and she brought a lot of really positive energy and organization skills. Without her, the v1 would've been a very disorganized mess. ""
 26 | """"
 27 | ""In the v2 boot camp, I hired her part-time as a community manager to help organize the Discord and curricula. She played a critical part in getting dataengineer.io to a great place!""
 28 | """"
 29 | ""For the v3 boot camp that will start in early November, we'll be organizing and creating to level up the quality even higher! ""
 30 | """"
 31 | ""Really excited to have you on the team JulieAnn! ""
 32 | """"
 33 | ""#softwarengineering ""
 34 | ""#dataengineering ",,,MEMBER_NETWORK
 35 | 2023-09-17 21:13:35,https://www.linkedin.com/feed/update/urn%3Ali%3AugcPost%3A7109283267106795520,"Did Bill Inmon and Ralph Kimball ever consider cage fighting like Mark Zuckerburg and Elon Musk over who had the better data warehouse methodology? ""
 36 | """"
 37 | ""I asked the man himself in my boot camp. ""
 38 | """"
 39 | ""I hope you enjoy this clip! ""
 40 | """"
 41 | ""#dataengineering ""
 42 | ""#dataarchitecture ",,,MEMBER_NETWORK
 43 | 2023-09-17 18:44:43,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7109245803503775744,"Here’s two hours of free data engineering content on LLM-driven data engineering. Code base and slides linked too. ""
 44 | """"
 45 | ""The lecture:""
 46 | """"
 47 | """"
 48 | ""https://lnkd.in/gaCs8NDz ""
 49 | """"
 50 | ""The lab: ""
 51 | """"
 52 | ""https://lnkd.in/g3nSPWq8""
 53 | """"
 54 | ""I’ll be filming two more hours this Thursday as well! You can register to attend that session here: ""
 55 | """"
 56 | ""https://lnkd.in/gg3MH4vR""
 57 | """"
 58 | """"
 59 | ""#dataengineering",,,MEMBER_NETWORK
 60 | 2023-09-15 23:30:12,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7108592874878889985,"Just finished adding certifications to my boot camp platform! Over 70% of my students in the v2 boot camp got the attendee-level certification which required that you attended at least 70% of the lectures! ""
 61 | """"
 62 | ""Going to be sharing the people who got the excellence-level certification which is attendee + all the homework soon! Grading stuff ferociously right now! ""
 63 | """"
 64 | ""#dataengineering",,,MEMBER_NETWORK
 65 | 2023-09-15 23:27:07,https://www.linkedin.com/feed/update/urn%3Ali%3AugcPost%3A7108592096839663616,"Victor was a really great student! Glad to see he got certified! ""
 66 | """"
 67 | ""#dataengineering",,,MEMBER_NETWORK
 68 | 2023-09-14 19:20:18,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7108167598000807936,"The engineer who asks a stupid question looks dumb for a second. ""
 69 | ""The engineer who doesn’t ask stupid questions looks dumb for a lifetime ""
 70 | """"
 71 | ""#softwareengineering",,,MEMBER_NETWORK
 72 | 2023-09-14 16:55:17,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7108131101629452288,"How will LLMs disrupt different data engineering tasks? ""
 73 | """"
 74 | ""Here’s a scatter plot of almost every data engineering task based on how technical vs soft skill it is and how tactical vs strategic it is.  ""
 75 | """"
 76 | ""The main takeaways from this should be: ""
 77 | """"
 78 | ""- tactical + technical tasks are going to be disrupted a lot""
 79 | """"
 80 | ""- strategic + soft skill tasks are safe ""
 81 | """"
 82 | ""- LLMs + agents will solve the pain of oncall for 80-90% of failures""
 83 | """"
 84 | ""- just knowing SQL, Python and Spark puts your job at risk ""
 85 | """"
 86 | """"
 87 | ""What would you add or change on this chart? Any additional takeaways? ""
 88 | """"
 89 | ""#dataengineering",,,MEMBER_NETWORK
 90 | 2023-09-13 21:40:48,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7107840567920795648,"Clement has the wisdom here. I’m glad I followed my heart in March! The future is so bright! ""
 91 | """"
 92 | ""#mentalhealth",,,MEMBER_NETWORK
 93 | 2023-09-13 01:49:30,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7107540764926074880,"The most important lesson I learned as a software engineer entrepreneur this year. ""
 94 | """"
 95 | ""Write code to facilitate business not to generate it! ""
 96 | """"
 97 | ""#softwareengineering",,,MEMBER_NETWORK
 98 | 2023-09-13 00:01:19,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7107513541049339904,"The number of people DMing me about quitting their big tech jobs is so high right now! ""
 99 | ""Let’s go!!! Big tech exodus! ""
100 | """"
101 | ""#mentalhealth",,,MEMBER_NETWORK
102 | 2023-09-12 20:32:43,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7107461043034931200,"I started my Substack newsletter on June 15th. The 90 days since then have been wild. ""
103 | """"
104 | ""Went from 0 paid subs to 117. ""
105 | ""Went from 9300 free subs to 21700. ""
106 | """"
107 | ""Key lessons: ""
108 | """"
109 | ""- my articles on data modeling and SQL interviews were my most successful. Providing new value is the most important thing! ""
110 | """"
111 | ""- partner with people to grow! ""
112 | ""I partnered with Alex Xu, Benjamin Rogojan, Ananth Packkildurai,  Sarah Floris, MS, and Ryan Peterman. We recommend each other and that brought in an additional 5,000 subs!!""
113 | """"
114 | ""#dataengineering",,,MEMBER_NETWORK
115 | 2023-09-12 18:52:44,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7107435884483907584,"Finally someone seriously taking on Great Expectations! We can and must do better!""
116 | """"
117 | ""Congrats, 🎯 Mark Freeman II! ""
118 | """"
119 | ""#dataengineering",,,MEMBER_NETWORK
120 | 2023-09-12 03:25:46,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7107202604723535872,"Li is a highly technical founder who knows AI extremely well. this is an incredible opportunity!""
121 | """"
122 | ""#machinelearning",,,MEMBER_NETWORK
123 | 2023-09-11 01:32:37,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7106811740004249600,"The data engineering SQL interview is the most common interview in big tech! ""
124 | """"
125 | ""At most companies, you end up getting asked SQL questions for about two hours. ""
126 | """"
127 | ""There are two rounds to be aware of:""
128 | ""- The screener round""
129 | ""Where they test that you have the fundamental knowledge of SQL and how to write code""
130 | ""- The onsite round""
131 | ""Where they test your depth of SQL and how to optimize with things like indexes and minimizing table scans!""
132 | """"
133 | ""I wrote a free Substack article that goes into much more detail about how to pass these interviews here: https://lnkd.in/g_m9RZWH""
134 | """"
135 | ""#dataengineering ",,,MEMBER_NETWORK
136 | 2023-09-10 23:39:18,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7106783223489200129,"Vitali completed the grind of v2 of EcZachly Inc’s boot camp! ""
137 | """"
138 | ""#dataengineering",,,MEMBER_NETWORK
139 | 2023-09-10 21:11:55,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7106746135200858112,Great post by Li! Make sure to find breaks for your #mentalhealth while you're building your dreams! ,,,MEMBER_NETWORK
140 | 2023-09-10 18:32:22,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7106705980750495744,"My boot camp attendees who meet the attendee certification bar will be paired with a data engineering mentor who has worked in big tech at least 4 years to help them with interview prep, referrals, resume review, and anything else the boot camp attendee wants. ""
141 | """"
142 | ""The mentors get paid from the boot camp tuition. Combined track students get six mentorship sessions and single track students get three mentorship sessions! ""
143 | """"
144 | ""Six weeks of learning data engineering from me isn’t enough to get you to success. ""
145 | """"
146 | ""Finding mentors who can help you and establish long term success is the ultimate goal of the boot camps! ""
147 | """"
148 | ""#dataengineering",,,MEMBER_NETWORK
149 | 2023-09-10 02:51:20,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7106469161693822976,"My boot camp attendees were ENGAGED! ""
150 | """"
151 | ""Kyle Dufrane attended 99.9% of the boot camp, he missed only 3 minutes of the 60+ hours of live content! ""
152 | """"
153 | ""Jade Nguyen, Joseph Corrado, Lakshmi Malladi, Rushitaa Dattuluri attended 99.8% of the boot camp! ""
154 | """"
155 | ""In total, about 60% of my students will meet the certification bar! ""
156 | """"
157 | ""#dataengineering",,,MEMBER_NETWORK
158 | 2023-09-09 16:22:50,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7106310995047849984,"Zach promoted from junior to staff in 4 years""
159 | ""Ryan promoted from junior to staff in 3 years ""
160 | """"
161 | ""I have a feeling it’ll happen eventually where Meta is promoting new grads to staff engineer in one year after joining.""
162 | """"
163 | ""It’s wild how quickly a good manager and mentor can accelerate your career! ""
164 | """"
165 | ""#softwareengineering",,,MEMBER_NETWORK
166 | 2023-09-08 16:50:09,https://www.linkedin.com/feed/update/urn%3Ali%3AugcPost%3A7105955482468552705,"When I was 20, I had a dream of becoming a mathematics professor.  I applied to graduate schools, had a perfect quant GRE score, and I was so excited to study.  ""
167 | """"
168 | ""I abandoned the dream to study data science in industry instead.""
169 | """"
170 | ""Going back to school has been on my mind a lot recently. I yearn for the depth and rigor that academics bring. ""
171 | """"
172 | ""#datascience",,,MEMBER_NETWORK
173 | 2023-09-07 20:13:02,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7105644151701962752,"My startup is averaging $50k/month since I started it in March""
174 | """"
175 | ""Here’s the tech stack I use: ""
176 | """"
177 | ""- Languages: TypeScript, SQL""
178 | ""- Frontend: NextJS""
179 | ""- Backend: ExpressJS""
180 | ""- Database: Postgres ""
181 | ""- Payments: Stripe ""
182 | ""- Content: S3 and CloudFront ""
183 | ""- Emails: SparkPost""
184 | ""- Platform: Heroku ""
185 | ""- Cloud bill: $400/month ""
186 | ""- Experiments: Statsig""
187 | ""- Logging: Kafka""
188 | """"
189 | ""Future integrations: ""
190 | """"
191 | ""- Analytics engineering learning platform: Trino and Tabular ""
192 | ""- Data engineering learning platform: Spark on Databricks and Iceberg ""
193 | ""- “leetcode for Spark” platform using Spark and Iceberg as well ""
194 | ""- Data engineering mentor matching platform using machine learning with PyTorch I think. Maybe something I can buy instead though ""
195 | """"
196 | ""What else should I be building with my startup and team of engineers? ""
197 | """"
198 | ""#SoftwareEngineering",,,MEMBER_NETWORK
199 | 2023-09-07 19:55:49,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7105639818700754944,"Amazing post by Ryan about being a good tech lead! ""
200 | """"
201 | ""#softwareengineering",,,MEMBER_NETWORK
202 | 2023-09-07 19:05:30,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7105627156159885312,"Data engineering tooling has different groups of stacks depending on the company. ""
203 | """"
204 | ""- the big tech company stack ""
205 | """"
206 | ""- compute: Spark ""
207 | ""- Orchestration: Airflow (or similar)""
208 | ""- data quality: custom built ""
209 | ""- serving layer: Druid""
210 | ""- storage: Iceberg + S3 ""
211 | """"
212 | ""- the mid-sized company stack ""
213 | """"
214 | ""- compute: Snowflake/BigQuery ""
215 | ""- orchestration: Airflow or Fivetran or Informatica""
216 | ""- data quality: Great expectations and DBT ""
217 | ""- serving layer: Tableau extracts or Druid ""
218 | ""- storage: Snowflake/BigQuery ""
219 | """"
220 | ""- the startup stack ""
221 | """"
222 | ""- compute: Postgres ""
223 | ""- orchestration: CRON""
224 | ""- data quality: skipped ""
225 | ""- serving layer: SQL queries in Postgres""
226 | ""- storage: Postgres""
227 | """"
228 | ""What would you change in these stacks?""
229 | """"
230 | ""#dataengineering",,,MEMBER_NETWORK
231 | 2023-09-07 02:15:17,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7105372928489263104,"Let’s go!!! More big tech influencers leaving to found their own things!""
232 | ""Congrats Cassie!!""
233 | """"
234 | ""#datascience",,,MEMBER_NETWORK
235 | 2023-09-06 19:42:36,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7105274107633537024,"Data architect is the next step after data engineer on the technical ladder.""
236 | """"
237 | ""What big questions should you be able to answer as a data architect?""
238 | """"
239 | ""- should our pipelines be streaming or batch?""
240 | ""Having a firm understanding of the trade offs of lambda (streaming + batch) versus kappa (streaming only) architecture is a key thing to being a great data architect. ""
241 | """"
242 | ""- how should our master data be modeled? ""
243 | ""This bucket is complex and has a few competing ideologies between Kimball data modeling, Inmon data modeling and one big table (OBT) data modeling. Each of these ideologies have trade offs that are too long to discuss in this LinkedIn post. ""
244 | """"
245 | ""- what data stores should we use for serving our data? ""
246 | ""Technology selection is another critical component. Betting everything on Snowflake or Spark is a losing battle. Understanding low latency stores like Druid, Memcached and Redis will serve you well. Also know analytical DBs like CouchDB and DuckDB. ""
247 | """"
248 | ""- how do we create processes to ensure data quality across all our pipelines ""
249 | ""Processes like spec review, design discussions, and data validation will greatly level up your data. As a data architect you should be flexing your leadership skills to get these adopted across your company. ""
250 | """"
251 | """"
252 | ""What other skills should a data architect know?""
253 | """"
254 | ""#dataengineering",,,MEMBER_NETWORK
255 | 2023-09-05 20:36:04,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7104925171936890880,"I took a refreshing phone break this past week and it has greatly improved my outlook and creativity! ""
256 | """"
257 | ""Those 20 minutes on Saturday were to contact my family to let them know I wasn’t dying  in the mud at burning man. ""
258 | """"
259 | ""I’m excited to bring the new found energy and improved mental health back into my data engineering business!""
260 | """"
261 | ""#mentalhealth ""
262 | ""#dataengineering",,,MEMBER_NETWORK
263 | 2023-08-26 18:05:12,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7101263325937766400,"Things I do before bedtime to calm my anxious mind and get more restful sleep ""
264 | """"
265 | ""- yoga""
266 | ""The long stretchy poses are great to invite sleepiness and calm. Supported fish pose with a block is incredible for the lower back. Sleeping pigeon and thread the needle are two other great poses to loosen the shoulders and legs. ""
267 | """"
268 | ""- no phone a few hours before bed ""
269 | ""Blue light simulates sunlight and tricks the brain into thinking it’s day time. Quit messing up your brain into thinking it’s day time at 1 AM by scrolling Instagram Reels. ""
270 | """"
271 | ""- a hot shower than ends cold ""
272 | ""I start my shower off hot and the last 2 minutes I end it cold. The quick shift and need for my body to warm up helps invite sleep ""
273 | """"
274 | ""- avoid taking melatonin too many nights ""
275 | ""Take melatonin at most once or twice a week. Don’t interfere with your bodies natural sleep processes.""
276 | """"
277 | ""#mentalhealth",,,MEMBER_NETWORK
278 | 2023-08-25 22:56:43,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7100974300840529920,"Julio learned a lot from the boot camp!""
279 | """"
280 | ""#dataengineering",,,MEMBER_NETWORK
281 | 2023-08-25 18:03:04,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7100900403222904833,"Benjamin puts out some of the best data engineering content in YouTube out there!""
282 | """"
283 | ""Give him a follow!""
284 | """"
285 | ""#dataengineering",,,MEMBER_NETWORK
286 | 2023-08-25 11:11:58,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7100796944272293888,"22 lectures, 20 labs, 12 Q&As, 6 speaker sessions with 12 speakers! ""
287 | """"
288 | ""The last 6 weeks have been intense but I’m glad I have a treasure trove of content now! ""
289 | """"
290 | ""Excited for future iterations of boot camps and building my educational platform! ""
291 | """"
292 | ""#dataengineering",,,MEMBER_NETWORK
293 | 2023-08-24 18:58:49,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7100552046415675392,"My 136th video on TikTok was the first one to break a million views ""
294 | ""My 153rd post on Instagram was the first one to break a million views ""
295 | """"
296 | ""Consistency is key! You never know when the algorithm will bless you!""
297 | """"
298 | ""#contentcreation",,,MEMBER_NETWORK
299 | 2023-08-24 07:04:06,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7100372182345646080,"The 6th speaker series had Bill Inmon and Jitender Aswani  and was really 🔥!""
300 | """"
301 | ""#dataengineering",,,MEMBER_NETWORK
302 | 2023-08-24 00:25:22,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7100271835614744576,The speaker series this week is amazing! #dataengineering ,,,MEMBER_NETWORK
303 | 2023-08-23 18:33:20,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7100183242019901441,"My self-paced boot camp teaches much more than just Spark and Flink! ""
304 | """"
305 | ""In the combined six-week program, you will:""
306 | """"
307 | ""- Learn the tradeoffs between Kimball and One Big Table data modeling. Learn how to leverage complex data types like Arrays and Structs to supercharge your analytics""
308 | ""- Create a data pipeline spec that covers quality checks, assumptions, business metrics, and allows stakeholders to give feedback BEFORE you start coding""
309 | ""- Build data quality checks into your pipelines using data contracts such as write-audit-publish and write unit and integration tests to catch quality errors before they enter production""
310 | ""- Set up experiments using Flask and Statsig to learn about A/B tests and how to collect data in logged-out and logged-in environments""
311 | ""- Discover the power of data lake technologies Apache Iceberg. Proper schema evolution, partitioning, and parquet file format compression!  ""
312 | ""- Collaborate with your group on building on-call run books and learn about data pipeline maintenance ""
313 | ""- Learn how to prioritize your tasks for impact, identify low-value tasks, and push back when stakeholders ask you to do them""
314 | ""- Visualize data in the right way to create compelling stories that executives want to see. Create exploratory dashboards in Tableau that data analysts can use to discover patterns ""
315 | ""- Level up your SQL skills by having a four-hour crash course on GROUPING SETS, window functions, and cumulative table design""
316 | ""- Listen to 12 industry-leading experts in Q&A format and get their view on how things are changing in this rapid environment! ""
317 | """"
318 | ""You'll learn all this and level up your Spark and Flink skills! Over 60 hours of content is available now!  ""
319 | """"
320 | ""You can learn more at www.dataengineer.io ""
321 | """"
322 | ""#dataengineering",,,MEMBER_NETWORK
323 | 2023-08-23 17:38:07,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7100169347494473728,"Data engineering is less risky than data science and often has more ROI! ""
324 | """"
325 | ""- Data science and data engineering both involve a tremendous amount of data cleaning.  The difference here is the outputs from data engineers are modeled, usable data sets for the rest of the company. The outputs from data scientists are inputs to machine learning models and/or experiments. These models and experiments may or may not produce business-impacting results. The visibility from the modeled data from data engineers has long lasting results. ""
326 | """"
327 | ""- Data engineering problems involve less ambiguity than data science problems. Gathering and collecting data while documenting sources is easier than trying to get it all integrated into your system. People underestimate the complexity of getting a machine learning model running in production.  ""
328 | """"
329 | ""These two factors is why data engineers, on average, saw a pay bump in 2023 and data scientists saw a slight decrease in pay!""
330 | """"
331 | ""#dataengineering ""
332 | ""#datascience",,,MEMBER_NETWORK
333 | 2023-08-23 17:23:54,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7100165768671031297,"Amazing post by Arpit!""
334 | """"
335 | ""#softwareengineering",,,MEMBER_NETWORK
336 | 2023-08-21 17:23:54,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7099440994483388417,I'll be rebranding my data engineer education products from www.eczachly.com to www.dataengineer.io! I'm really excited about this change!  #dataengineering ,,,MEMBER_NETWORK
337 | 2023-08-21 07:06:14,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7099285552222392320,Li Yin is cooking up some 🔥 with SylphAI (AI&data professional network),,,MEMBER_NETWORK
338 | 2023-08-20 16:40:31,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7099067688093700096,"IO is often the biggest component of cloud costs in data engineering! ""
339 | """"
340 | ""How do you minimize it?""
341 | """"
342 | ""- too much IO is caused by pipelines that read too much data ""
343 | ""This can happen when you’re building look back metrics and you aren’t using cumulative table design. These metrics should be built incrementally instead of scanning 30/60/90 days worth of data every day ""
344 | """"
345 | ""- too much IO is caused by tables that are bigger than they need to be ""
346 | ""This can happen when you aren’t leveraging sorting the right way when writing out parquet tables. You should take advantage of run-length encoding the most you can by sorting by lowest to highest cardinality dimensions. ""
347 | """"
348 | ""- too much IO is caused by data models that aren’t robust enough ""
349 | ""Duplicate data models with slightly different metric definitions need to be consolidated and collapsed. Double the data and double the pipelines. Your IO will be excessive!""
350 | """"
351 | ""#dataengineering",,,MEMBER_NETWORK
352 | 2023-08-19 23:29:39,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7098808263873335296,"I set up a really powerful Iceberg + Spark tutorial using Tabular in 2 hours. ""
353 | ""Thanks for unblocking me, Jason and Ryan! Y’all are building something special! ""
354 | ""#dataengineering",,,MEMBER_NETWORK
355 | 2023-08-17 21:14:10,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7098049390417776641,"Data engineering has many ""this or that"" questions""
356 | """"
357 | ""- Python or Scala?""
358 | ""If you don't know either, start with Python. If you want to transition to the software/data engineer archetype, pick up Scala later. ""
359 | """"
360 | ""- Streaming or Batch?""
361 | ""A vast majority of data engineering jobs are batch oriented. This will still be true in ten years. Streaming-oriented data engineering jobs pay better since the skillset is more niche and harder to come by. Remember there's a middle ground with things like microbatch. ""
362 | """"
363 | ""- Kimball or One Big Table? ""
364 | ""Kimball is better at preserving data integrity and for 80+% of the data modeling use cases out there it is going to be the preferred way to model the data. One Big Table has its place though especially if you're trying to minimize shuffle. I've seen some really big performance gains from switching from Kimbal to OBT but just because I saw them at Airbnb doesn't mean you'll see the same. ""
365 | """"
366 | ""- Snowflake or Databricks?""
367 | ""I like Databricks a lot since it has the versatility of Apache Spark. That being said, it's a more technical platform that takes much longer to learn and set up. The amount of time it takes to get value out of Snowflake is very little and that's a very impressive quality of Snowflake.""
368 | """"
369 | ""- AWS or GCP or Azure?""
370 | ""AWS is the clear leader in market share and I have a slight bias towards using it over GCP or Azure. That being said, there will always be great Azure and GCP data engineering jobs as well! ""
371 | """"
372 | ""- Airflow or Mage or Prefect or Dagster?""
373 | ""Airflow is the 9000-pound gorilla in this fight that is looking to be dethroned. The challengers have some really great features that are making Airflow look dated. I'm teaching Airflow in my boot camp though since it has the highest adoption by far""
374 | """"
375 | ""#dataengineering ",,,MEMBER_NETWORK
376 | 2023-08-17 20:47:28,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7098042673013424128,"Great post by Bruno! He's been an amazing student in my boot camp! ""
377 | """"
378 | ""#dataengineering ",,,MEMBER_NETWORK
379 | 2023-08-15 21:42:22,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7097331711457255424,"Data architecture always revolves around pushes or pulls!""
380 | """"
381 | ""The ""pull"" architecture is the most common and includes the following technologies:""
382 | """"
383 | ""A batch computing engine such as Apache Spark, BigQuery, Snowflake""
384 | ""A job orchestrator such as Airflow, Mage, Prefect, or Dagster""
385 | ""A place to put the batch of data such as Apache Iceberg, Delta Lake, Snowflake, Druid""
386 | ""An API to query the data on demand such as HTTP or SQL""
387 | """"
388 | ""The ""push"" architecture also called the ""real-time"" architecture is substantially different and includes the following technologies:""
389 | """"
390 | ""A streaming computing engine such as Apache Flink, Spark Structured Streaming ""
391 | ""A set of jobs that run 24/7 to process data as it arrives""
392 | ""A queue of events that are processed such as Apache Kafka or RabbitMQ ""
393 | ""A place to put the streams of data such as Apache Iceberg or Apache Kafka""
394 | ""An API to expose the data in real-time such as Websockets or Kafka consumers""
395 | """"
396 | ""#dataengineering""
397 | ""#softwareengineering ",,,MEMBER_NETWORK
398 | 2023-08-15 21:28:01,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7097328099154870274,"People ask me, ""Why do you love JavaScript so much Zach if data engineers never use it?"" ""
399 | """"
400 | ""JavaScript is a more fundamental component to any startup you want to build than SQL is. ""
401 | """"
402 | ""Do you want to build a website? Okay, use React""
403 | ""Do you want to build a server? Okay, use ExpressJS""
404 | ""Do you need a mobile app? Okay, use React Native ""
405 | ""Do you need a data exchange format? Okay, use JSON (stands for JavaScript object notation)""
406 | """"
407 | ""If you don't have a website, server, or app, how do you start generating data? ""
408 | """"
409 | ""Data engineers are hired onto a startup much later because they're only needed after things are bigger and more complex. ""
410 | """"
411 | ""And if you want to unblock yourself so you can get to that point where things are bigger and more complex, learn JavaScript!!! ""
412 | """"
413 | ""#dataengineering ""
414 | ""#softwareengineering ""
415 | """,,,MEMBER_NETWORK
416 | 2023-08-15 19:24:28,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7097297007072931840,"Data engineering has a huge community! Especially outside LinkedIn! ""
417 | """"
418 | ""Here are some communities you need to join: ""
419 | """"
420 | ""- Xinran’s Data Engineer Things""
421 | ""Slack channel is here http://join.det.life""
422 | """"
423 | ""- Benjamin’s Seattle Data Guy ""
424 | ""Discord is here: https://lnkd.in/er6bcJBj""
425 | """"
426 | ""- Zach’s EcZachly Inc ""
427 | ""Discord is here: https://lnkd.in/e_qtv8w7""
428 | """"
429 | ""- Chip’s MLOps community ""
430 | ""Discord is here: https://lnkd.in/eqDeG--R ""
431 | """"
432 | ""- Li’s SylphAI (AI&data professional network) ""
433 | ""Discord is here: https://lnkd.in/ePGFVY5A""
434 | """"
435 | ""#dataengineering ""
436 | ""#machinelearning",,,MEMBER_NETWORK
437 | 2023-08-15 17:49:19,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7097273063678869505,"Great list of things to get started in AI by Li Yin ""
438 | """"
439 | ""#dataengineering ""
440 | ""#machinelearning",,,MEMBER_NETWORK
441 | 2023-08-15 03:42:04,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7097059847388499968,"If you keep buying my boot camps I’ll keep recklessly spending it on neon signs! ""
442 | """"
443 | ""#dataengineering",,,MEMBER_NETWORK
444 | 2023-08-15 00:19:47,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7097008940655280128,This role looks really great! #dataengineering,,,MEMBER_NETWORK
445 | 2023-08-14 20:42:03,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7096954143679926272,"I'm going to be a guest speaker for the first Data Engineer Things Book Club AMA Session on Aug 18!""
446 | """"
447 | ""The Data Engineer Things Book Club is currently reading Fundamentals of Data Engineering, by Joe Reis 🤓  and Matthew Housley  It's not too late to join now!""
448 | """"
449 | ""Feel free to join the AMA even if you are not reading the book! ""
450 | """"
451 | ""(Signup link in the comment)""
452 | """"
453 | ""#dataengineering",,,MEMBER_NETWORK
454 | 2023-08-14 06:36:55,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7096741461827731456,"Dear JetBrains, ""
455 | """"
456 | ""I’d make videos on how to use your IDEs effectively to fight the good fight against VS Code. I’ve been an avid fan of your products since 2013 when I made the switch from Eclipse to IntelliJ. ""
457 | """"
458 | ""Nowadays,  I use: ""
459 | """"
460 | ""DataGrip for SQL dev ""
461 | ""WebStorm for web dev ""
462 | ""IntelliJ or PyCharm for data engineering ""
463 | """"
464 | ""End-to-end I use y’all’s products. ""
465 | """"
466 | ""I haven’t touched a Microsoft product for development unless you count LinkedIn, GitHub and SQL Server/Windows ten years ago so I promise I’m not tainted! ""
467 | """"
468 | ""#softwareengineering ""
469 | ""#dataengineering",,,MEMBER_NETWORK
470 | 2023-08-14 00:19:01,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7096646358916337664,"Deborah is having a great time learning in the self-paced version of EcZachly Inc’s second boot camp!""
471 | """"
472 | ""You can get it here www.EcZachly.com""
473 | """"
474 | ""#dataengineering",,,MEMBER_NETWORK
475 | 2023-08-13 23:15:31,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7096630380497158144,"I met Nuseir Yassin founder of Nas Daily Studios""
476 | ""Turns out he used to be a data engineer too! ""
477 | """"
478 | ""#dataengineering",,,MEMBER_NETWORK
479 | 2023-08-13 22:19:08,https://www.linkedin.com/feed/update/urn%3Ali%3AugcPost%3A7096616191770693632,"Curious what people think about data engineering interviews ""
480 | """"
481 | ""#dataengineering",,,MEMBER_NETWORK
482 | 2023-08-13 06:50:02,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7096382371893022720,"Week 5 of 6 of the EcZachly Inc boot camp starts on Monday. For the analytics track, the title is ""KPIs and Experimentation""""
483 | """"
484 | ""The analytics track is doing the following things: ""
485 | """"
486 | ""Day 1:""
487 | """"
488 | ""Lecture:""
489 | ""Learning about why data engineering and experimentation are closely connected""
490 | ""The different types of metrics and where data engineers should pass to analytics partners. ""
491 | ""Deep dive into how to split up your groups and proper experimentation design ""
492 | """"
493 | ""Lab: ""
494 | ""Set up a Flask API using Statsig to run a live experiment end-to-end. Talk about the difference between logged-out and logged-in experiments.""
495 | """"
496 | ""Day 2:""
497 | """"
498 | ""Lecture:""
499 | ""Talk about statistical significance and when an experiment should be launched. ""
500 | ""Talk about how metrics can be gamed and need counter metrics. ""
501 | ""Talk about how experiments can go wrong such as the novelty effect""
502 | """"
503 | ""Lab:""
504 | ""A product sense lab on how to think like a product manager and have a better business impact with the metrics you define""
505 | """"
506 | ""#dataengineering ",,,MEMBER_NETWORK
507 | 2023-08-13 01:56:12,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7096308429748637696,"I got experimentation wired up in 90 minutes for my website: www.eczachly.com thanks to Statsig! ""
508 | """"
509 | ""I'll be teaching my boot camp students the importance of data engineering, metrics, and experimentation in the coming week of the boot camp. ""
510 | """"
511 | ""The active experiment I have running on my website is the signup button is red 80% of the time and blue 20% of the time.""
512 | """"
513 | ""The companies that can perform the most experiments in parallel are the companies that are winning!""
514 | """"
515 | ""#dataengineering ""
516 | ""#analyticsengineering ""
517 | ""#datascience ",http://www.eczachly.com,,MEMBER_NETWORK
518 | 2023-08-12 18:06:05,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7096190120206241792,"Y’all gotta check this out! ""
519 | """"
520 | ""#dataengineering",,,MEMBER_NETWORK
521 | 2023-08-11 23:19:31,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7095906609784053761,"People underestimate the consistency needed to be successful. ""
522 | """"
523 | ""It took me 250 LinkedIn posts to get to 10k followers. ""
524 | ""Another 250 posts and I was at 100k. ""
525 | """"
526 | ""90% of podcasts don’t make it past episode 3. ""
527 | ""99% of podcasts don’t make it past episode 20. ""
528 | """"
529 | ""You’ll see a dramatic increase in your fitness with one month of consistency at the gym. Most people in the US are overweight or obese. ""
530 | """"
531 | ""When it gets hard, don’t give up. That’s exactly when you need to double down to break through and see success! ""
532 | """"
533 | ""It’s a mindset that will make you feel powerful! ""
534 | """"
535 | ""#mentalhealth",,,MEMBER_NETWORK
536 | 2023-08-11 16:14:32,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7095799661088706561,"I’m consistently impressed by the engineering caliber and dedication of my boot camp attendees. ""
537 | """"
538 | ""If you’re hiring for data engineering roles and want a chance to talk to over 150 highly motivated, talented data engineers. ""
539 | """"
540 | ""DM me!""
541 | """"
542 | ""#dataengineering",,,MEMBER_NETWORK
543 | 2023-08-11 02:06:14,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7095586177394642944,"I'm doing a real-time streaming lab today for my boot camp. Please vote between these tech creators by visiting their pages""
544 | """"
545 | ""Zach Wilson https://lnkd.in/gCi_-yRx""
546 | ""Sarah Floris https://lnkd.in/gX7cfwhF""
547 | ""Lulu https://lnkd.in/g33ggb-5 ""
548 | """"
549 | ""You don't have to do anything besides clicking on the links!""
550 | """"
551 | ""#dataengineering ",,,MEMBER_NETWORK
552 | 2023-08-10 23:24:06,https://www.linkedin.com/feed/update/urn%3Ali%3AugcPost%3A7095545377377964032,,,,MEMBER_NETWORK
553 | 2023-08-10 21:58:04,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7095523723507609600,"Programming languages I used in big tech based on how much I liked them:""
554 | """"
555 | ""- Kotlin ⭐️⭐️⭐️⭐️⭐️""
556 | ""- Scala ⭐️⭐️⭐️⭐️⭐️""
557 | ""- SQL ⭐️⭐️⭐️⭐️⭐️""
558 | ""- Python ⭐️⭐️⭐️⭐️""
559 | ""- GroovyScript ⭐️⭐️⭐️⭐️""
560 | ""- TypeScript ⭐️⭐️⭐️⭐️""
561 | ""- Bash ⭐️⭐️⭐️""
562 | ""- Java ⭐️⭐️⭐️""
563 | ""- JavaScript ⭐️⭐️⭐️""
564 | ""- Ruby ⭐️""
565 | """"
566 | ""#softwareengineering",,,MEMBER_NETWORK
567 | 2023-08-10 21:13:00,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7095512383602765825,"Data contracts are such a hot topic right now in data engineering! ""
568 | """"
569 | ""Imagine if we could prevent all bad data from leaking into production with them? ""
570 | """"
571 | ""This holy grail is actually not possible and can lead to data engineer burnout! ""
572 | """"
573 | ""Your data quality efforts need to focus on ROI to make it so your data engineers don't run away from their jobs! ""
574 | """"
575 | ""#dataengineering ",https://open.substack.com/pub/eczachly/p/writing-data-to-production-is-a-contract?utm_campaign=post&utm_medium=web,,MEMBER_NETWORK
576 | 2023-08-10 21:03:30,https://www.linkedin.com/feed/update/urn%3Ali%3AugcPost%3A7095509990853021696,"I’m going to do an event on September 5th with:""
577 | """"
578 | ""Ryan Peterman (staff SWE at Meta)""
579 | ""Lee McKeeman (staff SWE at Google) ""
580 | ""Rahul Pandey (former staff SWE at Meta) ""
581 | ""Carly Taylor  (ML manager at Activision)""
582 | """"
583 | ""We’ll talk about the various ways people get to staff engineer! It’ll be a wonderful event. Excited to see you there!! ""
584 | """"
585 | ""#softwareengineering ""
586 | ""#dataengineering",,,MEMBER_NETWORK
587 | 2023-08-10 18:03:51,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7095464783272955904,"Why aren’t there many entry-level data engineering roles? ""
588 | """"
589 | ""There’s a few reasons for this: ""
590 | """"
591 | ""- data engineers produce data that is relied on by many people. Relying on a zero years of experience person to do that is a little riskier.  ""
592 | """"
593 | ""- data engineering requires a unique blend of communication skills and technical skills, like data science, that makes it harder for juniors to ramp up effectively.  ""
594 | """"
595 | ""- many companies only need one or two data engineers. So there’s no mentorship/growth path for juniors at these companies. Therefore they prefer to hire senior engineers. ""
596 | """"
597 | ""Why else do you think there aren’t many Junior data engineering positions? ""
598 | """"
599 | ""#dataengineering",,,MEMBER_NETWORK
600 | 2023-08-09 23:50:00,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7095189504067588096,This is covered in depth in EcZachly Inc boot camp!,,,MEMBER_NETWORK
601 | 2023-08-09 19:47:01,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7095128355447984128,"India is on the verge of bulldozing the US in tech. It seems far fetched right now but just like LLMs, India is on an exponential trajectory.""
602 | """"
603 | ""#india",,,MEMBER_NETWORK
604 | 2023-08-09 17:06:55,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7095088067748757506,"Don’t lose yourself climbing the ladder! ""
605 | """"
606 | ""Remember to go to the beach sometimes. ""
607 | ""Remember to go to the mountains sometimes. ""
608 | ""Remember to laugh really hard sometimes. ""
609 | ""Remember to dance badly sometimes. ""
610 | ""Remember to check in with body sometimes. ""
611 | ""Remember to be grateful for this beautiful life!""
612 | """"
613 | ""#mentalhealth",,,MEMBER_NETWORK
614 | 2023-08-09 15:51:09,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7095069000107241472,Still got 2 1/2 weeks left in EcZachly Inc’s boot camp too!,,,MEMBER_NETWORK
615 | 2023-08-09 04:32:37,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7094898241275658240,"Need to find a mentor in the data and AI space? ""
616 | """"
617 | ""Follow Data Engineer Things, they offer tons of data engineering mentoring. Founder is Xinran Waibel, Netflix DE ""
618 | """"
619 | ""Follow SylphAI, they’re an AI community that mentors and helps you reach your goals. Founder is Li Yin, ex-Meta research scientist""
620 | """"
621 | ""Follow Illuminate AI, they do AI mentorship. Founder is Aishwarya Srinivasan, Google data scientist""
622 | """"
623 | ""#dataengineering ""
624 | ""#machinelearning ""
625 | ""#artificialintelligence",,,MEMBER_NETWORK
626 | 2023-08-08 18:45:26,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7094750471185182720,"Learning technical things beyond data pipelines will make you a better data engineer!""
627 | """"
628 | ""- live servers have highly quality requirements!""
629 | ""If a server goes down, your business dies. ""
630 | ""If a data pipeline is delayed, an analyst is sad. ""
631 | """"
632 | ""Learning to deal with higher stakes technical requirements will help you see how to build higher quality data pipelines! ""
633 | """"
634 | ""Higher quality meaning: ""
635 | ""- tested in CI/CD""
636 | ""You should have unit and integration tests for your queries so you don’t push a bad change to your pipeline. ""
637 | """"
638 | ""- monitored in production ""
639 | ""Is your pipeline telemetry changing? Is skew hurting the performance? Can you make things more efficient? ""
640 | """"
641 | ""- documented for other engineers ""
642 | ""How do you troubleshoot when things break? Who do you talk to when quality errors arise? ""
643 | """"
644 | ""You’d be surprised how much full-stack engineering made me a better data engineer. There aren’t enough data engineers who care about this stuff which leads to the perception that data engineers are less technical than software engineers! ""
645 | """"
646 | ""#dataengineering",,,MEMBER_NETWORK
647 | 2023-08-08 18:13:19,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7094742389843529728,"There is such a thing as too much data quality! ""
648 | """"
649 | ""Everything in engineering, including data quality, comes with trade offs. ""
650 | """"
651 | ""Symptoms of too much data quality: ""
652 | """"
653 | ""- noisy on-call ""
654 | ""You have checks for every possible anomaly under the sun. The more checks, the more likely they fail. The more tax you pay maintaining the quality of your pipelines. ""
655 | """"
656 | ""Every DQ check has a probability of false positive that takes away from real engineering time. Thinking about the ROI on these checks before implementing will help you strike the right balance. ""
657 | """"
658 | ""- slow pipeline design phases ""
659 | ""Trying to incorporate every request and constraint into your design document can be taxing. Acceping that data model creation is iterative will help you move faster here. ""
660 | """"
661 | ""Don’t cut corners though! Do your best to incorporate as many stakeholder requirements as you can and cut the ones that provide the lowest ROI.""
662 | """"
663 | """"
664 | ""These problems are actually pretty rare in industry since 95% of companies index too lightly on data quality. But just like everything, you don’t want to index to heavily in the other direction either!""
665 | """"
666 | ""#dataengineering",,,MEMBER_NETWORK
667 | 2023-08-07 21:52:11,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7094435079472517120,"Dear LinkedIn, ""
668 | """"
669 | ""Can you please put the engagement metrics into your Shares.csv GDPR export file? ""
670 | """"
671 | ""Twitter does it. ""
672 | ""YouTube does it. ""
673 | ""Instagram does it. ""
674 | ""TikTok does it. ""
675 | """"
676 | ""You’re literally the only platform that doesn’t do it! ""
677 | """"
678 | ""#dataengineering",,,MEMBER_NETWORK
679 | 2023-08-07 18:42:14,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7094387278663323648,"Things you should never see in production SQL pipelines: ""
680 | """"
681 | ""- SELECT * ""
682 | ""- RIGHT JOIN""
683 | ""- GROUP BY 1,2,3 / ORDER BY 1,2,3""
684 | ""- Derived columns without aliases ""
685 | ""- Nested subqueries ""
686 | """"
687 | ""What would you add? ""
688 | """"
689 | ""#dataengineering",,,MEMBER_NETWORK
690 | 2023-08-07 15:23:32,https://www.linkedin.com/feed/update/urn%3Ali%3AugcPost%3A7094337272275222528,"Week 4 of EcZachly Inc’s boot camp starts today. The themes are analytical patterns and streaming pipelines! ""
691 | """"
692 | ""#dataengineering",,,MEMBER_NETWORK
693 | 2023-08-06 16:49:19,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7093996471263391744,"Rules of LinkedIn etiquette:""
694 | """"
695 | ""- using more than 3 hashtags looks bad ""
696 | ""- don’t ask for a resume review or a referral on the first DM, I don’t know you ""
697 | ""- don’t cold sell someone on the first DM, you’ll waste your Inmail credits ""
698 | ""- don’t tag more than 3-4 people in a post unless it’s a group event ""
699 | ""- if you comment, have it be something positive or meaningful ""
700 | ""- entry-level positions require zero years of experience ""
701 | ""- treating people like humans instead of job-givers will get you much further ""
702 | ""- have a profile picture that’s public, clear and up-to-date ""
703 | ""- use your headline to clarify what you do ""
704 | """"
705 | ""Any more you’d add? ""
706 | """"
707 | ""#linkedin",,,MEMBER_NETWORK
708 | 2023-08-06 16:31:30,https://www.linkedin.com/feed/update/urn%3Ali%3Ashare%3A7093991989511147520,"If you’re using the RDD API directly in Spark, you’re doing it wrong!""
709 | """"


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "llm-driven-data-engineering"
 3 | version = "0.1.0"
 4 | description = "LLM-driven Data Engineering by Zach Wilson"
 5 | readme = "README.md"
 6 | requires-python = ">=3.12"
 7 | dependencies = [
 8 |     "openai",
 9 |     "pandas",
10 |     "bardapi",
11 |     "langchain",
12 |     "langchain_experimental",
13 |     "psycopg2",
14 |     "google-auth-oauthlib",
15 |     "google-api-python-client",
16 | ]
17 | 


--------------------------------------------------------------------------------
/schemas/actor_films.sql:
--------------------------------------------------------------------------------
 1 | CREATE TABLE actor_films (
 2 |     Actor TEXT,
 3 |     ActorId Text,
 4 |     Film TEXT,
 5 |     Year integer,
 6 |     votes Integer,
 7 |     Rating REAL,
 8 |     FilmID text,
 9 |     PRIMARY KEY(ActorId, FilmId)
10 | )


--------------------------------------------------------------------------------
/schemas/game_details.sql:
--------------------------------------------------------------------------------
 1 | CREATE TABLE public.game_details (
 2 |     game_id integer,
 3 |     team_id integer,
 4 |     team_abbreviation text,
 5 |     team_city text,
 6 |     player_id integer,
 7 |     player_name text,
 8 |     nickname text,
 9 |     start_position text,
10 |     comment text,
11 |     min text,
12 |     fgm real,
13 |     fga real,
14 |     fg_pct real,
15 |     fg3m real,
16 |     fg3a real,
17 |     fg3_pct real,
18 |     ftm real,
19 |     fta real,
20 |     ft_pct real,
21 |     oreb real,
22 |     dreb real,
23 |     reb real,
24 |     ast real,
25 |     stl real,
26 |     blk real,
27 |     "TO" real,
28 |     pf real,
29 |     pts real,
30 |     plus_minus real
31 | );


--------------------------------------------------------------------------------
/schemas/games.sql:
--------------------------------------------------------------------------------
 1 | CREATE TABLE public.games (
 2 |     game_date_est date,
 3 |     game_id integer NOT NULL,
 4 |     game_status_text text,
 5 |     home_team_id integer,
 6 |     visitor_team_id integer,
 7 |     season integer,
 8 |     team_id_home integer,
 9 |     pts_home real,
10 |     fg_pct_home real,
11 |     ft_pct_home real,
12 |     fg3_pct_home real,
13 |     ast_home real,
14 |     reb_home real,
15 |     team_id_away integer,
16 |     pts_away real,
17 |     fg_pct_away real,
18 |     ft_pct_away real,
19 |     fg3_pct_away real,
20 |     ast_away real,
21 |     reb_away real,
22 |     home_team_wins integer
23 | );


--------------------------------------------------------------------------------
/schemas/player_seasons.sql:
--------------------------------------------------------------------------------
 1 | CREATE TABLE public.player_seasons (
 2 |     player_name text NOT NULL,
 3 |     age integer,
 4 |     height text,
 5 |     weight integer,
 6 |     college text,
 7 |     country text,
 8 |     draft_year text,
 9 |     draft_round text,
10 |     draft_number text,
11 |     gp real,
12 |     pts real,
13 |     reb real,
14 |     ast real,
15 |     netrtg real,
16 |     oreb_pct real,
17 |     dreb_pct real,
18 |     usg_pct real,
19 |     ts_pct real,
20 |     ast_pct real,
21 |     season integer NOT NULL
22 | );


--------------------------------------------------------------------------------
/schemas/players.sql:
--------------------------------------------------------------------------------
 1 |  CREATE TYPE season_stats AS (
 2 |                          season Integer,
 3 |                          pts REAL,
 4 |                          ast REAL,
 5 |                          reb REAL,
 6 |                          weight INTEGER
 7 |                        );
 8 |  CREATE TYPE scorer_class AS
 9 |      ENUM ('bad', 'average', 'good', 'star');
10 | 
11 | 
12 |  CREATE TABLE players (
13 |      player_name TEXT,
14 |      height TEXT,
15 |      college TEXT,
16 |      country TEXT,
17 |      draft_year TEXT,
18 |      draft_round TEXT,
19 |      draft_number TEXT,
20 |      seasons season_stats[],
21 |      scoring_class scorer_class,
22 |      is_active BOOLEAN,
23 |      current_season INTEGER,
24 |      PRIMARY KEY (player_name, current_season)
25 |  );
26 | 
27 | 
28 | 
29 | 


--------------------------------------------------------------------------------
/schemas/players_scd_table.sql:
--------------------------------------------------------------------------------
 1 | create table players_scd_table
 2 | (
 3 | 	player_name text,
 4 | 	scoring_class scorer_class,
 5 | 	is_active boolean,
 6 | 	start_date integer,
 7 | 	end_date integer,
 8 | 	is_current boolean
 9 | );
10 | 
11 | 


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataExpert-io/llm-driven-data-engineering/be4dc8ef785eef8f4b0213c5057d850874ba0e1b/src/__init__.py


--------------------------------------------------------------------------------
/src/generate_dag_script.py:
--------------------------------------------------------------------------------
 1 | import openai
 2 | import os
 3 | from util import get_api_key
 4 | openai.api_key = get_api_key()
 5 | 
 6 | schema_files = os.listdir('schemas')
 7 | 
 8 | all_schemas = {}
 9 | 
10 | for file in schema_files:
11 |     opened_file = open('schemas/' + file, 'r')
12 |     all_schemas[file] = opened_file.read()
13 | 
14 | system_prompt = """
15 |             You are a data engineer looking to generate an Airflow pipeline DAG skeleton 
16 |             without the SQL details
17 |             """
18 | 
19 | user_prompt = f"""
20 |                 Generate a cumulative Airflow DAG that transforms 
21 |                 {all_schemas['player_seasons.sql']}
22 |                 into {all_schemas['players.sql']}
23 |                 use markdown for output and Postgres for queries
24 |                 The DAG depends on last season data from players table 
25 |                 and the DAG depends on past is true
26 |                 Make sure each run scans only one season and does a 
27 |                 FULL OUTER JOIN with the previous seasons data
28 |                 Use the {{ ds }} airflow parameter to filter season 
29 |                 All create table statements should include IF NOT EXISTS
30 |             """
31 | 
32 | print(system_prompt)
33 | print(user_prompt)
34 | 
35 | response = openai.chat.completions.create(
36 |     model="gpt-4",
37 |     messages=[
38 |         {"role": "system", "content": system_prompt},
39 |         {"role": "user", "content": user_prompt}
40 |     ],
41 |     temperature=0
42 | )
43 | answer = response.choices[0].message.content
44 | 
45 | if not os.path.exists('output'):
46 |     os.mkdir('output')
47 | 
48 | output = filter(lambda x: x.startswith('python'), answer.split('```'))
49 | # Open the file with write permissions
50 | with open('output/airflow_dag.py', 'w') as file:
51 |     # Write some data to the file
52 |     file.write('\n'.join(output))
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/src/generate_documentation_script.py:
--------------------------------------------------------------------------------
 1 | import openai
 2 | import os
 3 | from util import get_api_key
 4 | openai.api_key = get_api_key()
 5 | 
 6 | schema_files = os.listdir('schemas')
 7 | 
 8 | all_schemas = {}
 9 | 
10 | for file in schema_files:
11 |     opened_file = open('schemas/' + file, 'r')
12 |     all_schemas[file] = opened_file.read()
13 | 
14 | system_prompt = """You are a data engineer looking to create documentation and example queries for your data sets"""
15 | 
16 | user_prompt = f"""Using cumulative table input schema {all_schemas['players.sql']}
17 |                  Generate a pipeline documentation in markdown 
18 |                     that shows how this is generated from 
19 |                 {all_schemas['player_seasons.sql']}
20 |                 make sure to include example queries that use the season stats array
21 |                 make sure to document all columns with column comments
22 |                 make sure to document all created types as well
23 |             """
24 | 
25 | print(system_prompt)
26 | print(user_prompt)
27 | 
28 | response = openai.chat.completions.create(
29 |     model="gpt-4",
30 |     messages=[
31 |         {"role": "system", "content": system_prompt},
32 |         {"role": "user", "content": user_prompt}
33 |     ],
34 |     temperature=0
35 | )
36 | answer = response.choices[0].message.content
37 | 
38 | if not os.path.exists('output'):
39 |     os.mkdir('output')
40 | # Open the file with write permissions
41 | with open('output/documentation.md', 'w') as file:
42 |     # Write some data to the file
43 |     file.write(answer)
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/src/generate_sql_script.py:
--------------------------------------------------------------------------------
 1 | import openai
 2 | import os
 3 | from util import get_api_key
 4 | openai.api_key = get_api_key()
 5 | 
 6 | schema_files = os.listdir('schemas')
 7 | 
 8 | all_schemas = {}
 9 | 
10 | for file in schema_files:
11 |     opened_file = open('schemas/' + file, 'r')
12 |     all_schemas[file] = opened_file.read()
13 | 
14 | system_prompt = """You are a data engineer looking to create a slowly-changing dimension table query"""
15 | 
16 | user_prompt = f"""Using cumulative table input schema {all_schemas['players.sql']}
17 |                     and expected output schema {all_schemas['players_scd_table.sql']} 
18 |                     generate a query to do a slowly-changing dimension 
19 |                     transformation tracking changes on the dimensions is_active and scoring_class, 
20 |                     use markdown and SQL for the transformation
21 |             """
22 | 
23 | print(system_prompt)
24 | print(user_prompt)
25 | 
26 | response = openai.chat.completions.create(
27 |     model="gpt-4",
28 |     messages=[
29 |         {"role": "system", "content": system_prompt},
30 |         {"role": "user", "content": user_prompt}
31 |     ],
32 |     temperature=0
33 | )
34 | print(response)
35 | answer = response.choices[0].message.content
36 | 
37 | 
38 | if not os.path.exists('output'):
39 |     os.mkdir('output')
40 | 
41 | # ```sql
42 | # SELECT * FROM table
43 | # ```
44 | 
45 | output = filter(lambda x: x.startswith('sql'), answer.split('```'))
46 | # Open the file with write permissions
47 | with open('output/player_scd_generation.sql', 'w') as file:
48 |     # Write some data to the file
49 |     file.write('\n'.join(output))
50 | 
51 | 
52 | 


--------------------------------------------------------------------------------
/src/langchain/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataExpert-io/llm-driven-data-engineering/be4dc8ef785eef8f4b0213c5057d850874ba0e1b/src/langchain/__init__.py


--------------------------------------------------------------------------------
/src/langchain/database_query_example.py:
--------------------------------------------------------------------------------
 1 | from langchain import OpenAI, SQLDatabase
 2 | from langchain_experimental.sql import SQLDatabaseChain
 3 | from langchain.agents.agent_toolkits.gmail.toolkit import GmailToolkit
 4 | from langchain.tools.gmail.utils import build_resource_service, get_gmail_credentials
 5 | from langchain.agents import initialize_agent, AgentType
 6 | import os
 7 | API_KEY = os.environ['OPENAI_API_KEY']
 8 | # setup llm
 9 | llm = OpenAI(temperature=0, openai_api_key=API_KEY, model_name='gpt-4')
10 | 
11 | 
12 | def create_gmail_toolkit():
13 |     # Can review scopes here https://developers.google.com/gmail/api/auth/scopes
14 |     # For instance, readonly scope is 'https://www.googleapis.com/auth/gmail.readonly'
15 |     credentials = get_gmail_credentials(
16 |         token_file="token.json",
17 |         scopes=["https://mail.google.com/"],
18 |         client_secrets_file="credentials.json",
19 |     )
20 |     api_resource = build_resource_service(credentials=credentials)
21 |     return GmailToolkit(api_resource=api_resource)
22 | 
23 | 
24 | def create_gmail_agent(llm, toolkit):
25 |     return initialize_agent(
26 |         tools=toolkit.get_tools(),
27 |         llm=llm,
28 |         agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
29 |     )
30 | 
31 | 
32 | # setup email agent if you want to send the results somewhere
33 | # Make sure to have a credentials.json file with a google app with Gmail enabled
34 | # Create a google app here: https://console.cloud.google.com/apis
35 | # The first time this runs, you'll be asked to give permission and it will create a token.json file
36 | YOUR_EMAIL = ''
37 | agent = None
38 | if YOUR_EMAIL:
39 |     toolkit = create_gmail_toolkit()
40 |     agent = create_gmail_agent(llm, toolkit)
41 | 
42 | database_url = os.environ['LANGCHAIN_DATABASE_URL']
43 | # TODO Change to this if you are using the local database
44 | # database_url = 'postgresql+psycopg2://localhost:5432/postgres'
45 | 
46 | # Setup database
47 | db = SQLDatabase.from_uri(
48 |     database_url,
49 |     include_tables=['medals', 'match_details', 'matches', 'medals_matches_players']
50 | )
51 | 
52 | 
53 | # Create db chain
54 | QUERY = """
55 | Given an input question, first create a syntactically correct postgresql query to run, then look at the results of the query and return the answer.
56 | Use the following format:
57 | 
58 | Question: Question here
59 | SQLQuery: SQL Query to run
60 | SQLResult: Result of the SQLQuery
61 | Answer: Final answer here
62 | 
63 | {question}
64 | """
65 | 
66 | # Setup the database chain
67 | db_chain = SQLDatabaseChain(llm=llm, database=db, verbose=True)
68 | 
69 | 
70 | 
71 | def get_prompt():
72 |     print("Type 'exit' to quit")
73 | 
74 |     while True:
75 |         prompt = input("Enter a prompt: ")
76 | 
77 |         if prompt.lower() == 'exit':
78 |             print('Exiting...')
79 |             break
80 |         else:
81 |             try:
82 |                 question = QUERY.format(question=prompt)
83 |                 results = db_chain.run(question)
84 |                 print(results)
85 |                 if agent:
86 |                     agent.run(
87 |                         "Create an email with these results in a table: " + results +
88 |                         " Then send it to {email} with the subject".format(email=YOUR_EMAIL) +
89 |                         " " + prompt
90 |                     )
91 |             except Exception as e:
92 |                 print(e)
93 | 
94 | get_prompt()


--------------------------------------------------------------------------------
/src/langchain/write_a_post_like_zach.py:
--------------------------------------------------------------------------------
 1 | from langchain.llms import OpenAI
 2 | from langchain.prompts import PromptTemplate
 3 | from langchain.chains import LLMChain
 4 | 
 5 | import re
 6 | import csv
 7 | 
 8 | 
 9 | with open('data/zachs_posts.csv') as f:
10 |     reader = csv.DictReader(f)
11 |     topic = 'data'
12 |     all_lines = ""
13 |     for line in reader:
14 |         if topic.lower() in line['ShareCommentary'].lower():
15 |             all_lines += (' ' + re.sub(r'[^a-zA-Z0-9 ]', '', line['ShareCommentary']))
16 |     llm = OpenAI(temperature=0, model_name='gpt-4')
17 |     template = """
18 |             Here are all of Zach Wilson's LinkedIn posts {lines}: 
19 |             Write a new LinkedIn post about {{topic}} in Zach Wilson's voice"""\
20 |     .format(lines=all_lines)
21 |     prompt = PromptTemplate(
22 |         input_variables=["topic"],
23 |         template=template,
24 |     )
25 |     chain = LLMChain(llm=llm, prompt=prompt)
26 |     print(chain.run(topic=topic))
27 | 


--------------------------------------------------------------------------------
/src/util.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | 
4 | # Gets the API key from the environment
5 | def get_api_key():
6 |     API_KEY = os.environ.get("OPENAI_API_KEY")
7 |     if API_KEY is None:
8 |         raise ValueError("You need to specify OPENAI_API_KEY environment variable!")
9 |     return API_KEY


--------------------------------------------------------------------------------