├── .gitignore ├── DataEngRoadmap.png ├── DataEngRoadmap.xml ├── DataEngRoadmapFull.png ├── README.md └── welcomes ├── welcome.go ├── welcome.java ├── welcome.py ├── welcome.scala └── welcome.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /DataEngRoadmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ErdemOzgen/Data-Engineering-Roadmap/dc95f8fee0d8c52018ef54f5f603d8f25293640a/DataEngRoadmap.png -------------------------------------------------------------------------------- /DataEngRoadmap.xml: -------------------------------------------------------------------------------- 1 |  -------------------------------------------------------------------------------- /DataEngRoadmapFull.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ErdemOzgen/Data-Engineering-Roadmap/dc95f8fee0d8c52018ef54f5f603d8f25293640a/DataEngRoadmapFull.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Disclaimer 2 | > The purpose of this roadmap is to give you an idea about the landscape. 
The road map will guide you if you are confused about what to learn next, rather than encouraging you to pick what is hyped and trendy. You should grow some understanding of why one tool would be better suited for some cases than the other and remember that hyped and trendy does not always mean best suited for the job. 3 | ## Give a Star! :star: 4 | If you like or are using this project to learn or start your solution, please give it a star. Thanks! 5 | ## Roadmap 6 | ![Roadmap](./DataEngRoadmap.png) 7 | 8 | ## Programming Languages 9 | * [Python Roadmap](https://github.com/ErdemOzgen/Python-developer-roadmap) 10 | * [Java Roadmap](https://github.com/s4kibs4mi/java-developer-roadmap) 11 | * [Scala Docs](https://docs.scala-lang.org/) 12 | * [Golang Roadmap](https://github.com/Alikhll/golang-developer-roadmap) 13 | 14 | ## Learn Linux 15 | 16 | **There are two main parts to learning Linux: System Administration and Shell Scripting. You can adjust the depth of your learning to your preference.** 17 | * [Linux Bible, 10th Edition from Christopher Negus](https://www.wiley.com/en-us/Linux+Bible,+10th+Edition-p-9781119578895#content-section) 18 | * [Linux Command Line and Shell Scripting Bible, 4th Edition from Richard Blum, Christine Bresnahan](https://www.wiley.com/en-gb/Linux+Command+Line+and+Shell+Scripting+Bible%2C+4th+Edition-p-9781119700937) 19 | 20 | ## Data Structures and Algorithms / System Design 21 | * [Neetcode Leetcode](https://github.com/neetcode-gh/leetcode) ==> use for all languages you learned. 22 | * [Design Patterns](https://refactoring.guru/) 23 | * [Interview University](https://github.com/jwasham/coding-interview-university) 24 | * [Data Structures and Algorithms Book recommendation](https://github.com/jwasham/coding-interview-university#books-for-data-structures-and-algorithms) 25 | 26 | ## SQL 27 | There are a number of good introductory SQL resources available for free and online. 
There are also some paid resources which I recommend for beginners, that are very effective, and well worth expensing in my opinion. 28 | A couple of notes: 29 | - I haven’t used all of these resources, but they come with strong recommendations from around the web, from me, or from my peers. 30 | - You absolutely don’t need to use every single resource. Find a couple that work for you, and go to town. 31 | - You can always reach out to me if you have questions. I always paste this online when people are new to asking very technical questions – it’s not meant to be snarky – it's a gentle guide on how to compose your questions and gather necessary resources in order to best give technical people the information needed to get a quick/effective response: http://www.mikeash.com/getting_answers.html 32 | #### Video/Class/Mini-course based: 33 | 1. [Stanford Self-paced ‘Database’ course](https://class.stanford.edu/courses/DB/2014/SelfPaced/about) 34 | - The original Coursera course has been converted into a series of mini-courses, which are all self-paced, and thorough. 35 | 2. Portnov Computer School "SQL Tutorial for beginners" 36 | This is a mini-course (~4 hours in total) which is said to be quite good. 37 | Links: 38 | - Video 1: https://www.youtube.com/watch?v=xaRrTBmMp30 39 | - Video 2: https://www.youtube.com/watch?v=1sMR2ApQVvw 40 | - Video 3: https://www.youtube.com/watch?v=deegPjmasq8 41 | - Video 4: https://www.youtube.com/watch?v=vHE-EeLaYsI 42 | #### Book/Tutorial Format (some interactive): 43 | 1. [SQL Problems and Solutions – Interactive book](http://www.sql-tutorial.ru/) 44 | “…student[sic] can ask questions and get the answers even if such answers cannot be found in the textbook. To a certain extent interactive textbook is intended to substitute a teacher/advisor, which is, to our mind, indispensable requirement for the use of such teaching materials within the system of distance learning" 45 | 2. 
[Learn SQL The Hard Way](http://sql.learncodethehardway.org/) 46 | "This book will teach you the 80% of SQL you probably need to use it effectively, and will mix in concepts in data modeling at the same time. If you've been fumbling around building web, desktop, or mobile applications because you don't know SQL, then this book is for you. It is written for people with no prior database, programming, or SQL knowledge, but knowing at least one programming language will help." 47 | 3. [GalaXQL](http://sourceforge.net/projects/galaxql/) 48 | "GalaXQL is a fun SQL tutorial where the database is a galaxy of stars that is rendered in 3D. Watch the galaxy change as your SQL commands create, modify, and destroy heavenly objects. What could be more fun?" 49 | 4. [PostgreSQL Tutorial](http://www.postgresqltutorial.com/) 50 | "We developed the PostgreSQL tutorial to demonstrate the unique features of PostgreSQL that make it the most advanced open source database management system in the world. In addition, we will show you how to leverage those features to make your application faster and more secure." 51 | 5. [Head First SQL](http://www.headfirstlabs.com/books/hfsql/) 52 | An excellent resource for beginners that I went through years ago. I highly recommend picking up a copy if you truly want to start at the ground level. It’s a big book, but the font size is large, and there are exercises / pictures etc. It takes about 1-2 days to get through, maybe a week spread out. 53 | “Is your data dragging you down? Are your tables all tangled up? Well we've got the tools to teach you just how to wrangle your databases into submission. Using the latest research in neurobiology, cognitive science, and learning theory to craft a multi-sensory SQL learning experience, Head First SQL has a visually rich format designed for the way your brain works, not a text-heavy approach that puts you to sleep. 54 | Maybe you've written some simple SQL queries to interact with databases. 
But now you want more, you want to really dig into those databases and work with your data. Head First SQL will show you the fundamentals of SQL and how to really take advantage of it. We'll take you on a journey through the language, from basic INSERT statements and SELECT queries to hardcore database manipulation with indices, joins, and transactions. We all know "Data is Power"—but we'll show you how to have "Power over your Data". Expect to have fun, expect to learn, and expect to be querying, normalizing, and joining your data like a pro by the time you're finished reading!" 55 | #### Practice resources: 56 | 1. [SchemaVerse](https://schemaverse.com/) 57 | "The Schemaverse is a space-based strategy game implemented entirely within a PostgreSQL database. Compete against other players using raw SQL commands to command your fleet. Or, if your PL/pgSQL-foo is strong, wield it to write AI and have your fleet command itself!" 58 | 2. [SqlEx](http://www.sql-ex.ru/) 59 | An extension of the sql-tutorial.ru book with practice exercises. 60 | 3. [SQLZoo](http://sqlzoo.net/wiki/Main_Page) 61 | Some tutorials and practice exercises 62 | 4. [PostgreSQL Exercises](http://pgexercises.com/index.html) 63 | "This site was born when I noticed that there's a load of material out there to help people learn about SQL, but not a great deal to make it easy to learn by doing. PGExercises provides a series of questions and explanations built on a single, simple dataset. It's designed for use as a partner to a good book or Postgres' excellent documentation. 64 | The exercises on this site range from simple select and where clauses, through joins and case statements, and on to aggregations, window functions, and recursive queries. Most people who aren't already pros should find something to test themselves with." 
65 | 66 | ## Testing 67 | 68 | * Unit Testing 69 | * Integration testing 70 | * Functional testing 71 | * [Agile Testing: A Practical Guide for Testers and Agile Teams](https://www.amazon.com/Agile-Testing-Practical-Guide-Testers/dp/0321534468) 72 | * [The Art of Software Testing](https://www.amazon.com/Art-Software-Testing-Glenford-Myers/dp/1118031962) 73 | 74 | 75 | ## CI/CD and Virtualization 76 | 77 | * [Ansible](https://www.ansible.com/) 78 | * [Jenkins](https://www.jenkins.io/) 79 | * [Docker](https://www.docker.com/) 80 | * [Kubernetes](https://kubernetes.io/) 81 | * [Terraform](https://www.terraform.io/) 82 | * [AWS CDK](https://aws.amazon.com/cdk/) 83 | 84 | ## Database Fundamentals 85 | 86 | * [SQL](#sql) 87 | * Normalisation 88 | * ACID transactions 89 | * CAP Theorem 90 | * OLTP vs OLAP 91 | * Horizontal vs Vertical Scaling 92 | * Dimensional Modeling 93 | 94 | ## Relational Databases 95 | 96 | * [MySQL](https://www.mysql.com/) 97 | * [PostgreSQL](https://www.postgresql.org/) 98 | * [MariaDB](https://mariadb.org/) 99 | * [Amazon Aurora](https://aws.amazon.com/rds/aurora/) 100 | 101 | ## Non-Relational Databases 102 | 103 | * Document 104 | * [MongoDB](https://www.mongodb.com/) 105 | * [Elasticsearch](https://www.elastic.co/) 106 | * [Apache CouchDB](https://couchdb.apache.org/) 107 | * [Azure CosmosDB](https://learn.microsoft.com/en-us/azure/cosmos-db/) 108 | * Wide Column 109 | * [Apache Cassandra](https://cassandra.apache.org/_/index.html) 110 | * [Apache HBase](https://hbase.apache.org/) 111 | * [Google Bigtable](https://cloud.google.com/bigtable) 112 | * Graph 113 | * [Neo4j](https://neo4j.com/) 114 | * [Amazon Neptune](https://aws.amazon.com/neptune/) 115 | * Key-Value 116 | * [Redis](https://redis.io/) 117 | * [Memcached](https://memcached.org/) 118 | * [Amazon DynamoDB](https://aws.amazon.com/dynamodb/) 119 | 120 | ## Data Processing 121 | 122 | * Batch 123 | * [Apache Pig](https://pig.apache.org/) 124 | * [Data Build 
Tool](https://www.getdbt.com/) 125 | 126 | * Stream 127 | * [Apache Kafka](https://kafka.apache.org/) 128 | * [Apache Storm](https://storm.apache.org/) 129 | 130 | * Hybrid 131 | * [Apache Spark](https://spark.apache.org/) 132 | * [Apache Beam](https://beam.apache.org/) 133 | * [Apache Flink](https://flink.apache.org/) 134 | 135 | ## Messaging 136 | 137 | * [RabbitMQ](https://www.rabbitmq.com/) 138 | * [Apache ActiveMQ](https://activemq.apache.org/) 139 | 140 | ## Cluster Computing Fundamentals 141 | 142 | * [Apache Hadoop](https://hadoop.apache.org/) 143 | * [HDFS](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html) 144 | * [MapReduce](https://www.ibm.com/topics/mapreduce) 145 | * [Lambda & Kappa Architecture](https://towardsdatascience.com/a-brief-introduction-to-two-data-processing-architectures-lambda-and-kappa-for-big-data-4f35c28005bb) 146 | * [Managed Hadoop](https://aws.amazon.com/emr/features/hadoop/) 147 | 148 | 149 | ## Object storage 150 | 151 | * [AWS S3](https://aws.amazon.com/s3/) 152 | * [Google Cloud Storage](https://cloud.google.com/storage) 153 | 154 | ## Datawarehouses 155 | 156 | * [Snowflake](https://www.snowflake.com/en/) 157 | * [Amazon Redshift](https://aws.amazon.com/redshift/) 158 | * [Apache Hive](https://hive.apache.org/) 159 | * [Google BigQuery](https://cloud.google.com/bigquery) 160 | * [ClickHouse](https://github.com/ClickHouse/ClickHouse) 161 | 162 | 163 | 164 | ## Monitoring Datapipelines 165 | 166 | * [Prometheus](https://prometheus.io/) 167 | * [Datadog](https://www.datadoghq.com/) 168 | * [Sentry](https://sentry.io/welcome/) 169 | 170 | 171 | 172 | ## Data Visualization 173 | 174 | * [Jupyter Notebook](https://jupyter.org/) 175 | * [Looker](https://www.looker.com/) 176 | * [Grafana](https://grafana.com/) 177 | * [Tableau](https://www.tableau.com/) 178 | 179 | 180 | ## Machine Learning and Deep Learning Tools 181 | 182 | * Math 183 | * Statistics and Probability 184 | * 
[Pandas](https://pandas.pydata.org/) 185 | * [NumPy](https://numpy.org/) 186 | * [Tensorflow](https://www.tensorflow.org/) 187 | * [Keras](https://keras.io/) 188 | * [Pytorch](https://pytorch.org/) 189 | * [Scikit-learn](https://scikit-learn.org/stable/) 190 | * #### Data Science Resources 191 | - :books: [DPhi-Data Science Courses](https://dphi.tech/) 192 | - :books: [Data Science Methodology](https://cognitiveclass.ai/courses/data-science-methodology-2) 193 | - :bulb: [Data Science Cheat Sheets](https://www.kaggle.com/timoboz/data-science-cheat-sheets) 194 | - :bulb: [Data Science Roadmap](https://www.scaler.com/blog/data-science-roadmap/) 195 | - :books: [IBM Data Science Coursera](https://www.coursera.org/professional-certificates/ibm-data-science) 196 | - :video_camera: [Introduction to Data Science with R](https://www.youtube.com/watch?v=32o0DnuRjfg&list=PLTJTBoU5HOCRrTs3cJK-PbHM39cwCU0PF&index=1) 197 | - :bulb: [Machine Learning Algorithms from Scratch](https://github.com/python-engineer/MLfromscratch) 198 | - :books: [Python for Data Science: Fundamentals](https://www.dataquest.io/course/python-for-data-science-fundamentals) 199 | - :books: [Python for Data Science: Intermediate](https://www.dataquest.io/course/python-for-data-science-intermediate/) 200 | 201 | 202 | * #### Machine Learning Resources 203 | - :books: [Google's Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/) 204 | - :books: [Andrew Ng's Machine Learning Coursera Course](https://www.coursera.org/learn/machine-learning) 205 | - :books: [Intro to Machine Learning](https://www.kaggle.com/learn/intro-to-machine-learning) 206 | - :books: [Intermediate Machine Learning](https://www.kaggle.com/learn/intermediate-machine-learning) 207 | - :books: [Understanding Machine Learning: From Theory to Algorithms](https://www.cs.huji.ac.il/~shais/UnderstandingMachineLearning/copy.html) 208 | - :books: [Probability and 
Statistics](https://ocw.mit.edu/courses/mathematics/18-05-introduction-to-probability-and-statistics-spring-2014/index.htm) 209 | - :books: [freecodecamp's courses for machine learning](https://www.freecodecamp.org/learn/) 210 | - :bulb: [A quick review of the linear algebra concepts relevant to machine learning.](http://www.deeplearningbook.org/contents/linear_algebra.html) 211 | - :books: [Calculus](http://cs231n.stanford.edu/handouts/derivatives.pdf) 212 | - :books: [Statistical concepts for machine learning](http://www-bcf.usc.edu/~gareth/ISL/) 213 | - :bulb: [AWS Machine Learning Tools](https://www-freecodecamp-org.cdn.ampproject.org/c/s/www.freecodecamp.org/news/aws-machine-learning-tools-the-complete-guide/amp/) 214 | - :green_book: [Introductory Primer](https://www.toptal.com/machine-learning/machine-learning-theory-an-introductory-primer) 215 | - :bulb: [Machine Learning Roadmap](https://whimsical.com/machine-learning-roadmap-2020-CA7f3ykvXpnJ9Az32vYXva) 216 | - :bulb: [Machine Learning Roadmap: A Step by Step Guide](https://www.scaler.com/blog/machine-learning-roadmap/) 217 | - :books: [TinyML Course](https://www.edx.org/professional-certificate/harvardx-tiny-machine-learning) 218 | - :books: [Machine Learning Introduction with Python](https://www.dataquest.io/path/machine-learning-intro-with-python/) 219 | - :speaker: [Machine Learning Guide Podcast](https://ocdevel.com/mlg) 220 | 221 | 222 | * ##### Deep Learning Resources 223 | - :books: [Andrew Ng's Deep Learning Specializations Course](https://www.coursera.org/specializations/deep-learning) 224 | - :green_book: [Convolutional networks CS231n](https://cs231n.github.io/convolutional-networks/) 225 | - :books: [Deep Learning Fundamentals](https://www.dataquest.io/course/deep-learning-fundamentals/) 226 | - :bulb: [Deep learning cheat sheet](https://stanford.edu/~shervine/teaching/cs-229/cheatsheet-deep-learning) 227 | - :books: [Natural Language Processing CS224n](http://web.stanford.edu/class/cs224n/) 
228 | - :books: [IBM Deep Learning Course with certification](https://cognitiveclass.ai/courses/introduction-deep-learning) 229 | 230 | 231 | ## MLOPS tools 232 | 233 | * [Tensorflow Extended](https://www.tensorflow.org/tfx) 234 | * [Kubeflow](https://www.kubeflow.org/) 235 | * [Amazon SageMaker](https://aws.amazon.com/sagemaker/) 236 | * [MLflow](https://mlflow.org/) 237 | 238 | ## Cloud 239 | 240 | * [AWS](https://aws.amazon.com/) 241 | * [Google Cloud](https://cloud.google.com/) 242 | * [Azure](https://azure.microsoft.com/en-us/) 243 | 244 | 245 | ## Wrap Up 246 | If you think the roadmap can be improved, please do open a PR with any updates and submit any issues. Also, I will continue to improve this, so you might want to star this repository to revisit. 247 | Idea from: [Python Developer Roadmap](https://github.com/ErdemOzgen/Python-developer-roadmap) 248 | ## Contribution 249 | The roadmap is built using [Draw.io](https://www.draw.io/). The project file is `DataEngRoadmap.xml`. To modify it, open draw.io, click **Open Existing Diagram** and choose the project's `xml` file. It will open the roadmap for you. Update it, then export the images and update them in the readme, and create a PR (export as png with 400% zoom and minify that with [Compressor.io](https://compressor.io/compress)). 
250 | - Open a pull request with improvements 251 | - Discuss ideas in issues 252 | - Spread the word 253 | -------------------------------------------------------------------------------- /welcomes/welcome.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "fmt" 4 | 5 | func main() { 6 | fmt.Println("Welcome to Data Engineer Roadmap!") 7 | } 8 | -------------------------------------------------------------------------------- /welcomes/welcome.java: -------------------------------------------------------------------------------- 1 | package welcomes; 2 | class welcome { 3 | public static void main(String[] args) { 4 | System.out.println("Welcome to Data Engineer Roadmap!"); 5 | } 6 | } -------------------------------------------------------------------------------- /welcomes/welcome.py: -------------------------------------------------------------------------------- 1 | print("Welcome to Data Engineer Roadmap!") -------------------------------------------------------------------------------- /welcomes/welcome.scala: -------------------------------------------------------------------------------- 1 | object welcome { 2 | def main(args: Array[String]) = { 3 | println("Welcome to Data Engineer Roadmap!") 4 | } 5 | } -------------------------------------------------------------------------------- /welcomes/welcome.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Welcome to Data Engineer Roadmap!" --------------------------------------------------------------------------------