├── quickstart_etl
│   ├── assets
│   │   ├── __init__.py
│   │   └── hackernews.py
│   └── __init__.py
├── quickstart_etl_tests
│   ├── __init__.py
│   └── test_assets.py
├── setup.cfg
├── dagster_cloud.yaml
├── pyproject.toml
├── setup.py
├── .gitignore
├── .github
│   └── workflows
│       ├── deploy.yml
│       └── branch_deployments.yml
└── README.md

--------------------------------------------------------------------------------
/quickstart_etl/assets/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/quickstart_etl_tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/quickstart_etl_tests/test_assets.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
name = quickstart_etl

--------------------------------------------------------------------------------
/dagster_cloud.yaml:
--------------------------------------------------------------------------------
locations:
  - location_name: quickstart_etl
    code_source:
      package_name: quickstart_etl

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[tool.dagster]
module_name = "quickstart_etl"

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import find_packages, setup

setup(
    name="quickstart_etl",
    packages=find_packages(exclude=["quickstart_etl_tests"]),
    install_requires=[
        "dagster",
        "dagster-cloud",
        "boto3",
        "pandas",
        "matplotlib",
        "textblob",
        "tweepy",
        "wordcloud",
    ],
    extras_require={"dev": ["dagit", "pytest"]},
)
--------------------------------------------------------------------------------
/quickstart_etl/__init__.py:
--------------------------------------------------------------------------------
from dagster import (
    Definitions,
    ScheduleDefinition,
    define_asset_job,
    load_assets_from_package_module,
)

from . import assets

daily_refresh_schedule = ScheduleDefinition(
    job=define_asset_job(name="all_assets_job"), cron_schedule="0 0 * * *"
)

defs = Definitions(
    assets=load_assets_from_package_module(assets), schedules=[daily_refresh_schedule]
)
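Note: `define_asset_job(name="all_assets_job")` is given no selection, so the scheduled job targets every asset in this code location, and the cron string `0 0 * * *` fires once a day at midnight (UTC unless a timezone is set). As a hedged sketch — `AssetSelection` is part of Dagster's public API, but this job is illustrative and not part of the project — the schedule could be narrowed to just the `hackernews` asset group:

from dagster import AssetSelection, ScheduleDefinition, define_asset_job

# Sketch: limit the scheduled job to the "hackernews" group instead of all assets.
hackernews_job = define_asset_job(
    name="hackernews_job",
    selection=AssetSelection.groups("hackernews"),
)

# Same daily-at-midnight cadence as daily_refresh_schedule above.
hackernews_daily_schedule = ScheduleDefinition(
    job=hackernews_job, cron_schedule="0 0 * * *"
)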
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
name: Serverless Prod Deployment
on:
  push:
    branches:
      - "main"
      - "master"
concurrency:
  # Cancel in-progress deploys to the main branch
  group: ${{ github.ref }}
  cancel-in-progress: true
env:
  DAGSTER_CLOUD_URL: ${{ secrets.DAGSTER_CLOUD_URL }}
  DAGSTER_CLOUD_API_TOKEN: ${{ secrets.DAGSTER_CLOUD_API_TOKEN }}
  ENABLE_FAST_DEPLOYS: 'true'

jobs:
  dagster_cloud_default_deploy:
    name: Dagster Serverless Deploy
    runs-on: ubuntu-20.04
    outputs:
      build_info: ${{ steps.parse-workspace.outputs.build_info }}

    steps:
      - name: Parse cloud workspace
        if: env.ENABLE_FAST_DEPLOYS != 'true'
        id: parse-workspace
        uses: dagster-io/dagster-cloud-action/actions/utils/parse_workspace@v0.1
        with:
          dagster_cloud_file: dagster_cloud.yaml

      - name: Checkout
        if: env.ENABLE_FAST_DEPLOYS == 'true'
        uses: actions/checkout@v3
        with:
          ref: ${{ github.head_ref }}
          path: project-repo

      - name: Build and deploy Python executable
        if: env.ENABLE_FAST_DEPLOYS == 'true'
        uses: dagster-io/dagster-cloud-action/actions/build_deploy_python_executable@pex-v0.1
        with:
          dagster_cloud_file: "$GITHUB_WORKSPACE/project-repo/dagster_cloud.yaml"
          build_output_dir: "$GITHUB_WORKSPACE/build"
          python_version: "3.8"
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

  dagster_cloud_docker_deploy:
    name: Dagster Serverless Docker Deploy
    runs-on: ubuntu-20.04
    if: needs.dagster_cloud_default_deploy.outputs.build_info
    needs: dagster_cloud_default_deploy
    strategy:
      fail-fast: false
      matrix:
        location: ${{ fromJSON(needs.dagster_cloud_default_deploy.outputs.build_info) }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          ref: ${{ github.head_ref }}
      - name: Build and deploy to Dagster Cloud serverless
        uses: dagster-io/dagster-cloud-action/actions/serverless_prod_deploy@v0.1
        with:
          dagster_cloud_api_token: ${{ secrets.DAGSTER_CLOUD_API_TOKEN }}
          location: ${{ toJson(matrix.location) }}
          # Uncomment to pass through GitHub Actions secrets as a JSON string of key-value pairs
          # env_vars: ${{ toJson(secrets) }}
          organization_id: ${{ secrets.ORGANIZATION_ID }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--------------------------------------------------------------------------------
/.github/workflows/branch_deployments.yml:
--------------------------------------------------------------------------------
name: Serverless Branch Deployments
on:
  pull_request:
    types: [opened, synchronize, reopened, closed]
concurrency:
  # Cancel in-progress runs on the same branch
  group: ${{ github.ref }}
  cancel-in-progress: true
env:
  DAGSTER_CLOUD_URL: ${{ secrets.DAGSTER_CLOUD_URL }}
  DAGSTER_CLOUD_API_TOKEN: ${{ secrets.DAGSTER_CLOUD_API_TOKEN }}
  ENABLE_FAST_DEPLOYS: 'true'

jobs:
  dagster_cloud_default_deploy:
    name: Dagster Serverless Deploy
    runs-on: ubuntu-20.04
    outputs:
      build_info: ${{ steps.parse-workspace.outputs.build_info }}

    steps:
      - name: Parse cloud workspace
        if: env.ENABLE_FAST_DEPLOYS != 'true'
        id: parse-workspace
        uses: dagster-io/dagster-cloud-action/actions/utils/parse_workspace@v0.1
        with:
          dagster_cloud_file: dagster_cloud.yaml

      - name: Checkout
        if: env.ENABLE_FAST_DEPLOYS == 'true'
        uses: actions/checkout@v3
        with:
          ref: ${{ github.head_ref }}
          path: project-repo

      - name: Build and deploy Python executable
        if: env.ENABLE_FAST_DEPLOYS == 'true'
        uses: dagster-io/dagster-cloud-action/actions/build_deploy_python_executable@pex-v0.1
        with:
          dagster_cloud_file: "$GITHUB_WORKSPACE/project-repo/dagster_cloud.yaml"
          build_output_dir: "$GITHUB_WORKSPACE/build"
          python_version: "3.8"
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

  dagster_cloud_docker_deploy:
    name: Dagster Serverless Docker Deploy
    runs-on: ubuntu-20.04
    if: needs.dagster_cloud_default_deploy.outputs.build_info
    needs: dagster_cloud_default_deploy
    strategy:
      fail-fast: false
      matrix:
        location: ${{ fromJSON(needs.dagster_cloud_default_deploy.outputs.build_info) }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          ref: ${{ github.head_ref }}
      - name: Build and deploy to Dagster Cloud serverless
        uses: dagster-io/dagster-cloud-action/actions/serverless_branch_deploy@v0.1
        with:
          dagster_cloud_api_token: ${{ secrets.DAGSTER_CLOUD_API_TOKEN }}
          location: ${{ toJson(matrix.location) }}
          # Uncomment to pass through GitHub Actions secrets as a JSON string of key-value pairs
          # env_vars: ${{ toJson(secrets) }}
          organization_id: ${{ secrets.ORGANIZATION_ID }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--------------------------------------------------------------------------------
/quickstart_etl/assets/hackernews.py:
--------------------------------------------------------------------------------
import base64
from io import BytesIO
from typing import List

import matplotlib.pyplot as plt
import pandas as pd
import requests
from wordcloud import STOPWORDS, WordCloud

from dagster import MetadataValue, OpExecutionContext, asset


@asset(group_name="hackernews", compute_kind="HackerNews API")
def hackernews_topstory_ids() -> List[int]:
    """
    Get up to 500 top stories from the HackerNews topstories endpoint.

    API Docs: https://github.com/HackerNews/API#new-top-and-best-stories
    """
    newstories_url = "https://hacker-news.firebaseio.com/v0/topstories.json"
    top_500_newstories = requests.get(newstories_url).json()
    return top_500_newstories


@asset(group_name="hackernews", compute_kind="HackerNews API")
def hackernews_topstories(
    context: OpExecutionContext, hackernews_topstory_ids: List[int]
) -> pd.DataFrame:
    """
    Get items based on story IDs from the HackerNews items endpoint. It may take
    1-2 minutes to fetch all 500 items.

    API Docs: https://github.com/HackerNews/API#items
    """
    results = []
    for item_id in hackernews_topstory_ids:
        item = requests.get(f"https://hacker-news.firebaseio.com/v0/item/{item_id}.json").json()
        results.append(item)
        if len(results) % 20 == 0:
            context.log.info(f"Got {len(results)} items so far.")

    df = pd.DataFrame(results)

    # Dagster supports attaching arbitrary metadata to asset materializations. This metadata is
    # shown in the run logs and displayed on the "Activity" tab of the "Asset Details" page in
    # the UI, which is useful for monitoring and maintaining the asset as you iterate.
    # Read more about asset metadata at
    # https://docs.dagster.io/concepts/assets/software-defined-assets#recording-materialization-metadata
    context.add_output_metadata(
        {
            "num_records": len(df),
            "preview": MetadataValue.md(df.head().to_markdown()),
        }
    )
    return df


@asset(group_name="hackernews", compute_kind="Plot")
def hackernews_topstories_word_cloud(
    context: OpExecutionContext, hackernews_topstories: pd.DataFrame
) -> bytes:
    """
    Exploratory analysis: generate a word cloud from the current top 500 HackerNews
    stories, and embed the plot in Markdown metadata for a quick view.

    Read more about how to create word clouds at http://amueller.github.io/word_cloud/.
    """
    stopwords = set(STOPWORDS)
    stopwords.update(["Ask", "Show", "HN"])
    titles_text = " ".join([str(item) for item in hackernews_topstories["title"]])
    titles_cloud = WordCloud(stopwords=stopwords, background_color="white").generate(titles_text)

    # Generate the word cloud image
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(titles_cloud, interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)

    # Save the image to a buffer and embed it in Markdown content for a quick view
    buffer = BytesIO()
    plt.savefig(buffer, format="png")
    image_data = base64.b64encode(buffer.getvalue())
    md_content = f"![img](data:image/png;base64,{image_data.decode()})"

    # Attach the Markdown content as metadata to the asset
    # Read more about metadata types at https://docs.dagster.io/_apidocs/ops#metadata-types
    context.add_output_metadata({"plot": MetadataValue.md(md_content)})

    return image_data
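Note: `quickstart_etl_tests/test_assets.py` is empty. A minimal sketch of a first test (illustrative, not part of the repository) could stub the HackerNews API and invoke the asset directly — Dagster's `@asset`-decorated functions support direct invocation in unit tests:

import requests

from quickstart_etl.assets.hackernews import hackernews_topstory_ids


class _FakeResponse:
    """Stand-in for requests.Response that returns canned JSON."""

    def __init__(self, payload):
        self._payload = payload

    def json(self):
        return self._payload


def test_hackernews_topstory_ids(monkeypatch):
    # Stub the network call so the test runs offline and deterministically.
    monkeypatch.setattr(requests, "get", lambda url: _FakeResponse([101, 102, 103]))
    # An asset with no context argument can be called like a plain function.
    assert hackernews_topstory_ids() == [101, 102, 103]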
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Dagster starter kit

This example is a starter kit for building a daily ETL pipeline. At a high level, this project shows how to ingest data from external sources, explore and transform the data, and materialize outputs that help visualize the data.

*New to Dagster? Learn what Dagster is [in Concepts](https://docs.dagster.io/concepts) or [in the hands-on Tutorials](https://docs.dagster.io/tutorial).*

This guide covers:
- [Dagster starter kit](#dagster-starter-kit)
  - [Introduction](#introduction)
  - [Getting started](#getting-started)
    - [Option 1: Deploying it on Dagster Cloud](#option-1-deploying-it-on-dagster-cloud)
    - [Option 2: Running it locally](#option-2-running-it-locally)
  - [Step 1: Materializing assets](#step-1-materializing-assets)
  - [Step 2: Viewing and monitoring assets](#step-2-viewing-and-monitoring-assets)
  - [Step 3: Scheduling a daily job](#step-3-scheduling-a-daily-job)
    - [(Optional) Running daemon locally](#optional-running-daemon-locally)
  - [Learning more](#learning-more)
    - [Changing the code locally](#changing-the-code-locally)
    - [Using environment variables and secrets](#using-environment-variables-and-secrets)
    - [Adding new Python dependencies](#adding-new-python-dependencies)
    - [Testing](#testing)

## Introduction

This starter kit includes:
- Basics of creating, connecting, and testing [assets](https://docs.dagster.io/concepts/assets/software-defined-assets) in Dagster.
- Convenient ways to organize and monitor assets, e.g. [grouping assets](https://docs.dagster.io/concepts/assets/software-defined-assets#grouping-assets), [recording asset metadata](https://docs.dagster.io/concepts/assets/software-defined-assets#recording-materialization-metadata), etc.
- A [schedule](https://docs.dagster.io/concepts/partitions-schedules-sensors/schedules) defined to run a job that generates assets daily.
- A [scaffolded project layout](https://docs.dagster.io/getting-started/create-new-project) that helps you get started quickly with everything set up.

In this project, we're building an analytical pipeline that explores popular topics on HackerNews.
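As a hedged sketch of the pieces above, assuming the standard Dagster tooling: after installing the package with its dev extras (`pip install -e ".[dev]"`), the UI can be launched from the project root with `dagit` (pinned in `setup.py`), which loads `quickstart_etl` via the `[tool.dagster]` section of `pyproject.toml`. The assets can also be materialized from plain Python with Dagster's `materialize` helper:

```python
# Illustrative sketch: materialize all three assets in-process.
# Note: this makes live calls to the HackerNews API and may take 1-2 minutes.
from dagster import materialize

from quickstart_etl.assets.hackernews import (
    hackernews_topstories,
    hackernews_topstories_word_cloud,
    hackernews_topstory_ids,
)

if __name__ == "__main__":
    result = materialize(
        [hackernews_topstory_ids, hackernews_topstories, hackernews_topstories_word_cloud]
    )
    assert result.success
```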