├── .gitignore ├── Prefect-server-setup.md ├── PyData Amsterdam 24 - Prefect Workshop - Adam Hill.pdf ├── README.md ├── config ├── __init__.py ├── kafka_config.py ├── mongodb_config.py └── redis_config.py ├── data └── airline_tweets.csv ├── docker-compose.yml ├── environment.yml ├── example_create_deployment.py ├── exercises ├── e01_my_first_flow.py ├── e02a_sentiment_pipeline_v1.py ├── e02b_sentiment_pipeline_v2.py ├── e03a_kafka_tweet_publisher.py ├── e03b_kafka_tweet_deployment.py └── e04_sentiment_pipeline_v3.py ├── fastapi_app ├── Dockerfile ├── data │ └── airline_tweets.csv ├── main.py └── requirements.txt ├── images ├── docker-compose-down.png ├── docker-compose-output.png ├── mongo-create-project.png ├── mongodb-atlas.png ├── mongodb-cluster-connect.png ├── mongodb-create-cluster.png ├── mongodb-deploy-cluster.png ├── mongodb-new-collection.png └── mongodb-success-cluster.png ├── notes.md ├── poetry.lock ├── pyproject.toml ├── requirements.txt ├── solution ├── __init__.py ├── s01_my_first_flow.py ├── s02a_sentiment_pipeline_v1.py ├── s02b_sentiment_pipeline_v2.py ├── s03a_kafka_tweet_publisher.py ├── s03b_kafka_tweet_deployment.py └── s04_sentiment_pipeline_v3.py ├── streamlit_app ├── Dockerfile ├── Tweets.csv ├── app.py ├── db_connection.py └── requirements.txt ├── test_script.py └── tests ├── __init__.py ├── test_s01.py ├── test_s02b_data_processing.py └── test_s02b_sentiment_analysis.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python 
script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | 164 | .DS_Store 165 | -------------------------------------------------------------------------------- /Prefect-server-setup.md: -------------------------------------------------------------------------------- 1 | # Getting the Prefect server up and running 2 | 3 | ## Setup the database 4 | Make sure you have the PostgreSQL DB running using Docker Compose 5 | ``` 6 | docker-compose up --build --force-recreate --remove-orphans 7 | ``` 8 | 9 | ## Setup and launch Prefect 10 | Type the following commands on the command line, with the environment in which you installed Prefect activated 11 | ``` 12 | prefect server database reset -y 13 | prefect config set PREFECT_API_DATABASE_CONNECTION_URL="postgresql+asyncpg://postgres:password@localhost:5432/prefect_server" 14 | prefect config set PREFECT_API_URL="http://127.0.0.1:4200/api" 15 | prefect config view --show-sources 16 | prefect server start 17 | ``` 18 | 19 | ## Code setup 20 | You need to download the `spacy` model that we will be using: 21 | python -m spacy download en_core_web_sm -------------------------------------------------------------------------------- /PyData Amsterdam 24 - Prefect Workshop - Adam Hill.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cadarn/PyData-Prefect-Workshop/dd8374cbacc541f7db48356e9d020917db58ad3b/PyData Amsterdam 24 - Prefect Workshop - Adam Hill.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Prefect Workshop - PyData Amsterdam 2024 2 | 3 | To participate fully in this workshop you will need the following: 4 | 1. Clone this repo to your local machine! 5 | 2. A local Python environment with the required libraries installed; I recommend using a new, clean environment to minimise risk of library conflicts; see details below. 6 | 3. 
Docker installed on your local machine; see details below for pulling the appropriate docker images needed in advance of the workshop. 7 | 4. Create an account on [MongoDB Atlas](https://www.mongodb.com/cloud/atlas/register) - we will be using the free tier 8 | 5. Test that we have set things up correctly. 9 | 10 | ## 1. Clone this repo 11 | Make a local copy of the codebase. 12 | 13 | ## 2. Building a new Python environment 14 | Hopefully you are familiar with creating new Python environments using your preferred environment manager; to assist you we have provided three different approaches to build an environment that will work with the materials in the workshop. We will be using a `Python 3.12` environment; it should work with any recent version but we haven't tested it. We will be using the latest release of `Prefect` which is `v3`. 15 | - For `conda` and `mamba` users you can use the included `environment.yml` file to construct a new environment using: 16 | 17 | conda env create -f environment.yml 18 | 19 | - If you are a `poetry` user the install requirements are captured in the `pyproject.toml` and `poetry.lock` files. 20 | - If you want to install using `pip` you can use the `requirements.txt` file. 21 | 22 | ### 2.1 Pull the spaCy model 23 | During the workshop we will be using a [spaCy](https://spacy.io/models) model for sentiment analysis. This needs to be downloaded after spaCy has been installed. Once your Python environment is set up, activate the environment and run the following: 24 | 25 | python -m spacy download en_core_web_sm 26 | 27 | ## 3. Pulling the requisite Docker images 28 | 29 | We will be using Docker to simulate several different data services as part of the workshop and so we need to pull some images and build some local images. If you don't have Docker installed then [refer to the Docker install instructions](https://docs.docker.com/get-started/get-docker/). 
30 | 31 | - Make sure you have changed directory to the top-level of your clone of this repo where the `docker-compose.yml` file is. 32 | - Create an empty `.env` file in the top-level of the directory; you will edit this when setting up MongoDB Atlas. 33 | - Run the following command from a terminal in this directory: 34 | 35 | docker compose up --build --force-recreate --remove-orphans -d 36 | 37 | You should see an output in the terminal that looks something like: ![docker compose up](images/docker-compose-output.png) 38 | 39 | - Run the command `docker ps` to confirm the 5 services are running 40 | - Stop everything until the workshop by typing: 41 | 42 | docker compose down 43 | 44 | You should see an output in the terminal that indicates the services have been removed: ![docker compose down](images/docker-compose-down.png) 45 | 46 | ## 4. Setting up a MongoDB Atlas account 47 | 48 | MongoDB offers a cloud service for their document database called Atlas. You can create a free account here: [MongoDB Atlas](https://www.mongodb.com/cloud/atlas/register). You should see something like, 49 | 50 | ![MongoDB Atlas login](images/mongodb-atlas.png) 51 | 52 | ### You need to create a project to hold the database we will be using. You should see something like, 53 | 54 | ![MongoDB Atlas Project](images/mongo-create-project.png) 55 | 56 | ### Create a new project called `Prefect-tutorial` 57 | 58 | ### We then need to create a new cluster and should see a screen like, 59 | 60 | ![MongoDB Atlas Cluster](images/mongodb-create-cluster.png) 61 | 62 | ### If you click on `Create` you will see, 63 | 64 | ![MongoDB Atlas deploy Cluster ](images/mongodb-deploy-cluster.png) 65 | 66 | - Select the `M0` free tier 67 | - Give it the name `Prefect-tutorial` 68 | - Untick `Preload sample dataset` - we will create our own data 69 | - Click `Create Deployment` 70 | - Create a database user and record your password somewhere secure as you will need this later! 
71 | 72 | ### Wait a few minutes for the cluster to be deployed. Once it is you should see a screen like, 73 | 74 | ![MongoDB Atlas deploy Cluster success](images/mongodb-success-cluster.png) 75 | 76 | ### Create a new database and collection by clicking on `Add data` -> `Create Database on Atlas`, 77 | - Call the database; `Prefect-tutorial` 78 | - Call the collection; `sentiment_airline_tweets` 79 | 80 | ![MongoDB Atlas deploy Cluster success](images/mongodb-new-collection.png) 81 | 82 | ### Setup external network access 83 | 84 | - On the left-hand menu select `Network Access` 85 | - Click `Add IP Address` -> `ALLOW ACCESS FROM ANYWHERE`. If you want this to be temporary select the toggle and choose a timeframe so that it will be available on the day of the workshop. 86 | - Click `Confirm` 87 | - On the left-hand menu, under `Deployment`, select `Database`. You should see, 88 | 89 | ![MongoDB Atlas deploy Cluster connect](images/mongodb-cluster-connect.png). 90 | 91 | - Click on `Connect` -> `Drivers`. Make sure `Python` is selected. Copy the connection string that should look something like, `mongodb+srv://<username>:<password>@prefect-tutorial.blah.blah.net/?retryWrites=true&w=majority&appName=Prefect-tutorial` 92 | - In the top-level of the local directory create a file called `.env` (or open it if it already exists) and add the following line, replacing the placeholder with your connection string: 93 | ```MONGO_URI = <your-connection-string>``` 94 | 95 | ## 5. Run a test of everything ... 96 | 97 | We have a test script to check that we have the correct libraries/modules installed and that MongoDB Atlas has been set up correctly. Make sure that you have the correct Python environment activated; e.g. if you are a conda user you have run `conda activate pydata24_prefect_c` (the environment name defined in `environment.yml`). 
You can then run the test script with: 98 | 99 | python test_script.py 100 | 101 | If eveything is working correctly you should see an output similar to the following: 102 | 103 | ``` 104 | prefect - version 3.0.1 installed 105 | kafka-python-ng - version 2.2.2 installed 106 | spacy - version 3.7.6 installed 107 | spacytextblob - version 4.0.0 installed 108 | pymongo - version 4.8.0 installed 109 | python-dotenv - version (not listed) installed 110 | Pinged your deployment. You successfully connected to MongoDB Atlas! 111 | ``` 112 | -------------------------------------------------------------------------------- /config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cadarn/PyData-Prefect-Workshop/dd8374cbacc541f7db48356e9d020917db58ad3b/config/__init__.py -------------------------------------------------------------------------------- /config/kafka_config.py: -------------------------------------------------------------------------------- 1 | KAFKA_SERVERS='localhost:9092' 2 | SASL_MECHANISM='SCRAM-SHA-256' 3 | SECURITY_PROTOCOL='SASL_SSL' 4 | SASL_PLAIN_USERNAME='USERNAME' 5 | SASL_PLAIN_PASSWORD='PASSWORD' -------------------------------------------------------------------------------- /config/mongodb_config.py: -------------------------------------------------------------------------------- 1 | MONGO_URI = "mongodb+srv://username:password@prefect-tutorial.blah.mongodb.net/?retryWrites=true&w=majority&appName=Prefect-tutorial" 2 | DB_NAME = "Prefect-tutorial" 3 | 4 | -------------------------------------------------------------------------------- /config/redis_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cadarn/PyData-Prefect-Workshop/dd8374cbacc541f7db48356e9d020917db58ad3b/config/redis_config.py -------------------------------------------------------------------------------- /docker-compose.yml: 
-------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | postgres: 5 | image: postgres:13 6 | container_name: postgres 7 | environment: 8 | - POSTGRES_USER=postgres 9 | - POSTGRES_PASSWORD=password 10 | - POSTGRES_DB=prefect_server 11 | ports: 12 | - "5432:5432" 13 | networks: 14 | - tutorial_network 15 | 16 | fastapi: 17 | build: 18 | context: ./fastapi_app 19 | dockerfile: Dockerfile 20 | container_name: fastapi 21 | ports: 22 | - "8000:8000" 23 | networks: 24 | - tutorial_network 25 | 26 | streamlit-dash: 27 | build: 28 | context: ./streamlit_app 29 | dockerfile: Dockerfile 30 | container_name: streamlit 31 | env_file: 32 | - .env 33 | ports: 34 | - "8501:8501" 35 | networks: 36 | - tutorial_network 37 | 38 | zookeeper: 39 | image: wurstmeister/zookeeper:latest 40 | ports: 41 | - "2181:2181" 42 | networks: 43 | - tutorial_network 44 | healthcheck: 45 | test: echo ruok | nc localhost 2181 | grep imok 46 | interval: 10s 47 | retries: 5 48 | start_period: 30s 49 | timeout: 5s 50 | 51 | kafka: 52 | image: wurstmeister/kafka:latest 53 | depends_on: 54 | zookeeper: 55 | condition: service_healthy 56 | ports: 57 | - "9092:9092" 58 | expose: 59 | - "9093" 60 | environment: 61 | KAFKA_ADVERTISED_LISTENERS: INSIDE://kafka:9093,OUTSIDE://localhost:9092 62 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INSIDE:PLAINTEXT,OUTSIDE:PLAINTEXT 63 | KAFKA_LISTENERS: INSIDE://0.0.0.0:9093,OUTSIDE://0.0.0.0:9092 64 | KAFKA_INTER_BROKER_LISTENER_NAME: INSIDE 65 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 66 | KAFKA_CREATE_TOPICS: "raw_airline_tweet:1:1" 67 | networks: 68 | - tutorial_network 69 | 70 | networks: 71 | tutorial_network: 72 | driver: bridge 73 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: pydata24_prefect_c 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.12 
6 | - pip==24.2 7 | - poetry>=1.8.3 8 | - httpx>=0.26.0 9 | - kafka-python-ng>=2.2.2 10 | - pymongo>=4.8.0 11 | - pandas>=2.2.0 12 | - numpy>=1.26.3,<2.0 13 | - dask>=2024.8.1 14 | - nltk=3.8.1 15 | - streamlit>=1.37.1 16 | - wordcloud>=1.9.3 17 | - plotly=5.23.0 18 | - matplotlib>=3.9.2 19 | - python-dotenv>=1.0.1 20 | - pytest>=8.3.2 21 | - ipykernel==6.29.5 22 | - pip: 23 | - prefect==3.0.1 24 | - spacy>=3.7.5 25 | - spacytextblob>=4.0.0 -------------------------------------------------------------------------------- /example_create_deployment.py: -------------------------------------------------------------------------------- 1 | from prefect import flow 2 | 3 | if __name__ == "__main__": 4 | flow.from_source( 5 | source="https://github.com/prefecthq/demos.git", 6 | entrypoint="my_gh_workflow.py:repo_info", 7 | ).deploy( 8 | name="my-first-deployment", 9 | work_pool_name="default-agent-pool", 10 | cron="*/2 * * * *", 11 | ) -------------------------------------------------------------------------------- /exercises/e01_my_first_flow.py: -------------------------------------------------------------------------------- 1 | # Example of processing some data 2 | 3 | 4 | def add(a, b): 5 | return a + b 6 | 7 | 8 | def square_num(num): 9 | return num ** 2 10 | 11 | 12 | def add_and_square(a:int = 2, b:int = 3): 13 | add_result = add(a, b) 14 | square_result = square_num(add_result) 15 | print(f"({a} + {b}) squared = {square_result}") 16 | 17 | if __name__ == "__main__": 18 | add_and_square(4, 8) 19 | -------------------------------------------------------------------------------- /exercises/e02a_sentiment_pipeline_v1.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import re 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | import spacy 8 | from spacytextblob.spacytextblob import SpacyTextBlob 9 | from nltk.stem import WordNetLemmatizer 10 | 11 | nlp = spacy.load('en_core_web_sm') 12 | 
nlp.add_pipe("spacytextblob")

wordLemmatizer = WordNetLemmatizer()

# Anchor the data directory to this file's location (<repo>/exercises/..)
# instead of the current working directory, so the script finds the CSV
# whether it is launched from the repo root or from inside `exercises/`.
DATA_ROOT = Path(__file__).resolve().parent.parent / "data"
AIRLINE_FILE = "airline_tweets.csv"


def load_airline_tweets(data_path: Path)->pd.DataFrame:
    """
    Load the airline tweets dataset from a CSV file.

    Args:
        data_path (Path): Location of the CSV file to read.

    Returns:
        pd.DataFrame: The parsed contents of the CSV file.
    """
    return pd.read_csv(data_path)


def calc_sentiment(text: str)->float:
    """
    Calculate text sentiment based on the spacytextblob polarity model.

    Args:
        text (str): The text to score.

    Returns:
        float: The polarity reported by the `spacytextblob` pipeline component.
    """
    doc = nlp(text)
    return doc._.blob.polarity

# Text preprocessing
# Defining dictionary containing all emojis with their meanings.
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad',
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed',
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink',
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

## Defining list containing all stopwords in english.
42 | stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an', 43 | 'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before', 44 | 'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do', 45 | 'does', 'doing', 'down', 'during', 'each','few', 'for', 'from', 46 | 'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here', 47 | 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 48 | 'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma', 49 | 'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once', 50 | 'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're', 51 | 's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such', 52 | 't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them', 53 | 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 54 | 'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was', 55 | 'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom', 56 | 'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre", 57 | "youve", 'your', 'yours', 'yourself', 'yourselves'] 58 | 59 | def lowercase_text(text: str) -> str: 60 | """ 61 | Convert all characters in the text to lowercase and strip leading/trailing whitespace. 62 | 63 | Args: 64 | text (str): The input text to be converted. 65 | 66 | Returns: 67 | str: The text converted to lowercase with leading/trailing whitespace removed. 68 | """ 69 | ### TO BE IMPLEMENTED PROPERLY 70 | return text 71 | 72 | 73 | def strip_url(text: str) -> str: 74 | """ 75 | Replace web addresses in the text with the placeholder 'WEBADDRESS'. 76 | 77 | Args: 78 | text (str): The input text containing URLs. 79 | 80 | Returns: 81 | str: The text with URLs replaced by 'WEBADDRESS'. 
82 | """ 83 | pattern = r'(https?://[^\s]+|www\.[^\s]+)' 84 | clean_text = re.sub(pattern, 'WEBADDRESS', text) 85 | return clean_text 86 | 87 | 88 | def strip_user(text: str) -> str: 89 | """ 90 | Replace user handles (mentions) in the text with the placeholder 'USERHANDLE'. 91 | 92 | Args: 93 | text (str): The input text containing user handles. 94 | 95 | Returns: 96 | str: The text with user handles replaced by 'USERHANDLE'. 97 | """ 98 | pattern = r'@[^\s]+' 99 | clean_text = re.sub(pattern, 'USERHANDLE', text) 100 | return clean_text 101 | 102 | 103 | def replace_emoji(text: str) -> str: 104 | """ 105 | Replace emojis in the text with their English word equivalents followed by 'EMOJI'. 106 | 107 | Args: 108 | text (str): The input text containing emojis. 109 | 110 | Returns: 111 | str: The text with emojis replaced by their English word equivalents. 112 | """ 113 | # TO BE IMPLEMENTED 114 | return text 115 | 116 | 117 | def lemmatize_text(text: str) -> str: 118 | """ 119 | Lemmatize words in the text. 120 | 121 | Args: 122 | text (str): The input text containing words to lemmatize. 123 | 124 | Returns: 125 | str: The text with words lemmatized. 126 | """ 127 | words = [wordLemmatizer.lemmatize(word) for word in text.split()] 128 | return ' '.join(words) 129 | 130 | 131 | def process_text(text: str) -> str: 132 | """ 133 | Pre-process text for sentiment analysis by converting to lowercase, stripping URLs and user handles, 134 | replacing emojis, and lemmatizing the text. 135 | 136 | Args: 137 | text (str): The input text to be processed. 138 | 139 | Returns: 140 | str: The processed text. 
141 | """ 142 | #text = lowercase_text(text) 143 | text = strip_url(text) 144 | text = strip_user(text) 145 | #text = replace_emoji(text) 146 | text = lemmatize_text(text) 147 | return text 148 | 149 | 150 | 151 | if __name__ == "__main__": 152 | # Perform some rudimentary twitter sentiment analysis 153 | tweets = load_airline_tweets(DATA_ROOT / AIRLINE_FILE) 154 | tweets.loc[:, "calc_sentiment"] = tweets.text.map(calc_sentiment) 155 | print(len(tweets)) 156 | print(tweets.columns) 157 | print(tweets.sample(1)) 158 | for idx, tweet in tweets.sample(10).iterrows(): 159 | clean_text = process_text(tweet["text"]) 160 | doc = nlp(clean_text) 161 | print(tweet["text"]) 162 | print(clean_text) 163 | print(idx, tweet["airline_sentiment"], tweet["text"], doc._.blob.polarity) 164 | print(40*"=") 165 | print(tweets.calc_sentiment.describe()) 166 | print(tweets.groupby("airline_sentiment")["calc_sentiment"].describe()) -------------------------------------------------------------------------------- /exercises/e02b_sentiment_pipeline_v2.py: -------------------------------------------------------------------------------- 1 | from prefect import task, flow, get_run_logger 2 | 3 | import re 4 | import spacy 5 | from spacytextblob.spacytextblob import SpacyTextBlob 6 | from nltk.stem import WordNetLemmatizer 7 | 8 | nlp = spacy.load('en_core_web_sm') 9 | nlp.add_pipe("spacytextblob") 10 | 11 | wordLemmatizer = WordNetLemmatizer() 12 | 13 | 14 | # Text preprocessing 15 | # Defining dictionary containing all emojis with their meanings. 
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad',
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed',
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink',
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

## Defining list containing all stopwords in english.
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're',
             's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

@task(name="Lowercase")
def lowercase_text(text: str) -> str:
    """
    Convert all characters in the text to lowercase and strip leading/trailing whitespace.

    Args:
        text (str): The input text to be converted.

    Returns:
        str: The text converted to lowercase with leading/trailing whitespace removed.
    """
    return text.lower().strip()

# <-- START HERE
# CONVERT THESE FUNCTIONS INTO PREFECT TASKS ...

def strip_url(text: str) -> str:
    """
    Replace web addresses in the text with the placeholder 'WEBADDRESS'.

    Args:
        text (str): The input text containing URLs.

    Returns:
        str: The text with URLs replaced by 'WEBADDRESS'.
    """
    # Matches either an http(s):// URL or a bare www. address, up to the next whitespace.
    pattern = r'(https?://[^\s]+|www\.[^\s]+)'
    clean_text = re.sub(pattern, 'WEBADDRESS', text)
    return clean_text


def strip_user(text: str) -> str:
    """
    Replace user handles (mentions) in the text with the placeholder 'USERHANDLE'.

    Args:
        text (str): The input text containing user handles.

    Returns:
        str: The text with user handles replaced by 'USERHANDLE'.
    """
    # An '@' followed by any run of non-whitespace characters counts as a handle.
    pattern = r'@[^\s]+'
    clean_text = re.sub(pattern, 'USERHANDLE', text)
    return clean_text


def replace_emoji(text: str) -> str:
    """
    Replace emojis in the text with their English word equivalents followed by 'EMOJI'.

    Emoticons are substituted longest-first. Several keys of `emojis` contain a
    shorter key (e.g. 'O:-)' contains ':-)' and '(:-D' contains ':-D'); replacing
    in plain dictionary order would consume the shorter one first and the longer
    entry could never match.

    Args:
        text (str): The input text containing emojis.

    Returns:
        str: The text with emojis replaced by their English word equivalents.
    """
    # sorted() is stable, so emoticons of equal length keep their dictionary order.
    for emoji in sorted(emojis, key=len, reverse=True):
        text = text.replace(emoji, f"{emojis[emoji]} EMOJI")
    return text


def lemmatize_text(text: str) -> str:
    """
    Lemmatize words in the text.

    Args:
        text (str): The input text containing words to lemmatize.

    Returns:
        str: The text with words lemmatized.
    """
    # Splitting on whitespace and re-joining with single spaces also collapses
    # any runs of whitespace in the input.
    words = [wordLemmatizer.lemmatize(word) for word in text.split()]
    return ' '.join(words)

# Now create a data processing flow using the tasks from above!
117 | 118 | def process_text(text: str) -> str: 119 | """ 120 | Pre-process text for sentiment analysis by converting to lowercase, stripping URLs and user handles, 121 | replacing emojis, and lemmatizing the text. 122 | 123 | Args: 124 | text (str): The input text to be processed. 125 | 126 | Returns: 127 | str: The processed text. 128 | """ 129 | logger = get_run_logger() 130 | logger.info("%s Cleaning text: ", text) 131 | # TO BE FILLED IN WITH RELEVANT PROCESSING STEPS 132 | # MAKE SURE YOU HAVE ADDED APPROPRIATE DECORATORS TO YOUR STEPS 133 | # DO YOU WANT TO GIVE THEM NAMES? 134 | # FEEL FREE TO EXPERIMENT 135 | logger.info(f" Clean text: {text}") 136 | return text 137 | 138 | # <- Compulsory edits END here, but inspect the functions below 139 | 140 | @task(name="Calculate Sentiment") 141 | def calc_sentiment(text: str)->float: 142 | """Calculate text sentiment based on the spacytextblob polarity model""" 143 | doc = nlp(text) 144 | return doc._.blob.polarity 145 | 146 | 147 | @flow # EDIT IF YOU WANT 148 | def sentiment_analysis(text: str) -> float: 149 | """Calculate sentiment from cleaned text""" 150 | clean_text = process_text(text) 151 | sentiment = calc_sentiment(clean_text) 152 | print(f"Input text: {text}") 153 | print(f"Sentiment score: {sentiment:0.2f}") 154 | return sentiment 155 | 156 | if __name__ == "__main__": 157 | example = """ This is the worst day ever! @AAirlines is completely screwed. 158 | :( :( - I am not a happy bunny www.worstdayever.com. 
159 | """ 160 | sentiment_analysis(example) -------------------------------------------------------------------------------- /exercises/e03a_kafka_tweet_publisher.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | 4 | import json 5 | from prefect import task, flow, get_run_logger 6 | from kafka import KafkaProducer 7 | import httpx 8 | 9 | from config.kafka_config import KAFKA_SERVERS 10 | 11 | AIRLINE_URL = "http://localhost:8000" 12 | KAFKA_TOPIC = "raw_airline_tweet" 13 | 14 | @task # Add name and description as parameters to the task 15 | def publish_to_kafka(json_messages: list[dict], kafka_topic: str): 16 | """ 17 | Publishes a list of JSON messages to the specified Kafka topic. 18 | 19 | Args: 20 | json_messages (List[Dict]): A list of messages in JSON format to be published. 21 | kafka_topic (str): The Kafka topic to publish the messages to. 22 | 23 | Returns: 24 | None 25 | """ 26 | logger = get_run_logger() 27 | logger.info(f"Publishing {len(json_messages)} messages to Kafka topic: {kafka_topic}") 28 | 29 | producer = KafkaProducer( 30 | bootstrap_servers=KAFKA_SERVERS, 31 | value_serializer=lambda v: json.dumps(v).encode('utf-8') 32 | ) 33 | 34 | for message in json_messages: 35 | producer.send(kafka_topic, value=message) 36 | producer.flush() 37 | logger.info(f"Successfully published {len(json_messages)} messages to Kafka topic: {kafka_topic}") 38 | 39 | 40 | @task(name="Fetch Airline Tweet", description="Fetches an airline tweet from the specified URL.", retries=5, retry_delay_seconds=1) 41 | def fetch_airline_tweet(url: str) -> list[dict]: 42 | """ 43 | Fetches an airline tweet from the specified URL. 44 | 45 | Args: 46 | url (str): The URL to fetch the tweet from. 47 | 48 | Returns: 49 | List[Dict]: A list containing the fetched tweet data. 
50 | """ 51 | # Create a logger object 52 | # Log to the info channel where you are retrieving the tweet from 53 | 54 | response = httpx.get(url) 55 | response.raise_for_status() 56 | data = response.json() 57 | 58 | if not isinstance(data, list): 59 | data = [data] 60 | 61 | # Log to the info channel the data that you have retrieved 62 | return data 63 | 64 | 65 | @flow # Add name and description as parameters to the flow 66 | def stream_airline_tweet_to_kafka(): 67 | """ 68 | Fetches airline tweets from the API and publishes them to the Kafka topic at regular intervals. 69 | """ 70 | logger = get_run_logger() 71 | url = f"{AIRLINE_URL}/get_tweet" 72 | logger.info(f"Starting stream from {url} to Kafka topic {KAFKA_TOPIC}") 73 | 74 | tweet = fetch_airline_tweet(url) 75 | publish_to_kafka(tweet, KAFKA_TOPIC) 76 | 77 | 78 | if __name__ == "__main__": 79 | stream_airline_tweet_to_kafka() -------------------------------------------------------------------------------- /exercises/e03b_kafka_tweet_deployment.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | 4 | import json 5 | from prefect import task, flow, get_run_logger 6 | from kafka import KafkaProducer 7 | import httpx 8 | 9 | from config.kafka_config import KAFKA_SERVERS 10 | 11 | AIRLINE_URL = "http://localhost:8000" 12 | KAFKA_TOPIC = "raw_airline_tweet" 13 | 14 | @task(name="Publish to Kafka", description="Publishes a list of JSON messages to a Kafka topic.") 15 | def publish_to_kafka(json_messages: list[dict], kafka_topic: str): 16 | """ 17 | Publishes a list of JSON messages to the specified Kafka topic. 18 | 19 | Args: 20 | json_messages (List[Dict]): A list of messages in JSON format to be published. 21 | kafka_topic (str): The Kafka topic to publish the messages to. 
22 | 23 | Returns: 24 | None 25 | """ 26 | logger = get_run_logger() 27 | logger.info(f"Publishing {len(json_messages)} messages to Kafka topic: {kafka_topic}") 28 | 29 | producer = KafkaProducer( 30 | bootstrap_servers=KAFKA_SERVERS, 31 | value_serializer=lambda v: json.dumps(v).encode('utf-8') 32 | ) 33 | 34 | for message in json_messages: 35 | producer.send(kafka_topic, value=message) 36 | producer.flush() 37 | logger.info(f"Successfully published {len(json_messages)} messages to Kafka topic: {kafka_topic}") 38 | 39 | 40 | @task(name="Fetch Airline Tweet", description="Fetches an airline tweet from the specified URL.", retries=5, retry_delay_seconds=1) 41 | def fetch_airline_tweet(url: str) -> list[dict]: 42 | """ 43 | Fetches an airline tweet from the specified URL. 44 | 45 | Args: 46 | url (str): The URL to fetch the tweet from. 47 | 48 | Returns: 49 | List[Dict]: A list containing the fetched tweet data. 50 | """ 51 | logger = get_run_logger() 52 | logger.info(f"Fetching tweet from URL: {url}") 53 | 54 | response = httpx.get(url) 55 | response.raise_for_status() 56 | data = response.json() 57 | 58 | if not isinstance(data, list): 59 | data = [data] 60 | 61 | logger.info(f"Fetched tweet data: {data}") 62 | return data 63 | 64 | 65 | @flow(name="Stream Airline Tweet to Kafka", description="Fetches airline tweets from an API and publishes them to a Kafka topic.") 66 | def stream_airline_tweet_to_kafka(tweet_url: str, kafka_topic: str): 67 | """ 68 | Fetches airline tweets from the API and publishes them to the Kafka topic at regular intervals. 
69 | """ 70 | logger = get_run_logger() 71 | logger.info(f"Starting stream from {tweet_url} to Kafka topic {kafka_topic}") 72 | 73 | tweet = fetch_airline_tweet(tweet_url) 74 | publish_to_kafka(tweet, kafka_topic) 75 | 76 | 77 | if __name__ == "__main__": 78 | # Creates a deployment and stays running to monitor for work instructions generated on the server 79 | # Edit the fields appropriately 80 | stream_airline_tweet_to_kafka.serve( 81 | # ADD THE REQUIRED PARAMETERS HERE ... 82 | ) 83 | -------------------------------------------------------------------------------- /exercises/e04_sentiment_pipeline_v3.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | 4 | import json 5 | from prefect import task, flow, get_run_logger 6 | from kafka import KafkaConsumer 7 | from pymongo import MongoClient 8 | from pymongo.server_api import ServerApi 9 | 10 | from config.kafka_config import KAFKA_SERVERS 11 | from prefect.blocks.system import Secret 12 | from config.mongodb_config import MONGO_URI, DB_NAME 13 | from e02b_sentiment_pipeline_v2 import sentiment_analysis 14 | 15 | KAFKA_TOPIC_AIRLINES = "raw_airline_tweet" 16 | 17 | # DO NOT EDIT 18 | def get_mongo_db(uri: str) -> MongoClient: 19 | """ 20 | Returns a MongoDB client connected to the specified URI. 21 | 22 | Args: 23 | uri (str): The MongoDB connection URI. 24 | 25 | Returns: 26 | MongoClient: The MongoDB client instance. 27 | """ 28 | client = MongoClient(uri, server_api=ServerApi('1')) 29 | return client 30 | 31 | # DO NOT EDIT 32 | def get_kafka_consumer(kafka_topic: str) -> KafkaConsumer: 33 | """ 34 | Returns a Kafka consumer configured for the specified topic. 35 | 36 | Args: 37 | kafka_topic (str): The Kafka topic to consume messages from. 38 | 39 | Returns: 40 | KafkaConsumer: The Kafka consumer instance. 
41 | """ 42 | consumer = KafkaConsumer( 43 | kafka_topic, 44 | bootstrap_servers=KAFKA_SERVERS, 45 | group_id="PREFECT-DEV", 46 | auto_offset_reset="earliest", 47 | value_deserializer=lambda x: json.loads(x.decode('utf-8')) 48 | ) 49 | return consumer 50 | 51 | # DO NOT EDIT 52 | @task(name="Write Message to MongoDB", description="Writes a record to MongoDB.") 53 | def write_msg_to_mongo(record: dict, client: MongoClient) -> None: 54 | """ 55 | Writes a record to MongoDB. 56 | 57 | Args: 58 | record (Dict): The record to be inserted into MongoDB. 59 | client (MongoClient): The MongoDB client instance. 60 | 61 | Returns: 62 | None 63 | """ 64 | db = client["Prefect-tutorial"] 65 | collection = db["sentiment_airline_tweets"] 66 | collection.insert_one(record) 67 | 68 | 69 | @task(name="Label Sentiment") # Add a description 70 | def label_sentiment(score: float) -> str: 71 | """ 72 | Labels sentiment based on the sentiment score. 73 | 74 | Args: 75 | score (float): The sentiment score. 76 | 77 | Returns: 78 | str: The sentiment label. 79 | """ 80 | if score > # Set your thresholds: 81 | label = "positive" 82 | elif score < #Set your thresholds: 83 | label = "negative" 84 | else: 85 | label = "neutral" 86 | print(f"Converting sentiment score = {score} to {label}") 87 | return label 88 | 89 | 90 | 91 | @task #Add any parameters you need 92 | def consume_airline_tweets(kafka_topic: str = KAFKA_TOPIC_AIRLINES): 93 | """ 94 | Consumes airline tweets from Kafka, processes them, and stores them in MongoDB. 95 | 96 | Args: 97 | kafka_topic (str): The Kafka topic to consume messages from. 
98 | """ 99 | _MONGO_URI = "" # Load the Secret from the password Block 100 | client = get_mongo_db(_MONGO_URI) 101 | consumer = get_kafka_consumer(kafka_topic) 102 | print(f"Starting to consume messages from Kafka topic: {kafka_topic}") 103 | 104 | while True: 105 | poll_result = consumer.poll(timeout_ms=5000) 106 | for _, messages in poll_result.items(): 107 | for msg in messages: 108 | sentiment_score = 0 # You need to process the tweet content field that is in the message, take a look at a message 109 | # example by using your browser at https://localhost:8000/get_tweet 110 | # tip: you need to process msg.value to get to the python dictionary 111 | sentiment_label = label_sentiment(sentiment_score) 112 | new_msg = {}# Take all of the raw message fields and add the additional sentiment data before pushing this to Mongo 113 | # new_msg needs to be a dictionary 114 | write_msg_to_mongo(new_msg, client=client) 115 | print(f"Processed message: {msg.value}") 116 | 117 | @flow(name="Monitor Airline Tweets", description="Monitors airline tweets and processes them every 30 seconds.", log_prints=True) 118 | def monitor_airline_tweets(): 119 | """ 120 | Monitors airline tweets and processes them every 30 seconds. 121 | """ 122 | consume_airline_tweets(kafka_topic=KAFKA_TOPIC_AIRLINES) 123 | 124 | 125 | if __name__ == "__main__": 126 | monitor_airline_tweets() 127 | -------------------------------------------------------------------------------- /fastapi_app/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Python 3.11 image as a base 2 | FROM python:3.11-slim 3 | 4 | # Set the working directory 5 | WORKDIR /app 6 | 7 | # Copy the requirements file 8 | COPY requirements.txt . 9 | 10 | # Install the required Python packages 11 | RUN pip install --no-cache-dir -r requirements.txt 12 | 13 | # Copy the FastAPI app files 14 | COPY main.py . 
15 | COPY data/airline_tweets.csv /app/data/airline_tweets.csv 16 | 17 | # Expose the port the app runs on 18 | EXPOSE 8000 19 | 20 | # Run the FastAPI app with uvicorn 21 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] -------------------------------------------------------------------------------- /fastapi_app/main.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import random 3 | import time 4 | from typing import Optional, Union 5 | 6 | from fastapi import FastAPI, HTTPException, Request 7 | from fastapi.responses import JSONResponse 8 | from pydantic import BaseModel 9 | import pandas as pd 10 | from fastapi.middleware.cors import CORSMiddleware 11 | from fastapi.middleware.gzip import GZipMiddleware 12 | from datetime import datetime, timedelta 13 | import logging 14 | 15 | # Set up logging 16 | logging.basicConfig(level=logging.INFO) 17 | 18 | def load_data(datapath: Path)->pd.DataFrame: 19 | df = pd.read_csv(datapath) 20 | df.loc[:, "tweet_timestamp"] = pd.to_datetime(df["tweet_created"]) 21 | columns = ["tweet_id", "airline_sentiment", "airline_sentiment_confidence", 22 | "airline", "name", "text", "retweet_count", 23 | "tweet_timestamp", "tweet_coord" 24 | ] 25 | return df[columns].fillna(-1) 26 | 27 | DATAPATH = Path("/app/data") 28 | 29 | app = FastAPI() 30 | 31 | # Rate limiting middleware 32 | last_request_time = datetime.min 33 | 34 | @app.middleware("http") 35 | async def rate_limit_middleware(request: Request, call_next): 36 | global last_request_time 37 | now = datetime.now() 38 | if now - last_request_time < timedelta(seconds=0.1): 39 | return JSONResponse(status_code=429, content={"message": "Rate limit exceeded"}) 40 | last_request_time = now 41 | response = await call_next(request) 42 | return response 43 | 44 | # CORS settings 45 | origins = ["*"] 46 | app.add_middleware(CORSMiddleware, allow_origins=origins, allow_methods=["*"], allow_headers=["*"]) 47 
| 48 | # GZIP middleware 49 | app.add_middleware(GZipMiddleware, minimum_size=1000) 50 | 51 | # Load the CSV file 52 | df = load_data(DATAPATH / 'airline_tweets.csv') 53 | num_records = len(df) 54 | current_index = 0 55 | loop_count = 1 56 | 57 | class Tweet(BaseModel): 58 | tweet_id: int 59 | airline_sentiment: str 60 | airline_sentiment_confidence: float 61 | airline: str 62 | name: str 63 | text: str 64 | retweet_count: Optional[int] 65 | tweet_timestamp: datetime 66 | tweet_coord: Optional[Union[tuple[float], float]] 67 | loop: int 68 | 69 | @app.on_event("startup") 70 | async def startup_event(): 71 | global df, num_records, current_index, loop_count 72 | df = load_data(DATAPATH / 'airline_tweets.csv') 73 | num_records = len(df) 74 | current_index = 0 75 | loop_count = 1 76 | 77 | @app.get("/get_tweet", response_model=Tweet) 78 | async def get_tweet(): 79 | global current_index, loop_count 80 | 81 | # Simulate random delay 82 | delay = random.uniform(0.01, 2) 83 | time.sleep(delay) 84 | 85 | # Simulate 25% chance of returning 404 error 86 | if random.random() < 0.25: 87 | raise HTTPException(status_code=404, detail="Tweet not found") 88 | 89 | tweet = df.iloc[current_index] 90 | current_index += 1 91 | 92 | if current_index >= num_records: 93 | current_index = 0 94 | loop_count += 1 95 | 96 | return Tweet(**tweet.to_dict(), loop=loop_count) 97 | -------------------------------------------------------------------------------- /fastapi_app/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | uvicorn 3 | pydantic 4 | pandas 5 | fastparquet -------------------------------------------------------------------------------- /images/docker-compose-down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cadarn/PyData-Prefect-Workshop/dd8374cbacc541f7db48356e9d020917db58ad3b/images/docker-compose-down.png 
-------------------------------------------------------------------------------- /images/docker-compose-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cadarn/PyData-Prefect-Workshop/dd8374cbacc541f7db48356e9d020917db58ad3b/images/docker-compose-output.png -------------------------------------------------------------------------------- /images/mongo-create-project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cadarn/PyData-Prefect-Workshop/dd8374cbacc541f7db48356e9d020917db58ad3b/images/mongo-create-project.png -------------------------------------------------------------------------------- /images/mongodb-atlas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cadarn/PyData-Prefect-Workshop/dd8374cbacc541f7db48356e9d020917db58ad3b/images/mongodb-atlas.png -------------------------------------------------------------------------------- /images/mongodb-cluster-connect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cadarn/PyData-Prefect-Workshop/dd8374cbacc541f7db48356e9d020917db58ad3b/images/mongodb-cluster-connect.png -------------------------------------------------------------------------------- /images/mongodb-create-cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cadarn/PyData-Prefect-Workshop/dd8374cbacc541f7db48356e9d020917db58ad3b/images/mongodb-create-cluster.png -------------------------------------------------------------------------------- /images/mongodb-deploy-cluster.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Cadarn/PyData-Prefect-Workshop/dd8374cbacc541f7db48356e9d020917db58ad3b/images/mongodb-deploy-cluster.png -------------------------------------------------------------------------------- /images/mongodb-new-collection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cadarn/PyData-Prefect-Workshop/dd8374cbacc541f7db48356e9d020917db58ad3b/images/mongodb-new-collection.png -------------------------------------------------------------------------------- /images/mongodb-success-cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cadarn/PyData-Prefect-Workshop/dd8374cbacc541f7db48356e9d020917db58ad3b/images/mongodb-success-cluster.png -------------------------------------------------------------------------------- /notes.md: -------------------------------------------------------------------------------- 1 | # Extra commands needed at different stages of the workshop 2 | 3 | ## Environment extra install steps 4 | Need to download `spacy` model that we will be using 5 | python -m spacy download en_core_web_sm 6 | 7 | ## Getting the Prefect server up and running 8 | ### Setup the database 9 | Make sure you have the PostgreSQL DB running using Docker Compose 10 | ``` 11 | docker-compose up --build --force-recreate --remove-orphans 12 | ``` 13 | 14 | ### Setup and launch Prefect 15 | Type the following commands on the command line with the environment you installed Prefect in active 16 | ``` 17 | prefect server database reset -y 18 | prefect config set PREFECT_API_DATABASE_CONNECTION_URL="postgresql+asyncpg://postgres:password@localhost:5432/prefect_server" 19 | prefect config set PREFECT_API_URL="http://127.0.0.1:4200/api" 20 | prefect config view --show-sources 21 | prefect server start 22 | ``` 23 | 24 | ## Secrets management 25 | ``` 26 | from prefect.blocks.system import Secret 27 | 28 | # 
Create a MongoDB URI Secret Block 29 | mongo_uri_secret = Secret(value="mongodb+srv://username:password@cluster.mongodb.net/mydatabase") 30 | 31 | # Save the block with a name for later reference 32 | mongo_uri_secret.save(name="mongo-db-uri", overwrite=True) 33 | 34 | mongo_uri = Secret.load("mongo-db-uri").get() 35 | ======== 36 | ``` 37 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "pydata-prefect-workshop" 3 | version = "0.2" 4 | description = "Packages for completing the Prefect practical workshop at PyData" 5 | authors = ["Adam Hill "] 6 | readme = "README.md" 7 | package-mode = false 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.12" 11 | prefect = "^3.0" 12 | kafka-python-ng = "^2.2.2" 13 | httpx = "^0.26.0" 14 | dask = "^2024.8.1" 15 | pymongo = "^4.8.0" 16 | nltk = "^3.8.1" 17 | ipykernel = "^6.29.5" 18 | wordcloud = "^1.9.3" 19 | streamlit = "^1.37.1" 20 | python-dotenv = "^1.0.1" 21 | plotly = "^5.23.0" 22 | matplotlib = "^3.9.2" 23 | pandas = "^2.2.2" 24 | numpy = ">=1.26,<2.0" 25 | pip = "^24.2" 26 | spacy = "^3.7.5" 27 | spacytextblob = "^4.0.0" 28 | 29 | [tool.poetry.group.dev.dependencies] 30 | black = "^24.8.0" 31 | pytest = "^8.3.2" 32 | coverage = "^7.6.1" 33 | pytest-cov = "^5.0.0" 34 | isort = "^5.13.2" 35 | 36 | [build-system] 37 | requires = ["poetry-core"] 38 | build-backend = "poetry.core.masonry.api" 39 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | prefect==3.0.1 2 | httpx==0.26.0 3 | kafka-python-ng==2.2.2 4 | pymongo==4.8.0 5 | pandas==2.2.0 6 | numpy==1.26.3 7 | dask==2024.8.1 8 | spacy==3.7.5 9 | spacytextblob==4.0.0 10 | nltk==3.8.1 11 | streamlit==1.37.0 12 | wordcloud==1.9.3 13 | plotly==5.23.0 14 | matplotlib==3.9.2 15 | 
python-dotenv==1.0.1 16 | ipykernel==6.29.5 -------------------------------------------------------------------------------- /solution/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cadarn/PyData-Prefect-Workshop/dd8374cbacc541f7db48356e9d020917db58ad3b/solution/__init__.py -------------------------------------------------------------------------------- /solution/s01_my_first_flow.py: -------------------------------------------------------------------------------- 1 | from prefect import task, flow 2 | 3 | @task(name="Addition operator") 4 | def add(a, b): 5 | return a + b 6 | 7 | @task(name="Squaring function") 8 | def square_num(num): 9 | return num ** 2 10 | 11 | @flow(log_prints=True, name="Demo 1") 12 | def add_and_square(a:int = 2, b:int = 3): 13 | add_result = add(a, b) 14 | square_result = square_num(add_result) 15 | print(f"({a} + {b}) squared = {square_result}") 16 | 17 | if __name__ == "__main__": 18 | add_and_square(4, 8) 19 | -------------------------------------------------------------------------------- /solution/s02a_sentiment_pipeline_v1.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import re 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | import spacy 8 | from spacytextblob.spacytextblob import SpacyTextBlob 9 | from nltk.stem import WordNetLemmatizer 10 | 11 | nlp = spacy.load('en_core_web_sm') 12 | nlp.add_pipe("spacytextblob") 13 | 14 | wordLemmatizer = WordNetLemmatizer() 15 | 16 | DATA_ROOT = Path("../data") 17 | AIRLINE_FILE = "airline_tweets.csv" 18 | 19 | 20 | def load_airline_tweets(data_path: Path)->pd.DataFrame: 21 | """Load data from the airline tweets dataset""" 22 | df = pd.read_csv(data_path) 23 | return df 24 | 25 | 26 | def calc_sentiment(text: str)->float: 27 | """Calculate text sentiment based on the spacytextblob polarity model""" 28 | doc = nlp(text) 29 | return 
doc._.blob.polarity 30 | 31 | # Text preprocessing 32 | # Defining dictionary containing all emojis with their meanings. 33 | emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 34 | ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised', 35 | ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 36 | ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy', 37 | '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused', 38 | '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 39 | ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'} 40 | 41 | ## Defining set containing all stopwords in english. 42 | stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an', 43 | 'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before', 44 | 'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do', 45 | 'does', 'doing', 'down', 'during', 'each','few', 'for', 'from', 46 | 'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here', 47 | 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 48 | 'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma', 49 | 'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once', 50 | 'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're', 51 | 's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such', 52 | 't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them', 53 | 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 54 | 'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was', 55 | 'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom', 56 | 'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre", 57 | "youve", 'your', 'yours', 'yourself', 'yourselves'] 58 | 59 | def lowercase_text(text: str) -> str: 60 | """ 61 | Convert all characters in 
the text to lowercase and strip leading/trailing whitespace. 62 | 63 | Args: 64 | text (str): The input text to be converted. 65 | 66 | Returns: 67 | str: The text converted to lowercase with leading/trailing whitespace removed. 68 | """ 69 | return text.lower().strip() 70 | 71 | 72 | def strip_url(text: str) -> str: 73 | """ 74 | Replace web addresses in the text with the placeholder 'WEBADDRESS'. 75 | 76 | Args: 77 | text (str): The input text containing URLs. 78 | 79 | Returns: 80 | str: The text with URLs replaced by 'WEBADDRESS'. 81 | """ 82 | pattern = r'(https?://[^\s]+|www\.[^\s]+)' 83 | clean_text = re.sub(pattern, 'WEBADDRESS', text) 84 | return clean_text 85 | 86 | 87 | def strip_user(text: str) -> str: 88 | """ 89 | Replace user handles (mentions) in the text with the placeholder 'USERHANDLE'. 90 | 91 | Args: 92 | text (str): The input text containing user handles. 93 | 94 | Returns: 95 | str: The text with user handles replaced by 'USERHANDLE'. 96 | """ 97 | pattern = r'@[^\s]+' 98 | clean_text = re.sub(pattern, 'USERHANDLE', text) 99 | return clean_text 100 | 101 | 102 | def replace_emoji(text: str) -> str: 103 | """ 104 | Replace emojis in the text with their English word equivalents followed by 'EMOJI'. 105 | 106 | Args: 107 | text (str): The input text containing emojis. 108 | 109 | Returns: 110 | str: The text with emojis replaced by their English word equivalents. 111 | """ 112 | for emoji, description in emojis.items(): 113 | text = text.replace(emoji, f"{description} EMOJI") 114 | return text 115 | 116 | 117 | def lemmatize_text(text: str) -> str: 118 | """ 119 | Lemmatize words in the text. 120 | 121 | Args: 122 | text (str): The input text containing words to lemmatize. 123 | 124 | Returns: 125 | str: The text with words lemmatized. 
126 | """ 127 | words = [wordLemmatizer.lemmatize(word) for word in text.split()] 128 | return ' '.join(words) 129 | 130 | 131 | def process_text(text: str) -> str: 132 | """ 133 | Pre-process text for sentiment analysis by converting to lowercase, stripping URLs and user handles, 134 | replacing emojis, and lemmatizing the text. 135 | 136 | Args: 137 | text (str): The input text to be processed. 138 | 139 | Returns: 140 | str: The processed text. 141 | """ 142 | text = lowercase_text(text) 143 | text = strip_url(text) 144 | text = strip_user(text) 145 | text = replace_emoji(text) 146 | text = lemmatize_text(text) 147 | return text 148 | 149 | 150 | 151 | if __name__ == "__main__": 152 | # Perform some rudimentary twitter sentiment analysis 153 | tweets = load_airline_tweets(DATA_ROOT / AIRLINE_FILE) 154 | tweets.loc[:, "calc_sentiment"] = tweets.text.map(calc_sentiment) 155 | print(len(tweets)) 156 | print(tweets.columns) 157 | print(tweets.sample(1)) 158 | for idx, tweet in tweets.sample(10).iterrows(): 159 | clean_text = process_text(tweet["text"]) 160 | doc = nlp(clean_text) 161 | print(tweet["text"]) 162 | print(clean_text) 163 | print(idx, tweet["airline_sentiment"], tweet["text"], doc._.blob.polarity) 164 | print(40*"=") 165 | print(tweets.calc_sentiment.describe()) 166 | print(tweets.groupby("airline_sentiment")["calc_sentiment"].describe()) -------------------------------------------------------------------------------- /solution/s02b_sentiment_pipeline_v2.py: -------------------------------------------------------------------------------- 1 | from prefect import task, flow, get_run_logger 2 | 3 | import re 4 | import spacy 5 | from spacytextblob.spacytextblob import SpacyTextBlob 6 | from nltk.stem import WordNetLemmatizer 7 | 8 | nlp = spacy.load('en_core_web_sm') 9 | nlp.add_pipe("spacytextblob") 10 | 11 | wordLemmatizer = WordNetLemmatizer() 12 | 13 | 14 | # Text preprocessing 15 | # Defining dictionary containing all emojis with their meanings. 
16 | emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 17 | ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised', 18 | ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 19 | ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy', 20 | '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused', 21 | '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 22 | ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'} 23 | 24 | ## Defining set containing all stopwords in english. 25 | stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an', 26 | 'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before', 27 | 'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do', 28 | 'does', 'doing', 'down', 'during', 'each','few', 'for', 'from', 29 | 'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here', 30 | 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 31 | 'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma', 32 | 'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once', 33 | 'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're', 34 | 's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such', 35 | 't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them', 36 | 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 37 | 'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was', 38 | 'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom', 39 | 'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre", 40 | "youve", 'your', 'yours', 'yourself', 'yourselves'] 41 | 42 | @task(name="Lowercase") 43 | def lowercase_text(text: str) -> str: 44 | """ 45 | Convert all characters in the text to lowercase and strip leading/trailing whitespace. 
46 | 47 | Args: 48 | text (str): The input text to be converted. 49 | 50 | Returns: 51 | str: The text converted to lowercase with leading/trailing whitespace removed. 52 | """ 53 | return text.lower().strip() 54 | 55 | @task(name="Strip URLs") 56 | def strip_url(text: str) -> str: 57 | """ 58 | Replace web addresses in the text with the placeholder 'WEBADDRESS'. 59 | 60 | Args: 61 | text (str): The input text containing URLs. 62 | 63 | Returns: 64 | str: The text with URLs replaced by 'WEBADDRESS'. 65 | """ 66 | pattern = r'(https?://[^\s]+|www\.[^\s]+)' 67 | clean_text = re.sub(pattern, 'WEBADDRESS', text) 68 | return clean_text 69 | 70 | @task(name="Strip User Handles") 71 | def strip_user(text: str) -> str: 72 | """ 73 | Replace user handles (mentions) in the text with the placeholder 'USERHANDLE'. 74 | 75 | Args: 76 | text (str): The input text containing user handles. 77 | 78 | Returns: 79 | str: The text with user handles replaced by 'USERHANDLE'. 80 | """ 81 | pattern = r'@[^\s]+' 82 | clean_text = re.sub(pattern, 'USERHANDLE', text) 83 | return clean_text 84 | 85 | @task(name="Replace Emojis") 86 | def replace_emoji(text: str) -> str: 87 | """ 88 | Replace emojis in the text with their English word equivalents followed by 'EMOJI'. 89 | 90 | Args: 91 | text (str): The input text containing emojis. 92 | 93 | Returns: 94 | str: The text with emojis replaced by their English word equivalents. 95 | """ 96 | for emoji, description in emojis.items(): 97 | text = text.replace(emoji, f"{description} EMOJI") 98 | return text 99 | 100 | @task(name="Lemmatize") 101 | def lemmatize_text(text: str) -> str: 102 | """ 103 | Lemmatize words in the text. 104 | 105 | Args: 106 | text (str): The input text containing words to lemmatize. 107 | 108 | Returns: 109 | str: The text with words lemmatized. 
110 | """ 111 | words = [wordLemmatizer.lemmatize(word) for word in text.split()] 112 | return ' '.join(words) 113 | 114 | @flow(retries=3, name="Data Preprocessing") 115 | def process_text(text: str) -> str: 116 | """ 117 | Pre-process text for sentiment analysis by converting to lowercase, stripping URLs and user handles, 118 | replacing emojis, and lemmatizing the text. 119 | 120 | Args: 121 | text (str): The input text to be processed. 122 | 123 | Returns: 124 | str: The processed text. 125 | """ 126 | logger = get_run_logger() 127 | logger.info("%s Cleaning text: ", text) 128 | text = lowercase_text(text) 129 | text = strip_url(text) 130 | text = strip_user(text) 131 | text = replace_emoji(text) 132 | text = lemmatize_text(text) 133 | logger.info(f" Clean text: {text}") 134 | return text 135 | 136 | 137 | @task(name="Calculate Sentiment") 138 | def calc_sentiment(text: str)->float: 139 | """Calculate text sentiment based on the spacytextblob polarity model""" 140 | doc = nlp(text) 141 | return doc._.blob.polarity 142 | 143 | 144 | @flow(name="Sentiment Analysis", log_prints=True) 145 | def sentiment_analysis(text: str) -> str: 146 | clean_text = process_text(text) 147 | sentiment = calc_sentiment(clean_text) 148 | print(f"Input text: {text}") 149 | print(f"Sentiment score: {sentiment:0.2f}") 150 | return sentiment 151 | 152 | if __name__ == "__main__": 153 | example = """ This is the worst day ever! @AAirlines is completely screwed. 154 | :( :( - I am not a happy bunny www.worstdayever.com. 
155 | """ 156 | sentiment_analysis(example) -------------------------------------------------------------------------------- /solution/s03a_kafka_tweet_publisher.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | 4 | import json 5 | from prefect import task, flow, get_run_logger 6 | from kafka import KafkaProducer 7 | import httpx 8 | 9 | from config.kafka_config import KAFKA_SERVERS 10 | 11 | AIRLINE_URL = "http://localhost:8000" 12 | KAFKA_TOPIC = "raw_airline_tweet" 13 | 14 | @task(name="Publish to Kafka", description="Publishes a list of JSON messages to a Kafka topic.") 15 | def publish_to_kafka(json_messages: list[dict], kafka_topic: str): 16 | """ 17 | Publishes a list of JSON messages to the specified Kafka topic. 18 | 19 | Args: 20 | json_messages (List[Dict]): A list of messages in JSON format to be published. 21 | kafka_topic (str): The Kafka topic to publish the messages to. 22 | 23 | Returns: 24 | None 25 | """ 26 | logger = get_run_logger() 27 | logger.info(f"Publishing {len(json_messages)} messages to Kafka topic: {kafka_topic}") 28 | 29 | producer = KafkaProducer( 30 | bootstrap_servers=KAFKA_SERVERS, 31 | value_serializer=lambda v: json.dumps(v).encode('utf-8') 32 | ) 33 | 34 | for message in json_messages: 35 | producer.send(kafka_topic, value=message) 36 | producer.flush() 37 | logger.info(f"Successfully published {len(json_messages)} messages to Kafka topic: {kafka_topic}") 38 | 39 | 40 | @task(name="Fetch Airline Tweet", description="Fetches an airline tweet from the specified URL.", retries=5, retry_delay_seconds=1) 41 | def fetch_airline_tweet(url: str) -> list[dict]: 42 | """ 43 | Fetches an airline tweet from the specified URL. 44 | 45 | Args: 46 | url (str): The URL to fetch the tweet from. 47 | 48 | Returns: 49 | List[Dict]: A list containing the fetched tweet data. 
@flow(name="Stream Airline Tweet to Kafka", description="Fetches airline tweets from an API and publishes them to a Kafka topic.")
def stream_airline_tweet_to_kafka():
    """
    Fetch tweet data from the airline API once and publish it to Kafka.

    The endpoint is ``AIRLINE_URL`` with ``/get_tweet`` appended and the
    destination is ``KAFKA_TOPIC``. A single call performs one fetch followed
    by one publish.
    """
    run_logger = get_run_logger()
    endpoint = AIRLINE_URL + "/get_tweet"
    run_logger.info(f"Starting stream from {endpoint} to Kafka topic {KAFKA_TOPIC}")

    # The fetch task's result is handed straight to the publish task.
    publish_to_kafka(fetch_airline_tweet(endpoint), KAFKA_TOPIC)


if __name__ == "__main__":
    stream_airline_tweet_to_kafka()
@task(name="Fetch Airline Tweet", description="Fetches an airline tweet from the specified URL.", retries=5, retry_delay_seconds=1)
def fetch_airline_tweet(url: str) -> list[dict]:
    """
    Download tweet data from ``url`` and return it as a list.

    Args:
        url (str): The URL to fetch the tweet from.

    Returns:
        List[Dict]: The decoded JSON body; a body that is not already a list
        is wrapped in a one-element list.

    Raises:
        httpx.HTTPStatusError: Raised by ``raise_for_status`` for an error response.
    """
    run_logger = get_run_logger()
    run_logger.info(f"Fetching tweet from URL: {url}")

    reply = httpx.get(url)
    reply.raise_for_status()
    payload = reply.json()

    # Normalise to a list so callers can always iterate over the result.
    tweets = payload if isinstance(payload, list) else [payload]

    run_logger.info(f"Fetched tweet data: {tweets}")
    return tweets


@flow(name="Stream Airline Tweet to Kafka", description="Fetches airline tweets from an API and publishes them to a Kafka topic.")
def stream_airline_tweet_to_kafka(tweet_url: str, kafka_topic: str):
    """
    Fetch tweet data from ``tweet_url`` once and publish it to ``kafka_topic``.

    Args:
        tweet_url (str): Endpoint passed to ``fetch_airline_tweet``.
        kafka_topic (str): Topic passed to ``publish_to_kafka``.
    """
    get_run_logger().info(f"Starting stream from {tweet_url} to Kafka topic {kafka_topic}")

    fetched = fetch_airline_tweet(tweet_url)
    publish_to_kafka(fetched, kafka_topic)


if __name__ == "__main__":
    # Creates a deployment and keeps running, waiting for work generated on the server.
    deployment_parameters = {
        "tweet_url": f"{AIRLINE_URL}/get_tweet",
        "kafka_topic": KAFKA_TOPIC,
    }
    stream_airline_tweet_to_kafka.serve(
        name="Airline Tweets Stream to Kafka",
        tags=["raw", "airline-api", "kafka"],
        parameters=deployment_parameters,
        interval=0.5,  # presumably seconds between scheduled runs; confirm against Prefect docs
    )
@task(name="Write Message to MongoDB", description="Writes a record to MongoDB.")
def write_msg_to_mongo(record: dict, client: MongoClient) -> None:
    """
    Writes a record to MongoDB.

    The record is inserted into the ``MONGO_COLLECTION`` collection of the
    ``DB_NAME`` database.

    Args:
        record (Dict): The record to be inserted into MongoDB.
        client (MongoClient): The MongoDB client instance.

    Returns:
        None
    """
    client[DB_NAME][MONGO_COLLECTION].insert_one(record)


@task(name="Label Sentiment", description="Labels sentiment based on the sentiment score.")
def label_sentiment(score: float) -> str:
    """
    Labels sentiment based on the sentiment score.

    Scores above 0.15 are "positive", scores below -0.15 are "negative" and
    everything in between (bounds included) is "neutral".

    Args:
        score (float): The sentiment score.

    Returns:
        str: The sentiment label.
    """
    if score > 0.15:
        label = "positive"
    elif score < -0.15:
        label = "negative"
    else:
        label = "neutral"
    print(f"Converting sentiment score = {score} to {label}")
    return label


@task(name="Consume Airline Tweets", description="Consumes airline tweets from Kafka, processes them, and stores them in MongoDB.")
def consume_airline_tweets(kafka_topic: str = KAFKA_TOPIC_AIRLINES):
    """
    Consumes airline tweets from Kafka, processes them, and stores them in MongoDB.

    The MongoDB URI is read from the ``mongo-db-uri`` Secret block. The task
    then polls the topic in an endless loop (5000 ms poll timeout); for every
    message it scores the ``"text"`` field, adds ``sentiment_score`` and
    ``sentiment_label`` to the message value and inserts it into MongoDB.
    The loop only ends when an exception or interruption propagates out.

    Args:
        kafka_topic (str): The Kafka topic to consume messages from.
    """
    from contextlib import closing

    mongo_uri = Secret.load("mongo-db-uri").get()

    # closing() guarantees both connections are released when the loop is left
    # through an exception or interruption; previously they were never closed.
    with closing(get_mongo_db(mongo_uri)) as client, \
            closing(get_kafka_consumer(kafka_topic)) as consumer:
        print(f"Starting to consume messages from Kafka topic: {kafka_topic}")

        while True:
            poll_result = consumer.poll(timeout_ms=5000)
            # Only the message batches are needed, not their dictionary keys.
            for messages in poll_result.values():
                for msg in messages:
                    record = msg.value
                    sentiment_score = sentiment_analysis(record.get("text", ""))
                    sentiment_label = label_sentiment(sentiment_score)
                    record.update({"sentiment_score": sentiment_score, "sentiment_label": sentiment_label})
                    write_msg_to_mongo(record, client=client)
                    print(f"Processed message: {record}")


@flow(name="Monitor Airline Tweets", description="Monitors airline tweets and processes them every 30 seconds.", log_prints=True)
def monitor_airline_tweets():
    """
    Run the ``consume_airline_tweets`` task for ``KAFKA_TOPIC_AIRLINES``.

    The task polls Kafka continuously, so this flow keeps running until the
    task raises or the run is interrupted. No 30-second schedule is set up
    in this code, despite what the flow description says.
    """
    consume_airline_tweets(kafka_topic=KAFKA_TOPIC_AIRLINES)


if __name__ == "__main__":
    monitor_airline_tweets()
25 | COPY Tweets.csv Tweets.csv 26 | 27 | # Expose the port the app runs on 28 | EXPOSE 8501 29 | 30 | # Run the Streamlit app 31 | CMD ["streamlit", "run", "app.py"] -------------------------------------------------------------------------------- /streamlit_app/Tweets.csv: -------------------------------------------------------------------------------- 1 | tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone 2 | 570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada) 3 | 570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials to the experience... tacky.,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada) 4 | 570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... 
Must mean I need to take another trip!,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada) 5 | 570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces & they have little recourse",,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada) 6 | 570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing about it,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada) 7 | 569611259357863936,neutral,1.0,,,American,,juliezeier,,0,@AmericanAir hung up on now many times trying to cal 8004337300 crazy,,2015-02-22 13:34:41 -0800,, 8 | 569611217301450752,negative,1.0,Can't Tell,0.3555,American,,raseay,,0,@AmericanAir that's quite an impressive list of fees!,,2015-02-22 13:34:31 -0800,"Oxford, MS",Central Time (US & Canada) 9 | 569611065820110848,negative,1.0,Customer Service Issue,0.6696,American,,otisday,,0,"@AmericanAir @GolfWithWoody Don't buy it, Woody. They're making it MUCH worse with understaffing, rudeness, and pre-rookie mistakes",,2015-02-22 13:33:55 -0800,Pekin,Eastern Time (US & Canada) 10 | 569610871934205954,negative,1.0,Can't Tell,0.3658,American,,MrMarioMendoza,,0,@AmericanAir you have to run the engine to troubleshoot an issue before boarding the plane!?! How about another plane? 
#aa2227 #miatoiah,,2015-02-22 13:33:09 -0800,, 11 | 569610485101768704,negative,0.6941,Flight Booking Problems,0.6941,American,,jmgscott,,0,"“@AmericanAir: @RussellsWriting Russ, please contact Reservations at 800-433-7300 for reFlight Booking Problems options.” Good luck with that!",,2015-02-22 13:31:37 -0800,"Dallas, TX",Central Time (US & Canada) 12 | 569609970007674881,negative,1.0,Cancelled Flight,1.0,American,,KyleKaplan,,0,@AmericanAir flight was Cancelled Flightled can you guys help?,,2015-02-22 13:29:34 -0800,"iPhone: 40.829401,-73.926223",Pacific Time (US & Canada) 13 | 569609887388270592,negative,1.0,Flight Booking Problems,0.363,American,,PredsFran,,0,"@AmericanAir Changed flight from BNA to 2/20 ahead of weather. Slammed me with cost diff because ""no weather advisory.""Really!",,2015-02-22 13:29:14 -0800,Tennessee,Central Time (US & Canada) 14 | 569609699152273409,negative,1.0,Cancelled Flight,1.0,American,,SraJackson,,0,"@AmericanAir my itinerary was from EWR TO DALLAS to LA. You Cancelled Flightled my flight, you have my money, find a way to get me there from EWR",,2015-02-22 13:28:29 -0800,New Jersey,Eastern Time (US & Canada) 15 | 569609590171676675,negative,1.0,Lost Luggage,1.0,American,,TheLoveBite,,0,@AmericanAir an hour at baggage carousel and still no luggage?? Nail in the coffin.,,2015-02-22 13:28:03 -0800,"Los Angeles, CA",Pacific Time (US & Canada) 16 | 569609396461936640,positive,1.0,,,American,,BrophySheen,,0,@AmericanAir Nicest people ever flight to Chicago.Thanks David Deane & Norma Sedholm for making me feel comfortable ✈️✈️✈️✈️✈️✈️✈️,,2015-02-22 13:27:17 -0800,New Jersey, 17 | 569609326253502465,negative,1.0,Customer Service Issue,0.6741,American,,BethMyn,,0,@AmericanAir Can't get thru by phone to use a credit that's about to expire! 
#frustrated #aa.com,,2015-02-22 13:27:00 -0800,, 18 | 569609163824713729,neutral,1.0,,,American,,nycgypc,,0,@AmericanAir when is next lax JFK flight today,,2015-02-22 13:26:22 -0800,, 19 | 569608733321498627,positive,1.0,,,American,,JuliaWallClarke,,0,"@AmericanAir appreciate update. Have also appreciated our pilots effort to explain to us just now. Accurate, authoritative comms is vital.","[40.6474324, -73.7936881]",2015-02-22 13:24:39 -0800,UK / Canada / etc,Atlantic Time (Canada) 20 | 569608570066436096,negative,1.0,Customer Service Issue,1.0,American,,jddowsett,,0,@AmericanAir now down to only one agent helping a stagnant line of almost 100 customers. Super.,,2015-02-22 13:24:00 -0800,"PA, NY & HI",Quito 21 | 569608446586200064,negative,1.0,Customer Service Issue,1.0,American,,sweetmel,,0,@AmericanAir The bad weather wasn't a surprise! You should have double/triple staff on hand to handle calls. Way to treat your customers.,"[32.97609561, -96.53349238]",2015-02-22 13:23:31 -0800,"Dallas, TX",Central Time (US & Canada) 22 | 569608307184242688,negative,0.7039,Bad Flight,0.3587,American,,sa_craig,,0,"@AmericanAir after all, the plane didn’t land in identical or worse) conditions at GRK according to METARs.",,2015-02-22 13:22:57 -0800,"College Station, TX",Central Time (US & Canada) 23 | 569608282442047488,negative,1.0,Cancelled Flight,1.0,American,,mycreativespark,,0,@AmericanAir flight Cancelled Flighted out of LAX for tomorrow due to connection in DFW. Help please? we can go out of orange or burbank,,2015-02-22 13:22:52 -0800,Ohio,Quito 24 | 569608265623064576,negative,1.0,Flight Attendant Complaints,1.0,American,,igclp,,0,"@AmericanAir first the pilot, then the catering...",,2015-02-22 13:22:47 -0800,, 25 | 569608257607737344,negative,0.6915,Customer Service Issue,0.3566,American,,DanielLassman,,0,".@AmericanAir Also, not the reservations team's fault. Bad top-down decision. 
I feel bad for the reservations AND social media teams.",,2015-02-22 13:22:46 -0800,"Ocean Ridge, FL",Eastern Time (US & Canada) 26 | 569608190410809344,negative,1.0,Flight Booking Problems,0.3838,American,,igclp,,0,@AmericanAir I would like a refund for this flight. They delayed the flight 5 times. I feel like you're making fun of us.,,2015-02-22 13:22:30 -0800,, 27 | 569607918569410560,negative,1.0,Customer Service Issue,1.0,American,,sweetmel,,0,@AmericanAir @TheNateK I submitted a complaint via that link in Dec. & followed up in Jan. NEVER HEARD BACK! #badcustomerservice,,2015-02-22 13:21:25 -0800,"Dallas, TX",Central Time (US & Canada) 28 | 569607706358779904,positive,1.0,,,American,,stvnoneal,,0,"@americanair Greatest Newark Gate Agents ever: David Deane, Norma Sedholm and Luz Calderon just made me feel like a king. #AmericanAirlines","[40.68621427, -74.1793971]",2015-02-22 13:20:34 -0800,NEW YORK CITY,Eastern Time (US & Canada) 29 | 569607661655715841,negative,1.0,Cancelled Flight,1.0,American,,0veranalyser,,0,"@AmericanAir flights been Cancelled Flightled, can't get through to the desk and nothing showing online under my reservation - what do I do?",,2015-02-22 13:20:23 -0800,,Perth 30 | 569607509167595520,negative,1.0,Can't Tell,0.3703,American,,andrewjh,,0,"@AmericanAir The point of the pic: If your reps struggle with the merger this much, imagine how it is for the public! http://t.co/hQdB5IRuVg",,2015-02-22 13:19:47 -0800,Capitol Hill D.C. ,Eastern Time (US & Canada) 31 | 569607321908862977,negative,0.7038,Flight Booking Problems,0.3613,American,,SraJackson,,0,@AmericanAir flight was 2488 out of EWR STOP AT DALLAS THEN TO LA. 
I need to be in la tonight!,,2015-02-22 13:19:02 -0800,New Jersey,Eastern Time (US & Canada) 32 | 569607124772196352,negative,1.0,Can't Tell,0.6598,American,,FvGrecia,,0,@AmericanAir I was flying from Ft Lauderdale FL to Seattle WA on the 02/28/2015 until the 03/03/2015 and they don't want to honor my flight.,,2015-02-22 13:18:15 -0800,, 33 | 569606926234824704,negative,1.0,Customer Service Issue,1.0,American,,galen_emery,,0,@AmericanAir wasn't able to get on hold. Your system kept kicking me to the main menu.,,2015-02-22 13:17:28 -0800,"Seattle, WA",Pacific Time (US & Canada) 34 | 569606513859305473,negative,1.0,Customer Service Issue,1.0,American,,EricStride,,0,@americanair Seattle check-in. 1 desk servicing full service line. 2 desks servicing priority. Full srvc wait 30+ minutes. Customer svc fail,,2015-02-22 13:15:50 -0800,, 35 | 569606467646578688,negative,1.0,Customer Service Issue,0.6829999999999999,American,,SraJackson,,0,@AmericanAir hey! Tried calling customer service and was told there's a 2 hour wait. This has been for the past 4 hours. Thanks! You suck!,,2015-02-22 13:15:39 -0800,New Jersey,Eastern Time (US & Canada) 36 | 569606374591746050,negative,0.6684,Lost Luggage,0.3419,American,,RafaelBertorell,,0,@AmericanAir since we are leaving tomorrow to miami can you do one thing right and deliver the bags to the airport?,,2015-02-22 13:15:17 -0800,, 37 | 569606248095576064,negative,1.0,Flight Attendant Complaints,0.3648,American,,jddowsett,,0,@AmericanAir but have been yet to receive assistance from one of your agents in securing a new connection. 
Many will now miss work tomorrow.,,2015-02-22 13:14:46 -0800,"PA, NY & HI",Quito 38 | 569606226880929792,negative,0.659,Customer Service Issue,0.659,American,,otisday,,0,@AmericanAir @cheerUPDATES So you're saying the call center is understaffed?,,2015-02-22 13:14:41 -0800,Pekin,Eastern Time (US & Canada) 39 | 569606135960858624,neutral,0.6545,,0.0,American,,kelseybiscoe,,0,@AmericanAir I've been trying to change frm AA 2401 to LAX at 6:50am MONDAY morning then AA 2586 from LAX to FAT to flight AA 1359?#helpAA,,2015-02-22 13:14:20 -0800,, 40 | 569606093393010688,neutral,1.0,,,American,,mareyes15,,0,@AmericanAir are flights from Columbus Ohio to Dallas Texas Cancelled Flighted?,,2015-02-22 13:14:10 -0800,,Eastern Time (US & Canada) 41 | 569605935280336896,negative,1.0,Flight Booking Problems,0.7115,American,,otisday,,0,"@AmericanAir @naomi_cooper Are you out of your mind, AA? What is the point of this Twitter acct? Form letters in 140 characters or less?",,2015-02-22 13:13:32 -0800,Pekin,Eastern Time (US & Canada) 42 | 569605930146353152,negative,1.0,Late Flight,1.0,American,,jddowsett,,0,@AmericanAir many have missed connections already b/c of delayed flight which will finally board soon,,2015-02-22 13:13:31 -0800,"PA, NY & HI",Quito 43 | 569605928636407808,neutral,0.6393,,0.0,American,,SmithsAreRare,,0,@AmericanAir On flt 1627 from DFW to FAT on 2/23 and it was 86'd. Can you rebook me on the 1359 flight tonight? 
Can't get through on phone.,,2015-02-22 13:13:30 -0800,L-Town,Central Time (US & Canada) 44 | 569605877512024064,negative,1.0,Customer Service Issue,0.3469,American,,BDinDallas,,0,@AmericanAir I wasn't important this morning when you would not seat my wife and I together or allow me to choose seats at Flight Booking Problems!!,,2015-02-22 13:13:18 -0800,"Dallas, Texas",Eastern Time (US & Canada) 45 | 569605699308818432,negative,1.0,Customer Service Issue,0.6475,American,,RafaelBertorell,,0,"@AmericanAir i dont believe it, it has been impossible for your agents to get an update from the delivery company since yesterday at 11 am",,2015-02-22 13:12:36 -0800,, 46 | 569605602961281024,negative,1.0,Flight Attendant Complaints,0.66,American,,jddowsett,,0,@AmericanAir yet there are plenty of available AA agents at gates nearby who say they are unable to help customers from our flight.,,2015-02-22 13:12:13 -0800,"PA, NY & HI",Quito 47 | 569605452230754305,negative,1.0,Late Flight,0.6863,American,,otisday,,0,"@AmericanAir @BDinDallas The personal touch you're known for, AA. Other cool perks: blaming understaffing on weather. And 3 hr hold times.",,2015-02-22 13:11:37 -0800,Pekin,Eastern Time (US & Canada) 48 | 569605359406485504,negative,1.0,longlines,1.0,American,,jddowsett,,0,"@AmericanAir been waiting in line for over an hour in San Antonio, barely moved & only two agents. 30 in front of me & at least 40 behind.",,2015-02-22 13:11:15 -0800,"PA, NY & HI",Quito 49 | 569605022197153792,positive,0.6593,,,American,,wesleytravis,,0,@AmericanAir my return flight is scheduled on Wednesday; AA138 I believe. Thanks for the help!,"[44.26313964, -69.78197616]",2015-02-22 13:09:54 -0800,"Webster Groves, MO", 50 | 569604994825105408,negative,1.0,Customer Service Issue,0.7094,American,,Cameratown,,0,@AmericanAir. You neglected to mention the $200 fee per ticket. 
I had a medical reason and still have had to jump thru hoops.,,2015-02-22 13:09:48 -0800,"Boston, MA",Eastern Time (US & Canada) 51 | 569604909571678208,negative,1.0,Customer Service Issue,1.0,American,,Kaywillsmith,,0,@AmericanAir ....I am on attempt #6 and hour #10 of calling and being put on hold. Not feeling confident that I will ever get through...,,2015-02-22 13:09:27 -0800,The City of Big Shoulders,Central Time (US & Canada) 52 | 569604904689340416,negative,1.0,Customer Service Issue,1.0,American,,JonGilsonAF,,0,"@AmericanAir bought a ticket on @SouthwestAir. After two days, your ""team"" couldn't share the data needed to get me to my next meeting.",,2015-02-22 13:09:26 -0800,"Boston, MA", 53 | 569604777962643457,negative,1.0,Late Flight,1.0,American,,erina_jones,,0,@AmericanAir thanks for the response. I know it's not your fault... But Im in ORD in T5 and hungry if you want to stop by ✈️✌️,,2015-02-22 13:08:56 -0800,London, 54 | 569604674715693057,negative,0.6711,Flight Booking Problems,0.3479,American,,LIGal19,,0,@AmericanAir cut it. Put me on a flt tomorrow.,,2015-02-22 13:08:31 -0800,"ÜT: 40.881241,-73.107717",Quito 55 | 569604659712811009,negative,1.0,Customer Service Issue,1.0,American,,JWinakor,,0,@AmericanAir nah u boofin u dont talk like any humans i know. Respond like u actually have a brain. And again i dont want customer relations,,2015-02-22 13:08:28 -0800,, 56 | 569604466757865472,negative,1.0,Cancelled Flight,0.5057,American,,LIGal19,,0,@AmericanAir I have no transportation to and going to some other city and getting to NY sometime on Wed?! I'm sorry that's not gonna cut,,2015-02-22 13:07:42 -0800,"ÜT: 40.881241,-73.107717",Quito 57 | 569604453462110208,negative,1.0,Lost Luggage,0.6814,American,,agirlnamedfrank,,0,"@AmericanAir because of you, I am doing the one thing I tried to avoid. 
Thank you for sending me to baggage claim.",,2015-02-22 13:07:39 -0800,Brooklyn,Eastern Time (US & Canada) 58 | 569604425238454273,negative,1.0,Flight Booking Problems,0.5561,American,,LIGal19,,0,@AmericanAir my flt is at 7a tom. I have now rec'd notification that I'm going out from some other airport,,2015-02-22 13:07:32 -0800,"ÜT: 40.881241,-73.107717",Quito 59 | 569604400479649792,neutral,0.6613,,0.0,American,,Kaywillsmith,,0,@AmericanAir how does one book a ticket online and put it on hold? Does that require that I pay for the ticket?,,2015-02-22 13:07:26 -0800,The City of Big Shoulders,Central Time (US & Canada) 60 | 569604354703020032,negative,1.0,Customer Service Issue,1.0,American,,csb2107,,0,@AmericanAir can you help me with a reservation? cant get through on the phone,,2015-02-22 13:07:15 -0800,, 61 | 569604330229092352,negative,1.0,Customer Service Issue,0.6188,American,,CavaTed,,0,@AmericanAir ask the 10 people you left behind at Miami airport because you guys could not wait 5 minutes and ... http://t.co/XPM98Igqjn,,2015-02-22 13:07:09 -0800,Washington DC,Eastern Time (US & Canada) 62 | 569604328153083904,negative,1.0,Late Flight,0.6849,American,,farazq,,1,"@AmericanAir can you do anything to get #AA953 moving? Been almost 24 hrs and hundreds at jfk upset, tired and want to get to BA.","[40.80718573, -73.95477259]",2015-02-22 13:07:09 -0800,"New York, NY",Quito 63 | 569604243226652672,negative,1.0,Can't Tell,0.6826,American,,Texomagal5,,0,@AmericanAir more of the insane treatment by your customers,"[38.5369071, -106.9349375]",2015-02-22 13:06:48 -0800,,Eastern Time (US & Canada) 64 | 569604176960827392,negative,0.705,Customer Service Issue,0.705,American,,gorbell,,0,@americanair Help! 
I need to speak to a live agent before I lose my online reservation being held.,,2015-02-22 13:06:33 -0800,"Santa Maria, Califoria",Pacific Time (US & Canada) 65 | 569604173236477952,negative,1.0,Customer Service Issue,0.7159,American,,otisday,,0,"@AmericanAir @_emmaclifford No. At JFK you sort of have to guess things out. It's a small airport, so whatever.",,2015-02-22 13:06:32 -0800,Pekin,Eastern Time (US & Canada) 66 | 569604083507556353,negative,1.0,Flight Attendant Complaints,0.6766,American,,Texomagal5,,0,@AmericanAir is this how you let your employees treat your loyal customers? #attackingbabymomma #crazinessintherockies,"[38.538038, -106.9370467]",2015-02-22 13:06:10 -0800,,Eastern Time (US & Canada) 67 | 569603991551782913,positive,0.6401,,,American,,HybridMovementC,,0,@AmericanAir Mad love http://t.co/4ojrSDWPkK NYC-,,2015-02-22 13:05:48 -0800,"everywhere, all the time.", 68 | 569603962233622528,positive,0.3646,,0.0,American,,agirlnamedfrank,,0,"@AmericanAir thanks for forcing me to check -in my carry - on luggage. That is exactly why I spent extra money on ""travel size"" toiletries",,2015-02-22 13:05:41 -0800,Brooklyn,Eastern Time (US & Canada) 69 | 569603590349848576,negative,1.0,Can't Tell,0.6431,American,,otisday,,0,"@AmericanAir @ejacqui If you updated the screens, then people would know the Late Flightst info, good or bad. And that wouldn't work because...",,2015-02-22 13:04:13 -0800,Pekin,Eastern Time (US & Canada) 70 | 569603156927246336,negative,1.0,Flight Booking Problems,0.6964,American,,JasonShaw2,,2,@AmericanAir missing a full days of work thanks guys,,2015-02-22 13:02:29 -0800,Belleville,Eastern Time (US & Canada) 71 | 569603127223066624,negative,1.0,Customer Service Issue,1.0,American,,rebeccabw,,0,@AmericanAir Trying to find out if flight #340 DFW to HOU is on tonight. 
Exec platinum desk not calling back and online info not clear.,,2015-02-22 13:02:22 -0800,"Dallas, Texas",Central Time (US & Canada) 72 | 569602798867881985,negative,1.0,Customer Service Issue,0.6531,American,,DanielLassman,,0,.@AmericanAir I can't even get on the phone with your reservations team. The system automatically disconnects us.,,2015-02-22 13:01:04 -0800,"Ocean Ridge, FL",Eastern Time (US & Canada) 73 | 569602682505138176,negative,1.0,Flight Booking Problems,0.6538,American,,nubzjubz,,0,@AmericanAir Flight Cancelled Flighted and rebooked but agent made a mistake and booked wrong date! Been trying to get through via phone for hours!!,,2015-02-22 13:00:36 -0800,,Eastern Time (US & Canada) 74 | 569602563261276160,negative,1.0,Customer Service Issue,0.6589,American,,otisday,,0,"@AmericanAir @russelneiss No, American. This is simple. How about YOU call HER back with the info for her new flight. Basic customer service",,2015-02-22 13:00:08 -0800,Pekin,Eastern Time (US & Canada) 75 | 569602295224246272,negative,1.0,Lost Luggage,1.0,American,,GMFujarski,,0,@AmericanAir I have looked and I was told it was in Guatemala last night. No record of anywhere today. I need my clothing for work tomorrow.,,2015-02-22 12:59:04 -0800,, 76 | 569602069335838720,negative,1.0,Customer Service Issue,1.0,American,,otisday,,0,"@AmericanAir Cool, a canned response. Nah, I think I'll keep tweeting at everyone who mentions AA on Twitter. You could DM me, though.",,2015-02-22 12:58:10 -0800,Pekin,Eastern Time (US & Canada) 77 | 569602057285574656,negative,1.0,Flight Booking Problems,0.6626,American,,jweslo,,0,@AmericanAir please fix your mobile and desktop site to allow Canadians to select a passenger when checking in. Thanks.,,2015-02-22 12:58:07 -0800,Brantford, 78 | 569601934853873664,negative,1.0,Customer Service Issue,1.0,American,,rakugojon,,0,@AmericanAir tried ringing but told me to try again Late Flightr. 
I'm supposed to be leaving in an hour...,,2015-02-22 12:57:38 -0800,San Francisco,London 79 | 569601869598867456,positive,1.0,,,American,,mattlantz,,0,@AmericanAir You guys did an amazing job today! Know it’s hard; thanks to Kate Appleton for all her hard work reFlight Booking Problems my friends and me!,,2015-02-22 12:57:23 -0800,"Tyler, Texas",Central Time (US & Canada) 80 | 569601519663906819,negative,1.0,Late Flight,0.6925,American,,tj_carlson,,0,.@AmericanAir cover a rental car or refund our ticket? - I'll gladly drive 10 hrs home than be stuck here till Tues!,,2015-02-22 12:55:59 -0800,"Brookings, SD",Central Time (US & Canada) 81 | 569601363799359488,negative,1.0,Flight Attendant Complaints,1.0,American,,stevereasnors,,0,@AmericanAir should reconsider #usairways acquisition. Flight 1843 AA gold flyers insulted by attendant for hanging jacket!,,2015-02-22 12:55:22 -0800,Los Angeles,Pacific Time (US & Canada) 82 | 569601354118905859,negative,1.0,Can't Tell,0.3606,American,,ronrisman,,0,@AmericanAir You neglected to mention the $200 fee per ticket. I had a medical reason and still have had to jump thru hoops.,,2015-02-22 12:55:20 -0800,New England,Central Time (US & Canada) 83 | 569601337882755073,neutral,1.0,,,American,,elizabethlawley,,0,@AmericanAir yes and I would like a refund.,,2015-02-22 12:55:16 -0800,Chicago,Central Time (US & Canada) 84 | 569601332040089600,neutral,1.0,,,American,,SamuelWatkin,,0,@AmericanAir how realistic is it to make an 80 minute domestic to international transfer at JFK for a non US citizen?,,2015-02-22 12:55:14 -0800,"Maidstone, UK",London 85 | 569601094239825920,positive,1.0,,,American,,EricRoberts,,1,Eliza & I cheated on u @AmericanAir with @AirTahitiNui & it was a lovely flight. But we'll be back! 
Lots!,,2015-02-22 12:54:18 -0800,"Los Angeles, CA",Pacific Time (US & Canada) 86 | 569600985007558656,positive,0.6868,,,American,,IoanGil,,0,@AmericanAir I hope you like the photo :) http://t.co/p7fSLuxEGW,,2015-02-22 12:53:52 -0800,,London 87 | 569600927394631680,negative,1.0,Customer Service Issue,0.7282,American,,otisday,,0,"@AmericanAir @sweetmel If weather is bad, wouldn't your folks try extra hard to communicate...or load bags onto flight 1320...or...anything?",,2015-02-22 12:53:38 -0800,Pekin,Eastern Time (US & Canada) 88 | 569600780048560128,negative,1.0,Bad Flight,0.3688,American,,sa_craig,,0,"@AmericanAir and how is this not a mechanical issue? All evidence points to the idea that it’s the ILS at CLL at fault, not the weather.",,2015-02-22 12:53:03 -0800,"College Station, TX",Central Time (US & Canada) 89 | 569600694388445184,negative,1.0,Cancelled Flight,0.6773,American,,otisday,,0,"@AmericanAir @CAexhibitions Sorry in, like, a general way? Or sorry that incompetence/understaffing is compounding weather issues?",,2015-02-22 12:52:42 -0800,Pekin,Eastern Time (US & Canada) 90 | 569600599588630529,neutral,1.0,,,American,,SentieriMelinda,,0,@AmericanAir please help us get home tomorrow!!!!!,,2015-02-22 12:52:20 -0800,, 91 | 569600462661554177,negative,1.0,Customer Service Issue,0.6887,American,,otisday,,0,"@AmericanAir @tennetexan Too bad there's only, like, 3 people on that team, then. Because this is next level unreal.",,2015-02-22 12:51:47 -0800,Pekin,Eastern Time (US & Canada) 92 | 569600295833116672,negative,1.0,Customer Service Issue,1.0,American,,otisday,,0,"@AmericanAir @marxsterbcow How is this a real life response to a customer service issue? Seriously. Keep trying to call us. 
Eh, might work.",,2015-02-22 12:51:07 -0800,Pekin,Eastern Time (US & Canada) 93 | 569600291290656768,negative,1.0,Bad Flight,0.6709999999999999,American,,shmitty03,,0,@AmericanAir fix the engine of flight AA3031so I don't spend all night in your airport so I can fly home tomorrow,"[33.64199395, -84.44238523]",2015-02-22 12:51:06 -0800,"Long Island, New York ",Quito 94 | 569600137296633856,positive,1.0,,,American,,douglaskgordon,,0,@AmericanAir Thank you.....you do the same!!,,2015-02-22 12:50:30 -0800,"Caribbean, New York and Miami.",Indiana (East) 95 | 569599978722746368,negative,1.0,Customer Service Issue,0.361,American,,otisday,,0,@AmericanAir @ActingOutMgmnt Just make sure they remember to load the bags onto the plane. They Cancelled Flight the flight when they forget...,,2015-02-22 12:49:52 -0800,Pekin,Eastern Time (US & Canada) 96 | 569599379893575680,negative,1.0,Customer Service Issue,0.6599,American,,otisday,,0,"@AmericanAir @brewcrewfan8 Is this real life, AA? Like, 2015 real life? You should be emailing/calling/DMing people you've inconvenienced.",,2015-02-22 12:47:29 -0800,Pekin,Eastern Time (US & Canada) 97 | 569599312671301633,negative,0.6474,Customer Service Issue,0.6474,American,,KeelyPerez,,0,@AmericanAir hey there me again from yesterday im still on hold,,2015-02-22 12:47:13 -0800,NEONgarden,Quito 98 | 569599269189107712,negative,1.0,Customer Service Issue,1.0,American,,tennetexan,,0,@AmericanAir weather is unavoidable. Understaffing is controllable,,2015-02-22 12:47:03 -0800,"Pilot Point, Republic of Texas", 99 | 569599116403036160,negative,1.0,Late Flight,0.6424,American,,skogsbergh,,0,"@AmericanAir Not only was 5418 Late Flight, but we've been boarded and waiting for over 30min. 
WTF?","[32.90771571, -97.04233322]",2015-02-22 12:46:26 -0800,Lake Arrowhead, 100 | 569598991463268352,negative,0.7257,Flight Attendant Complaints,0.3919,American,,otisday,,0,@AmericanAir @yvonneokaka When do I get my personal response and apology for your crew's having forgotten to load baggage onto my flight?,,2015-02-22 12:45:56 -0800,Pekin,Eastern Time (US & Canada) 101 | 569598942821744640,negative,0.6426,Cancelled Flight,0.6426,American,,KirstKV,,0,@AmericanAir All flts to JFK Cancelled Flightled Thx to UR agent at SFO Im rebooked on UA. Didn't get name. She was awesome! #twitterhug #shesaidrun,,2015-02-22 12:45:45 -0800,,Central Time (US & Canada) 102 | 569598671420919808,negative,1.0,Customer Service Issue,0.6618,American,,jenrpblcn,,0,@AmericanAir I know. After an hour I got a live person. It messes up our arrival and car plans at two airports and is costing us more.,,2015-02-22 12:44:40 -0800,"Seattle, WA",Arizona 103 | 569598544379817984,neutral,0.6917,,0.0,American,,csb2107,,0,@AmericanAir I can't get ahold of aadvantage reservations. I need to ticket a reservation that Cancelled Flights soon. can you help?,,2015-02-22 12:44:10 -0800,, 104 | 569598406181695488,negative,1.0,Customer Service Issue,1.0,American,,jadedhippie09,,0,"@AmericanAir i was also told by agents my issues ""aren't their prob"" K fine. I get it. But have some compassion 4 others dealing w/this!!!",,2015-02-22 12:43:37 -0800,, 105 | 569598314825555968,negative,1.0,Flight Attendant Complaints,0.6282,American,,sigmanu56,,0,@AmericanAir What's the status at DFW? Ticket agent at gate A9 was very rude and unhelpful.,,2015-02-22 12:43:15 -0800,"Natchitoches, LA", 106 | 569598155999674368,negative,1.0,Flight Attendant Complaints,0.3555,American,,jadedhippie09,,0,@AmericanAir i was spoken 2 like I'm an idiot and that is not OK!! 
I don't need to deal w/ that esp after the travel experience I've had,,2015-02-22 12:42:37 -0800,, 107 | 569597962659110913,neutral,0.7241,,0.0,American,,kelseybiscoe,,0,@AmericanAir I am trying to switch my flight to AA 1359 I am currently on AA 2401 at 6:50am MONDAY morn then AA 2586! Help Me!!,,2015-02-22 12:41:51 -0800,, 108 | 569597885446270976,negative,1.0,Customer Service Issue,0.6379,American,,cataattack,,0,@AmericanAir I really hope it departs. They said is because the catering service wasnt available but we can see it next to plane doing nthin,,2015-02-22 12:41:33 -0800,"Buenos Aires, Argentina",Buenos Aires 109 | 569597742693154816,negative,1.0,Customer Service Issue,0.6733,American,,jadedhippie09,,0,@AmericanAir I understand weather is not your fault but ur cs reps are atrocious. I am NOT happy nor will I EVR fly w/ u again.,,2015-02-22 12:40:59 -0800,, 110 | 569597666025345024,negative,1.0,Customer Service Issue,0.3482,American,,bshelton68,,0,"@AmericanAir When I left Orlando, I was 2nd in line for standby. I land and I'm 4th. 'Priority members get first available seats'. Awesome.",,2015-02-22 12:40:40 -0800,Everett,Arizona 111 | 569597220871282690,negative,1.0,Customer Service Issue,0.6888,American,,sweetmel,,0,@AmericanAir You didn't respond to my DM. You tweeted the same canned tweet you're telling everyone else.,,2015-02-22 12:38:54 -0800,"Dallas, TX",Central Time (US & Canada) 112 | 569597072350978048,negative,1.0,Customer Service Issue,1.0,American,,sweetmel,,0,@AmericanAir I DMed you my AA & phone #s & you can't have someone call me? What was the point of your response? You didn't resolve anything!,,2015-02-22 12:38:19 -0800,"Dallas, TX",Central Time (US & Canada) 113 | 569596793941401601,negative,1.0,Customer Service Issue,0.6521,American,,rdaniel10,,0,@AmericanAir don't you guys have an email address? 
Just put me on the next available flight from ohare,,2015-02-22 12:37:12 -0800,,Eastern Time (US & Canada) 114 | 569596652622721024,negative,1.0,Late Flight,1.0,American,,billyrobbinscsp,,0,@AmericanAir But Eagle is always Late Flight,,2015-02-22 12:36:39 -0800,,Central Time (US & Canada) 115 | 569596515510956032,negative,0.6694,Customer Service Issue,0.3451,American,,angiedrich,,0,@AmericanAir I have never on all my trips on any airline ever nat'l or int'l ever experienced anything like this!,"[33.93939612, -118.38973148]",2015-02-22 12:36:06 -0800,"Santa Monica, CA",Pacific Time (US & Canada) 116 | 569596502558920707,negative,1.0,Customer Service Issue,1.0,American,,MeereeneseKnot,,0,"@AmericanAir your call center won't let me wait on hold, which I would happily do. Am I seriously supposed to just keep calling? Not great",,2015-02-22 12:36:03 -0800,, 117 | 569596420761604096,negative,1.0,Late Flight,1.0,American,,billyrobbinscsp,,0,@AmericanAir half hour Late Flight leaving DFW...no attempt at an explanation,,2015-02-22 12:35:43 -0800,,Central Time (US & Canada) 118 | 569596325076955137,negative,0.6667,Cancelled Flight,0.6667,American,,nicoeats,,0,@AmericanAir I'm on flight 1027 tomorrow that got Cancelled Flightled. Need to find an alternative to get to Dallas. Please help.,,2015-02-22 12:35:21 -0800,"Tokyo, Japan",Eastern Time (US & Canada) 119 | 569596156927303681,negative,1.0,Late Flight,0.6742,American,,Diane_Lowery,,0,"@AmericanAir 30 minutes flight from OKC and then make us wait, 30 minutes cause the gate isn't empty. #epicfail #poorplanning",,2015-02-22 12:34:41 -0800,Cool suburb N. of Dallas,Central Time (US & Canada) 120 | 569595899204255745,negative,1.0,Customer Service Issue,0.6234,American,,BaburRaja,,0,"@AmericanAir seems like queue times are very high, in Q waiting for an agent 4 almost an hour.Flight got Cancelled Flightled. 
http://t.co/sDm2wvR3zr",,2015-02-22 12:33:39 -0800,"Chelmsford, MA", 121 | 569595791355981825,negative,1.0,Can't Tell,0.6684,American,,CakeNDeath,,0,"@AmericanAir Nah, just horrible dining options outside of club. Luckily Manuel in the Admirals can make a Bloody Mary.",,2015-02-22 12:33:13 -0800,Old City Philly, 122 | 569595643087486976,neutral,0.6616,,0.0,American,,coffeeculture,,0,@AmericanAir any earlier flights SAP->Mia & Mia -> New York (lga) on 03.03. I'm currently booked on flights 1504 and 1102.,,2015-02-22 12:32:38 -0800,"Dublin, Ireland",Dublin 123 | 569595595754639360,negative,1.0,Customer Service Issue,0.648,American,,SchrierCar,,0,"@AmericanAir no hold times, just disconnections. There is no excuse for that",,2015-02-22 12:32:27 -0800,, 124 | 569595399557685248,negative,1.0,Customer Service Issue,0.6579,American,,rcshore,,0,".@AmericanAir nice 2 know. I paid 4 a seat. Then you sold my seat. now I bought a 3rd seat. It's a good scam, but a scam all the same.",,2015-02-22 12:31:40 -0800,"Washington, DC",Quito 125 | 569595333899997185,negative,0.6384,Customer Service Issue,0.6384,American,,jkordyback,,0,@AmericanAir I’ll play it by ear. I know that you are doing your best. Buy some chewey oatmeal cookies for your customer care folks.,,2015-02-22 12:31:24 -0800,"North Saanich, BC",Pacific Time (US & Canada) 126 | 569595309279440896,neutral,0.6767,,0.0,American,,RafaANieves,,0,"@AmericanAir if business class if full but 1st class empty, do you guys upgrade EXP members to 1st?",,2015-02-22 12:31:18 -0800,,Quito 127 | 569594855556452352,negative,1.0,Customer Service Issue,1.0,American,,jmgscott,,0,@AmericanAir I've been trying for 4 hours to get hold of someone.,,2015-02-22 12:29:30 -0800,"Dallas, TX",Central Time (US & Canada) 128 | 569594712337854464,positive,0.6566,,0.0,American,,Ag03Recruiter,,0,@AmericanAir thank you for quick responses. #aa usually has fantastic customer service. 
That's why I was so shocked when it wasn't there,,2015-02-22 12:28:56 -0800,"Fort Worth, TX",Central Time (US & Canada) 129 | 569594402114392064,negative,0.7021,Late Flight,0.7021,American,,JasonPeppel,,0,@AmericanAir what are my chances of making a connection to El Paso (AA504) with DFW from SAT (AA200) delayed 30 minutes?,,2015-02-22 12:27:42 -0800,, 130 | 569594123222585344,negative,1.0,Customer Service Issue,1.0,American,,ColourBasis,,0,"@AmericanAir beyond frustrated with no call back from auto hold or whatever you call it. Entered my number at 11:30 CST, still no call 2:26",,2015-02-22 12:26:36 -0800,"National, based in Texas",Central Time (US & Canada) 131 | 569594026732740609,negative,1.0,Customer Service Issue,0.6275,American,,TomGasparetti,,0,@AmericanAir ahhhh your silence is golden now. This tops it all. Anyone get fired?,,2015-02-22 12:26:13 -0800,, 132 | 569593810390425600,negative,0.7094,Customer Service Issue,0.7094,American,,guernicaguy,,0,@AmericanAir Hey AA - can you help with an itinerary for a plat custy? Stuck in PVR and phones aren't working,,2015-02-22 12:25:21 -0800,, 133 | 569593694963310593,negative,0.6541,Customer Service Issue,0.6541,American,,otisday,,0,@AmericanAir @ShannonBloom Where's my DM? Where's my voucher? 
Who's paying my $70 cab and my $50 car back to JFK tomorrow?,,2015-02-22 12:24:54 -0800,Pekin,Eastern Time (US & Canada) 134 | 569593346223579137,negative,1.0,Customer Service Issue,1.0,American,,catiekate,,0,@AmericanAir we've been on hold for hours.,"[35.22534456, -106.57241352]",2015-02-22 12:23:30 -0800,, 135 | 569593278636675072,negative,1.0,Late Flight,1.0,American,,otisday,,0,@AmericanAir @Stone9956 Do you dislike delays when they're caused by YOUR crew forgetting to load bags & lazy pilot wanting duty day to end?,,2015-02-22 12:23:14 -0800,Pekin,Eastern Time (US & Canada) 136 | 569593050235736064,neutral,1.0,,,American,,WishUpon_26,,0,@AmericanAir can you guys help me please?,,2015-02-22 12:22:20 -0800,KY,Eastern Time (US & Canada) 137 | 569593045777321985,negative,1.0,Cancelled Flight,0.3704,American,,otisday,,0,@AmericanAir @travisamex It's not the weather. It's also gross incompetence. Understaffing. Crew forgetting to load bags. Don't duck truth.,,2015-02-22 12:22:19 -0800,Pekin,Eastern Time (US & Canada) 138 | 569592981742878721,neutral,0.6521,,,American,,EricRoberts,,0,http://t.co/EIw2sYb8Fu roberts&s=1 @AmericanAir,,2015-02-22 12:22:04 -0800,"Los Angeles, CA",Pacific Time (US & Canada) 139 | 569592830307508224,negative,1.0,Late Flight,0.7123,American,,Jess_JCW,,0,@AmericanAir .....and they waited 5 hours in a stuffy plane until they could get off then 7 more hrs to get their luggage #AmericanAirlines,,2015-02-22 12:21:27 -0800,London,London 140 | 569592778872606720,neutral,1.0,,,American,,WishUpon_26,,0,@AmericanAir do you guys have another flight for today that you can book me on from laguardia to louisville ky?,,2015-02-22 12:21:15 -0800,KY,Eastern Time (US & Canada) 141 | 569592674400907264,negative,0.6788,Cancelled Flight,0.6788,American,,SarahM0en,,0,@AmericanAir my flight out of TYR tomorrow was Cancelled Flighted due to weather. 
How long until a rebook?,,2015-02-22 12:20:50 -0800,Nashville TN,Central Time (US & Canada) 142 | 569592590632247297,negative,0.644,Cancelled Flight,0.644,American,,sbrandongage,,0,@AmericanAir Has AA Flight 296 from San Antonio to Dallas been Cancelled Flighted?,,2015-02-22 12:20:30 -0800,"Pueblo, CO",Eastern Time (US & Canada) 143 | 569592447455465472,negative,1.0,Flight Booking Problems,0.6643,American,,totestoked,,0,@AmericanAir trying to book a flight on hold- can't get through to a representative on the phone- Advice?,,2015-02-22 12:19:56 -0800,"Naperville, IL ",Eastern Time (US & Canada) 144 | 569592402085847041,negative,1.0,Cancelled Flight,1.0,American,,thepin618,,0,@AmericanAir Cancelled Flights flights arbitrarily on same itinerary. Weekend ruined for no good reason! No crew = missed Monday am mtg.,,2015-02-22 12:19:45 -0800,, 145 | 569592270866878464,neutral,1.0,,,American,,WishUpon_26,,0,@AmericanAir i need someone to help me out,,2015-02-22 12:19:14 -0800,KY,Eastern Time (US & Canada) 146 | 569592177312923650,negative,1.0,Cancelled Flight,1.0,American,,WishUpon_26,,0,@AmericanAir my flight was Cancelled Flightled from Laguardia to Louisville Ky and i am stuck at the airport. Do you guys compensate for this?,,2015-02-22 12:18:52 -0800,KY,Eastern Time (US & Canada) 147 | 569592148338876416,negative,1.0,Flight Attendant Complaints,0.708,American,,Jess_JCW,,0,"@AmericanAir & if that wasn't enough, your staff have been so rude & ignored passengers,don't think that should be accepted whatever reason",,2015-02-22 12:18:45 -0800,London,London 148 | 569591765793165312,negative,1.0,longlines,0.6788,American,,Jess_JCW,,0,@AmericanAir I understand the weather issue but you can't expect passengers to wait 24 hours inside airports for whatever reason. Outrageous,,2015-02-22 12:17:14 -0800,London,London 149 | 569591730506371072,neutral,1.0,,,American,,TrueChief77,,0,"@AmericanAir guarantee no retribution? 
If so, I'd be glad to share.",,2015-02-22 12:17:05 -0800,970 Colorado, 150 | 569591700416393216,negative,1.0,Cancelled Flight,0.6333,American,,tcunningham10,,0,@AmericanAir a friend is having flight Cancelled Flightlations out of LAX to CMH on Feb 23. Anyway to help her? 800 number has been no help,"[40.46692522, -82.64567078]",2015-02-22 12:16:58 -0800,Central Ohio,Eastern Time (US & Canada) 151 | 569591653121597440,negative,1.0,Customer Service Issue,0.7255,American,,kiabeveridge,,0,"@AmericanAir I used the ""call back"" feature with an operator regarding my flight, got a call 2 hours Late Flightr and got hung up on. #pleasehelp",,2015-02-22 12:16:47 -0800,Chicago,Mountain Time (US & Canada) 152 | 569591540944756737,negative,1.0,Customer Service Issue,1.0,American,,GregPoos,,0,"@AmericanAir I need to be at work tomorrow at 8am, therefore that doesn't help. Direct message faster than calling 800 number? #Backwards",,2015-02-22 12:16:20 -0800,, 153 | 569591533617307648,negative,1.0,Cancelled Flight,1.0,American,,tim_sheehy,,0,@AmericanAir ugh Dump us in dfw w/no luggage then Cancelled Flight our flight 3 more times. Sat arrival now Tue?,,2015-02-22 12:16:18 -0800,Washington DC,Central Time (US & Canada) 154 | 569591393540288512,negative,1.0,Cancelled Flight,1.0,American,,TheJoshAbides,,0,"@AmericanAir Cancelled Flights my flight, doesn't send an email, text or call. Then puts me on way earlier flight I might miss now. Thanks AA!",,2015-02-22 12:15:45 -0800,New York City,Eastern Time (US & Canada) 155 | 569591285150908416,positive,1.0,,,American,,iambmac,,0,@AmericanAir DMing you now! Big thanks.,,2015-02-22 12:15:19 -0800,"Columbus, OH, USA",Eastern Time (US & Canada) 156 | 569591136534319105,negative,1.0,Bad Flight,0.6774,American,,A_for_AdNauseam,,0,@AmericanAir 3078 is overweight so you pull 2 dozen passengers off? Why not luggage? 
Seriously?,,2015-02-22 12:14:44 -0800,, 157 | 569590988395708416,positive,1.0,,,American,,howiemandel,,3,@AmericanAir I love your company and your staff is amazing. They just made an uncomfortable situation comfortable,,2015-02-22 12:14:08 -0800,,Pacific Time (US & Canada) 158 | 569590965880532993,negative,1.0,Customer Service Issue,1.0,American,,KCBobolz,,0,@AmericanAir I wait 2+ hrs for CS to call me back re why flt is cxld/protection & they hang up the minute I answer on 1st ring?,,2015-02-22 12:14:03 -0800,"Milwaukee County, Wisconsin",Central Time (US & Canada) 159 | 569590892085915649,negative,1.0,Customer Service Issue,1.0,American,,andyellwood,,0,"@AmericanAir I've been on hold for 55 mins about my Cancelled Flighted international flight. Am out of country, so can't leave a call back #. Help?",,2015-02-22 12:13:45 -0800,"New York, New York",Eastern Time (US & Canada) 160 | 569590191758962688,negative,1.0,Late Flight,0.3358,American,,Jill_Lynnette,,0,I just need a place to sleep when I land without accommodations in PLS @AmericanAir!,,2015-02-22 12:10:58 -0800,,Eastern Time (US & Canada) 161 | 569590013278756865,positive,0.6274,,0.0,American,,Flora_Lola_NYC,,0,@AmericanAir Love the new planes for the JFK-LAX run. Maybe one day I will be on one where the amenities all function. #NoCharge #Ever,,2015-02-22 12:10:16 -0800,,Eastern Time (US & Canada) 162 | 569589959088173056,negative,1.0,Can't Tell,1.0,American,,yourlama,,0,"@AmericanAir Call me Chairman, or call me Emerald. After what you did today to me, you can call me a former customer.","[32.9070889, -97.03785947]",2015-02-22 12:10:03 -0800,, 163 | 569589643487928321,positive,1.0,,,American,,DrCaseyJRudkin,,0,@AmericanAir Flight 236 was great. Fantastic cabin crew. A+ landing. 
#thankyou #JFK http://t.co/dRW08djHAI,"[40.64946781, -73.76624703]",2015-02-22 12:08:48 -0800,East Coast, 164 | 569589460226183168,negative,1.0,Late Flight,1.0,American,,cataattack,,0,@AmericanAir Flight 953 NYC-Buenos Aires has been delay since yesterday at 10PM. Is going to take off at 3.30PM now? Give us answers!,,2015-02-22 12:08:04 -0800,"Buenos Aires, Argentina",Buenos Aires 165 | 569588816438169600,negative,1.0,Cancelled Flight,1.0,American,,KristinaMeyer7,,0,"@AmericanAir Flight Cancelled Flightled, can't go home until tomorrow. I could use dinner and a play, @AmericanAir! It's my first time in NYC.",,2015-02-22 12:05:30 -0800,,Eastern Time (US & Canada) 166 | 569588651925098496,positive,1.0,,,American,,jlhalldc,,0,"Thank you. “@AmericanAir: @jlhalldc Customer Relations will review your concerns and contact you back directly, John.”",,2015-02-22 12:04:51 -0800,"Washington, DC",Eastern Time (US & Canada) 167 | 569588591602458624,negative,1.0,Customer Service Issue,1.0,American,,jontgreen89,,0,@AmericanAir How do I change my flight if the phone system keeps telling me that the representatives are busy?,,2015-02-22 12:04:37 -0800,"Waco, TX",Central Time (US & Canada) 168 | 569588473050611712,positive,1.0,,,American,,Laurelinesblog,,0,@AmericanAir Thanks! He is.,,2015-02-22 12:04:09 -0800,"Chapel Hill, NC", 169 | 569588464896876545,negative,1.0,Bad Flight,1.0,American,,MDDavis7,,0,@AmericanAir thx for nothing on getting us out of the country and back to US. Broken plane? Come on. Get another one.,,2015-02-22 12:04:07 -0800,US,Eastern Time (US & Canada) 170 | 569587813856841728,neutral,0.6759999999999999,,0.0,American,,Chad_SMFYM,,0,"“@AmericanAir: @TilleyMonsta George, that doesn't look good. 
Please follow this link to start the refund process: http://t.co/4gr39s91Dl”😂",,2015-02-22 12:01:31 -0800,,Central Time (US & Canada) 171 | 569587705937600512,negative,1.0,Cancelled Flight,1.0,American,,RussellsWriting,,0,"@AmericanAir my flight was Cancelled Flightled, leaving tomorrow morning. Auto rebooked for a Tuesday night flight but need to arrive Monday.",,2015-02-22 12:01:06 -0800,Los Angeles,Arizona 172 | 569587691626622976,negative,0.6684,Late Flight,0.6684,American,,GolfWithWoody,,0,@AmericanAir right on cue with the delays👌,,2015-02-22 12:01:02 -0800,,Quito 173 | 569587686496825344,positive,0.3487,,0.0,American,,KristenReenders,,0,@AmericanAir thank you we got on a different flight to Chicago.,,2015-02-22 12:01:01 -0800,, 174 | 569587371693355008,negative,1.0,Customer Service Issue,1.0,American,,itsropes,,0,@AmericanAir leaving over 20 minutes Late Flight. No warnings or communication until we were 15 minutes Late Flight. That's called shitty customer svc,,2015-02-22 11:59:46 -0800,Texas, 175 | 569587242672398336,neutral,1.0,,,American,,sanyabun,,0,@AmericanAir Please bring American Airlines to #BlackBerry10,,2015-02-22 11:59:15 -0800,"Nigeria,lagos", 176 | 569587188687634433,negative,1.0,Customer Service Issue,0.6659,American,,SraJackson,,0,"@AmericanAir you have my money, you change my flight, and don't answer your phones! Any other suggestions so I can make my commitment??",,2015-02-22 11:59:02 -0800,New Jersey,Eastern Time (US & Canada) 177 | 569587140490866689,neutral,0.6771,,0.0,American,,daviddtwu,,0,@AmericanAir we have 8 ppl so we need 2 know how many seats are on the next flight. 
def calculate_sentiment_change(current_data, previous_data):
    """Return the current sentiment distribution and its change since the last load.

    Args:
        current_data: DataFrame holding the freshly loaded tweets; must contain
            an ``airline_sentiment`` column.
        previous_data: DataFrame from the previous load; must contain an
            ``airline_sentiment`` column as well.

    Returns:
        A ``(current, change)`` tuple of Series indexed by sentiment label.
        ``current`` is the share of each label in ``current_data`` in percent;
        ``change`` is the difference to ``previous_data`` in percentage points.

    Raises:
        KeyError: if either frame lacks the ``airline_sentiment`` column
            (e.g. a completely empty ``DataFrame()``).
    """
    current_share = current_data['airline_sentiment'].value_counts(normalize=True) * 100
    previous_share = previous_data['airline_sentiment'].value_counts(normalize=True) * 100

    # A plain `current - previous` aligns on the index and produces NaN for any
    # label that occurs in only one of the two snapshots. Treat a missing label
    # as 0 % so that every delta is a real number.
    change = current_share.sub(previous_share, fill_value=0)
    return current_share, change
len(st.session_state.previous_data) 47 | tweet_increase = total_tweets - previous_tweets 48 | 49 | # Calculate sentiment changes 50 | current_sentiment, sentiment_change = calculate_sentiment_change(current_data, st.session_state.previous_data) 51 | 52 | # Display metrics in a horizontal row using st.columns 53 | col1, col2, col3, col4 = st.columns(4) 54 | col1.metric("Total Tweets", total_tweets, f"+{tweet_increase}") 55 | col2.metric("Positive Tweets", f"{current_sentiment.get('positive', 0):.2f}%", f"{sentiment_change.get('positive', 0):+.2f}%") 56 | col3.metric("Neutral Tweets", f"{current_sentiment.get('neutral', 0):.2f}%", f"{sentiment_change.get('neutral', 0):+.2f}%") 57 | col4.metric("Negative Tweets", f"{current_sentiment.get('negative', 0):.2f}%", f"{sentiment_change.get('negative', 0):+.2f}%") 58 | else: 59 | # Display initial metrics when no previous data is available 60 | current_sentiment = current_data['airline_sentiment'].value_counts(normalize=True) * 100 61 | 62 | # Display initial metrics with no deltas 63 | col1, col2, col3, col4 = st.columns(4) 64 | col1.metric("Total Tweets", total_tweets) 65 | col2.metric("Positive Tweets", f"{current_sentiment.get('positive', 0):.2f}%") 66 | col3.metric("Neutral Tweets", f"{current_sentiment.get('neutral', 0):.2f}%") 67 | col4.metric("Negative Tweets", f"{current_sentiment.get('negative', 0):.2f}%") 68 | 69 | # Update session state to store the current data for the next refresh 70 | st.session_state.previous_data = current_data 71 | 72 | # Sidebar settings 73 | st.sidebar.markdown("This application is a Streamlit dashboard to analyse the sentiment of Tweets") 74 | 75 | st.sidebar.subheader("Show random tweet") 76 | random_tweet = st.sidebar.radio("Sentiment", ("positive", "negative", "neutral")) 77 | tweet_text = current_data.query("sentiment_label == @random_tweet")[["text"]].sample(1).iat[0, 0] 78 | st.sidebar.markdown(tweet_text) 79 | 80 | # Create tabs for the main visualizations 81 | tabs = 
st.tabs(["Sentiment Overview", "Airline Breakdown", "Word Cloud", "Sampled Tweets"]) 82 | 83 | with tabs[0]: 84 | # Sentiment Overview: Number of tweets by sentiment 85 | st.markdown("### Number of tweets by sentiment") 86 | select = st.selectbox("Visualisation type", ["Histogram", "Pie chart"], key="1") 87 | sentiment_count = current_data['sentiment_label'].value_counts() 88 | sentiment_count = pd.DataFrame({"Sentiment": sentiment_count.index, "Tweets": sentiment_count.values}) 89 | 90 | if select == "Histogram": 91 | fig = px.bar(sentiment_count, x='Sentiment', y='Tweets', color="Tweets") 92 | else: 93 | fig = px.pie(sentiment_count, values="Tweets", names="Sentiment") 94 | st.plotly_chart(fig) 95 | 96 | with tabs[1]: 97 | # Airline Breakdown: Breakdown airline tweets by sentiment 98 | st.markdown("### Breakdown airline tweets by sentiment") 99 | choice = st.multiselect('Pick airlines', tuple(current_data.airline.unique()), key="0") 100 | 101 | if len(choice) > 0: 102 | choice_data = current_data[current_data.airline.isin(choice)] 103 | fig_choice = px.histogram(choice_data, x='airline', y='sentiment_label', histfunc='count', color='airline_sentiment', 104 | facet_col='sentiment_label', labels={'sentiment_label': 'tweets'}, height=600, width=800) 105 | st.plotly_chart(fig_choice) 106 | 107 | with tabs[2]: 108 | # Word Cloud: Display word clouds for different sentiments 109 | st.markdown("### Word Cloud for Selected Sentiment") 110 | word_sentiment = st.radio("Display word cloud for which sentiment?", ("positive", "negative", "neutral")) 111 | 112 | df = current_data[current_data['sentiment_label'] == word_sentiment] 113 | words = ' '.join(df['text']) 114 | processed_words = ' '.join([word for word in words.split() if "http" not in word and not word.startswith("@") and word != "RT"]) 115 | wordcloud = WordCloud(stopwords=STOPWORDS, background_color='white', height=640, width=800).generate(processed_words) 116 | plt.imshow(wordcloud) 117 | plt.xticks([]) 118 | 
import os

import pandas as pd
from dotenv import load_dotenv
from pymongo import MongoClient

# Load environment variables from .env file
load_dotenv()


def get_mongo_client():
    """Create a MongoDB client from the ``MONGO_URI`` environment variable.

    Returns:
        A new ``MongoClient``. The caller owns it and should close it.
    """
    # NOTE(review): os.getenv returns None when MONGO_URI is unset; MongoClient
    # then presumably falls back to its default host - confirm this is intended.
    mongo_uri = os.getenv("MONGO_URI")
    return MongoClient(mongo_uri)


def fetch_tweets_data():
    """Load every document of the tweets collection into a DataFrame.

    Reads the ``sentiment_airline_tweets`` collection of the
    ``Prefect-tutorial`` database and converts ``tweet_timestamp`` to
    datetimes when that field is present.

    Returns:
        A DataFrame with one row per stored document. It has no columns at
        all when the collection is empty.
    """
    # Use the client as a context manager so its connections are released as
    # soon as the cursor has been drained into a list.
    with get_mongo_client() as client:
        collection = client['Prefect-tutorial']['sentiment_airline_tweets']
        records = list(collection.find())

    df = pd.DataFrame(records)

    # Ensure proper datetime conversion for the 'tweet_timestamp' field. An
    # empty collection yields a frame without columns, so guard the lookup
    # instead of failing with a KeyError.
    if 'tweet_timestamp' in df.columns:
        df['tweet_timestamp'] = pd.to_datetime(df['tweet_timestamp'])

    return df
import importlib
import sys


def check_import(module_name, install_alias=None):
    """Try to import a module and report its version on stdout.

    Args:
        module_name: Importable module name; dotted names are supported.
        install_alias: Name to show in the report when the distribution is
            published under a different name than the import name.

    Returns:
        The imported module, or ``None`` when the import fails.
    """
    name = install_alias if install_alias is not None else module_name
    try:
        # import_module returns the requested module itself, also for dotted
        # names, whereas __import__ would return the top-level package.
        module = importlib.import_module(module_name)
    except ImportError:
        print(f"***** WARNING ***** {name} is not installed")
        return None

    version = getattr(module, '__version__', '(not listed)')
    print(f"{name} - version {version} installed")
    return module


def check_spacy():
    """Smoke-test the spaCy + spacytextblob installation.

    Loads the ``en_core_web_sm`` model, adds the ``spacytextblob`` pipe, runs
    a sample sentence through it and prints a warning when the expected
    entities or polarity are not produced. Problems are reported on stdout.
    """
    try:
        import spacy
        # Imported for its side effect only; presumably this registers the
        # "spacytextblob" pipe factory used below - TODO confirm.
        from spacytextblob.spacytextblob import SpacyTextBlob  # noqa: F401

        nlp = spacy.load("en_core_web_sm")
        nlp.add_pipe("spacytextblob")

        text = "This workshop on Prefect by Adam is going to be awesome!"
        doc = nlp(text)

        # Explicit checks instead of `assert`, which is stripped under -O.
        entities = {ent.text for ent in doc.ents}
        if not {"Prefect", "Adam"} <= entities:
            print("***** WARNING ***** Entities extraction failed")

        if doc._.blob.polarity != 1:
            print("***** WARNING ***** Text polarity is incorrect")
    except ImportError as e:
        # Covers ModuleNotFoundError too. The message is printed verbatim:
        # slicing it on ':' raised IndexError for messages without a colon.
        print(f"***** WARNING ***** {e}")
    except OSError:
        print("***** WARNING ***** Spacy model not installed! Please run: 'python -m spacy download en_core_web_sm'")


def check_mongodb():
    """Ping the MongoDB deployment configured via ``MONGO_URI`` in ``.env``.

    Prints a success message when the ping succeeds and a warning otherwise.
    """
    try:
        from dotenv import dotenv_values
        import pymongo
        from pymongo.errors import ServerSelectionTimeoutError
        from pymongo.server_api import ServerApi
    except ImportError as e:
        # Bail out before any handler below can reference an unbound name.
        print(f"***** WARNING ***** {e}")
        return

    client = None
    try:
        config = dotenv_values(".env")
        client = pymongo.MongoClient(config.get("MONGO_URI"), server_api=ServerApi('1'))
        client.admin.command("ping")
        print("Pinged your deployment. You successfully connected to MongoDB Atlas!")
    except ServerSelectionTimeoutError:
        print("***** WARNING ***** Cannot connect to MongoDB server. Check your MONGO_URI in the .env file.")
    except Exception as e:
        # Diagnostic script: report any other failure instead of crashing.
        print(f"***** WARNING ***** {e}")
    finally:
        if client is not None:
            client.close()


if __name__ == "__main__":
    # Check dependencies
    check_import('prefect')
    check_import('kafka', 'kafka-python-ng')
    check_import('spacy')
    check_import('spacytextblob')
    check_import('pymongo')
    check_import('dotenv', 'python-dotenv')

    # Check Spacy and MongoDB
    check_spacy()
    check_mongodb()
def test_add(number_data):
    """`add` returns the sum of its operands when its wrapped function is called directly."""
    expected = number_data["expected_sum"]
    # `.fn` calls the undecorated function instead of going through Prefect.
    result = add.fn(number_data["a"], number_data["b"])
    assert result == expected, f"Expected {expected}, got {result}"


def test_square_num(number_data):
    """`square_num` squares the value that `add` is expected to produce."""
    expected = number_data["expected_sum_square"]
    result = square_num.fn(number_data["expected_sum"])
    assert result == expected, f"Expected {expected}, got {result}"


def test_add_and_square_sysout_flow(number_data, capsys):
    """The flow's wrapped function prints the squared sum to stdout."""
    first, second = number_data["a"], number_data["b"]
    add_and_square.fn(first, second)  # return value is irrelevant; only stdout is checked
    printed = capsys.readouterr().out
    wanted = str(number_data["expected_sum_square"])
    assert wanted in printed, "Flow output does not match expected value"
@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        ("Hello World!", "hello world!"),
        (" Whitespace ", "whitespace"),
        ("MIXED CASE", "mixed case"),
    ],
)
def test_lowercase_text(raw, expected):
    """`lowercase_text` is expected to lowercase and trim its input."""
    actual = lowercase_text.fn(raw)
    assert actual == expected


@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        ("Check out this link: https://example.com", "Check out this link: WEBADDRESS"),
        ("Visit www.example.org for more info", "Visit WEBADDRESS for more info"),
    ],
)
def test_strip_url(raw, expected):
    """`strip_url` is expected to replace URLs with the WEBADDRESS token."""
    actual = strip_url.fn(raw)
    assert actual == expected


@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        ("Hello @user!", "Hello USERHANDLE"),
        ("@JohnDoe mentioned me", "USERHANDLE mentioned me"),
    ],
)
def test_strip_user(raw, expected):
    """`strip_user` is expected to replace @-handles with the USERHANDLE token."""
    actual = strip_user.fn(raw)
    assert actual == expected


@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        (":) :-D", "smile EMOJI smile EMOJI"),
        (":(", "sad EMOJI"),
    ],
)
def test_replace_emoji(raw, expected):
    """`replace_emoji` is expected to spell out emoticons as '<mood> EMOJI'."""
    actual = replace_emoji.fn(raw)
    assert actual == expected


@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        ("running running running", "running running running"),
        ("better better best", "better better best"),
    ],
)
def test_lemmatize_text(raw, expected):
    """`lemmatize_text` is expected to leave these sample phrases unchanged."""
    actual = lemmatize_text.fn(raw)
    assert actual == expected


# Testing the flow
@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        ("Hello :) https://example.com @user", "hello smile EMOJI WEBADDRESS USERHANDLE"),
        ("No URLs or handles", "no url or handle"),
    ],
)
def test_process_text(raw, expected):
    """`process_text` applies the full cleaning pipeline; the run logger is replaced by a mock."""
    with patch("solution.s02b_sentiment_pipeline_v2.get_run_logger") as fake_get_logger:
        fake_get_logger.return_value = MagicMock()  # avoid logging issues during the call
        actual = process_text.fn(raw)
    assert actual == expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("I love this!", 0.625),
        ("I hate this!", -1.0),
    ],
)
def test_calc_sentiment(text, expected):
    """`calc_sentiment` returns a score within 0.01 of the expected polarity."""
    deviation = abs(calc_sentiment(text) - expected)
    assert deviation < 0.01


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("I love this!", 0.625),
        ("I hate this!", -1.0),
    ],
)
def test_sentiment_analysis(text, expected):
    """`sentiment_analysis` returns a score within 0.01 of the expected polarity; the run logger is mocked."""
    with patch("solution.s02b_sentiment_pipeline_v2.get_run_logger") as fake_get_logger:
        fake_get_logger.return_value = MagicMock()  # avoid logging issues during the call
        score = sentiment_analysis(text)
    # Allow a small tolerance
    deviation = abs(score - expected)
    assert deviation < 0.01