├── .gitignore ├── README.md ├── notebooks ├── Compare_APIs.ipynb ├── Compare_Timing_Logs.ipynb ├── GenGlobalTrust_indexed.ipynb ├── GenLocalTrust.ipynb ├── GenPersonalGraph.ipynb ├── README.md ├── debug_prod_graph.ipynb ├── igraph-engagement_addr.ipynb ├── igraph-engagement_fid.ipynb ├── requirements.txt └── scripts_export │ └── GenPersonalGraph.py ├── pipeline ├── .env.sample ├── Dockerfile ├── README.md ├── casts │ ├── __init__.py │ ├── cast_db_utils.py │ ├── main.py │ ├── main_fetch_top_casters.py │ └── main_fetch_top_spammers.py ├── channels │ ├── Bot_Fids.csv │ ├── Seed_Fids.csv │ ├── Top_Channels.csv │ ├── Trending_Channels.csv │ ├── __init__.py │ ├── channel_db_utils.py │ ├── channel_queries.py │ ├── channel_utils.py │ ├── main.py │ ├── main_channel_rank.py │ ├── main_fetch_channel_top_casters.py │ ├── main_metrics.py │ ├── main_notify_daily_trending.py │ ├── main_notify_leaderboard.py │ ├── main_notify_weekly_mods.py │ ├── main_openrank.py │ ├── main_points.py │ ├── main_tokens.py │ └── openrank_utils.py ├── config.py ├── crontab.txt ├── cura_utils.py ├── dag_utils │ ├── clear_task_instance.py │ ├── combine_csv.py │ └── dune_backup.py ├── dags │ ├── archived │ │ ├── dag_automod.py │ │ ├── dag_backup_sandbox_db.py │ │ ├── dag_copy_graph_files_to_sandbox_dev_v1.py │ │ ├── dag_degen_tips_processing.py │ │ ├── dag_gen_personal_graph_replica_v0.py │ │ ├── dag_insert_degen_ranking_v0.py │ │ ├── dag_monitor_sandbox.py │ │ ├── dag_run_frame_pipeline_v0.py │ │ ├── degen │ │ │ ├── calculate_rank.py │ │ │ └── create_degen_sql_functions.py │ │ ├── extractors │ │ │ ├── dag_warpcast_channel_followers.py │ │ │ ├── dag_warpcast_channel_members.py │ │ │ └── dag_warpcast_channels.py │ │ └── sandbox │ │ │ ├── dag_sync_sandbox_casts.py │ │ │ ├── dag_sync_sandbox_channel_fids.py │ │ │ ├── dag_sync_sandbox_db_dev.py │ │ │ ├── dag_sync_sandbox_globaltrust.py │ │ │ └── dag_sync_sandbox_labels.py │ ├── cura │ │ ├── dag_direct_cast_join_requests.py │ │ ├── dag_run_autoinvite_rules.py │ │ └── dag_run_quote_casts.py │ ├── dag_backup_to_s3_v1.py │ ├── dag_copy_graph_files_to_replicas_v1.py │ ├── dag_gen_channel_openrank.py │ ├── dag_gen_channel_ranking_v3.py │ ├── dag_gen_channel_ranking_v4.py │ ├── dag_gen_globaltrust_v1.py │ ├── dag_gen_personal_graph_replica_v1.py │ ├── dag_notify_channel_daily_trending.py │ ├── dag_notify_channel_leaderboard.py │ ├── dag_notify_channel_weekly_mods.py │ ├── dag_refresh_rank_view_v0.py │ ├── dag_run_cast_pipeline_v0.py │ ├── dag_update_channel_points.py │ ├── dag_update_channel_tokens.py │ ├── extractors │ │ └── dag_cura_mod.py │ ├── monitoring │ │ ├── __init__.py │ │ ├── dag_monitor_nindexer.py │ │ └── dag_monitor_replication.py │ ├── one_off │ │ ├── .placeholder │ │ ├── dag_gen_globaltrust_by_date_v0.py │ │ ├── dag_gen_globaltrust_by_date_v1.py │ │ ├── dag_insert_to_dune_table.py │ │ ├── dag_migrate_dune_table.py │ │ ├── dag_trial_branch.py │ │ ├── dag_trial_sql.py │ │ ├── dag_trial_task_groups.py │ │ └── dag_trial_trigger.py │ ├── pg_to_dune │ │ ├── .env.sample │ │ ├── app │ │ │ └── check_last_timestamp.py │ │ └── upload_to_dune.sh │ ├── reports │ │ ├── dag_gen_channel_metrics.py │ │ └── dag_gen_labels.py │ └── triggers │ │ ├── trigger_gen_channel_ranking_v3.py │ │ └── trigger_gen_channel_ranking_v4.py ├── db_utils.py ├── docker-compose.yaml ├── extractors │ ├── automod_extractor.py │ ├── channel_extractor_utils.py │ ├── cura_mod_extractor.py │ ├── extract_channel_data.sh │ ├── extract_channel_fids.sh │ ├── extract_cura_mod.sh │ ├── main_channel_data.py │ └── 
main_channel_fids.py ├── frames │ ├── __init__.py │ ├── frames_db_utils.py │ ├── incremental_load_cast_mapping.sql │ ├── incremental_load_labels.sql │ ├── main.py │ ├── scrape_utils.py │ └── test_urls.py ├── globaltrust │ ├── __init__.py │ ├── compute.py │ ├── export_localtrust_daily_stats.sql │ ├── gen_globaltrust.py │ ├── queries.py │ └── test_data.py ├── go_eigentrust.py ├── graph │ ├── __init__.py │ ├── export_existingConnections_addr.sql │ ├── export_existingConnections_fid.sql │ ├── export_l1rep6rec3m12enhancedConnections_addr.sql │ ├── export_l1rep6rec3m12enhancedConnections_fid.sql │ ├── fetch_nodes_edges.py │ ├── gen_igraph.py │ ├── gen_personal_graph_amp.py │ ├── gen_personal_graph_amp_v1.py │ ├── graph_utils.py │ ├── rechunk_graph_pqt.py │ └── serve_igraph.py ├── igraph-docker-compose.yml ├── igraph.Dockerfile ├── igraph.nginx.conf ├── logs │ └── .placeholder ├── plugins │ ├── .placeholder │ ├── __init__.py │ └── hooks │ │ ├── __init__.py │ │ ├── common.py │ │ ├── discord.py │ │ └── pagerduty.py ├── requirements.txt ├── run_cast_pipeline.sh ├── run_channel_metrics.sh ├── run_channel_openrank.sh ├── run_channel_scraper_v3.sh ├── run_channel_scraper_v4.sh ├── run_download_pqt_files_v1.sh ├── run_eigen2_postgres_sql.sh ├── run_eigen8_postgres_sql.sh ├── run_fetch_channel_top_caster.sh ├── run_fetch_top_caster.sh ├── run_fetch_top_spammers.sh ├── run_frame_scraper.sh ├── run_globaltrust_pipeline.sh ├── run_graph_pipeline.sh ├── run_notify_channel_daily_trending.sh ├── run_notify_channel_leaderboard.sh ├── run_notify_channel_weekly_mods.sh ├── run_personal_graph_pipeline_v1.sh ├── run_update_channel_points.sh ├── run_update_channel_tokens.sh ├── samples │ ├── localtrust-engagement.csv │ ├── localtrust-following.csv │ └── pretrust.csv ├── schema │ ├── globaltrust_config.sql │ ├── k3l_objects.sql │ ├── k3l_schema.sql │ ├── neynar_db_schema.sql │ ├── pretrust_v2.sql │ └── replicator_db_schema.sql ├── scripts │ ├── archived │ │ ├── run_create_degen_db_functions.sh │ │ ├── run_personal_graph_pipeline.sh │ │ ├── run_sandbox_backup.sh │ │ └── run_urlextract_pipeline.sh │ └── one_off │ │ ├── diff_db_table.py │ │ ├── diff_json_api.py │ │ └── run_cast_pipeline_gapfills.sh ├── sshtunnel.Dockerfile ├── timer.py ├── tmp │ └── .placeholder └── utils.py ├── scripts ├── .placeholder └── certs │ └── graphcast_jobs │ ├── .env.sample │ ├── README.md │ ├── graph.cast.k3l.io │ ├── graph.castN.k3l.io │ ├── install_certs.sh │ └── push_certs.sh ├── serve ├── .dockerignore ├── .env.sample ├── .gitignore ├── .idea │ ├── .gitignore │ ├── .name │ ├── codeStyles │ │ └── codeStyleConfig.xml │ ├── dataSources.xml │ ├── data_source_mapping.xml │ ├── farcaster-graph-serve.iml │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ ├── modules.xml │ ├── sqldialects.xml │ ├── vcs.xml │ └── watcherTasks.xml ├── Dockerfile ├── README.md ├── app │ ├── __init__.py │ ├── config.py │ ├── dependencies │ │ ├── __init__.py │ │ ├── cache_db_utils.py │ │ ├── db_pool.py │ │ ├── db_utils.py │ │ ├── graph.py │ │ ├── logging.py │ │ └── memoize_utils.py │ ├── graph_loader.py │ ├── main.py │ ├── models │ │ ├── __init__.py │ │ ├── channel_model.py │ │ ├── feed_model.py │ │ ├── graph_model.py │ │ └── score_model.py │ ├── routers │ │ ├── __init__.py │ │ ├── cast_router.py │ │ ├── channel_router.py │ │ ├── direct_router.py │ │ ├── frame_router.py │ │ ├── globaltrust_router.py │ │ ├── graph_router.py │ │ ├── localtrust_router.py │ │ ├── metadata_router.py │ │ ├── token_router.py │ │ └── user_router.py │ ├── telemetry.py │ 
└── utils.py ├── docker-compose.yml ├── poetry.lock ├── pyproject.toml ├── samples │ ├── fc_90dv3_fid_SUCCESS │ ├── fc_90dv3_fid_df.pkl │ ├── fc_90dv3_fid_ig.pkl │ ├── fc_engagement_SUCCESS │ ├── fc_engagement_df.pkl │ ├── fc_engagement_fid_SUCCESS │ ├── fc_engagement_fid_df.pkl │ ├── fc_engagement_fid_ig.pkl │ ├── fc_engagement_idx.pkl │ ├── fc_engagement_ig.pkl │ ├── fc_following_SUCCESS │ ├── fc_following_df.pkl │ ├── fc_following_fid_SUCCESS │ ├── fc_following_fid_df.pkl │ ├── fc_following_fid_ig.pkl │ ├── fc_following_idx.pkl │ ├── fc_following_ig.pkl │ ├── fid_scores.json │ ├── lt_existingConnections_addr.csv │ ├── lt_existingConnections_fid.csv │ ├── lt_fboostedl1rep3rec6m12_90d_fid.csv │ ├── lt_l1rep6rec3m12enhancedConnections_addr.csv │ ├── lt_l1rep6rec3m12enhancedConnections_fid.csv │ └── personal_graph.parquet ├── scratchpad.md ├── scripts │ └── lint.sh └── static │ └── favicon.png └── sql ├── counts_by_day.sql ├── counts_by_table.sql ├── k3l_requirements.sql ├── neynar-replica ├── .env.sample ├── Dockerfile ├── Dockerfile.alpine ├── Dockerfile.noble ├── docker-compose.yml ├── entrypoint.sh ├── pg_hba.conf ├── postgresql.conf └── postgresql.conf.orig ├── replicator_drop_fk.sql └── replicator_schema.sql /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | dist/ 3 | .env 4 | .env.docker 5 | .*.credentials.json 6 | build/ 7 | .venv 8 | *.pyc 9 | **/.ipynb_checkpoints 10 | **/.DS_Store 11 | **/lib/ 12 | notebooks/data/ 13 | **/pg_to_dune/csv 14 | pipeline/logs 15 | **/.vscode 16 | certificates 17 | **/tmp 18 | # Vim swap files 19 | .*.sw? 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Extract Graph-based insights from Farcaster 2 | The project is broken into three sub-projects: 3 | 4 | 1. `notebooks` - Jupyter notebooks for data exploration and prototyping graph queries. 5 | 2. `pipeline` - python scripts to generate graphs and dataframes that can be used to serve graph-based queries. 6 | 3. `serve` - FastAPI server to serve API requests for querying the graph from Farcaster. 7 | 8 | __NOTE__ For details on how to deploy an individual sub-project, check out the Readme docs under that sub-project. 9 | 10 | -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Pre-requisites 2 | Assuming that you have Python and [pip](https://pip.pypa.io/en/stable/) installed on your system (maybe in a [virtualenv](https://docs.python.org/3/library/venv.html)), you need to `pip install -r requirements.txt` 3 | 4 | # Exploring the Notebooks 5 | Run `jupyter notebook` and explore the notebooks in your default browser. 
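For example, a minimal setup from the repository root might look like the following. This is only a sketch: it assumes Python 3 with the `venv` module available and a POSIX shell, and the `.venv` path is just an illustration.

```bash
# create and activate an isolated environment (path is illustrative)
python3 -m venv .venv
source .venv/bin/activate

# install the notebook dependencies
pip install -r notebooks/requirements.txt

# launch Jupyter from the notebooks directory and open any notebook listed above
cd notebooks
jupyter notebook
```

If Jupyter does not open a browser automatically, copy the URL it prints into your browser.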
-------------------------------------------------------------------------------- /notebooks/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | pandas 3 | igraph 4 | niquests 5 | ipython 6 | -------------------------------------------------------------------------------- /pipeline/.env.sample: -------------------------------------------------------------------------------- 1 | DB_HOST="ip.address.or.host" 2 | DB_PORT=5432 3 | DB_USER="usually_postgres" 4 | DB_NAME="db_name_like_lens_bigquery" 5 | DB_PASSWORD="db_password" 6 | 7 | REMOTE_DB_HOST="ip.address.or.host" 8 | REMOTE_DB_PORT=9541 9 | 10 | TBL_CHANNEL_FIDS='DANGER_deletemefordefault_or_changeme' 11 | 12 | PERSONAL_IGRAPH_INPUT='PATH_TO_IG_PKL' 13 | PERSONAL_IGRAPH_URL='CHANGE_THIS_URL' 14 | 15 | IS_TEST='false' 16 | 17 | AIRFLOW_UID=0 18 | AIRFLOW_GID=0 19 | AIRFLOW__CORE__FERNET_KEY='changeme' 20 | 21 | SSH_KEY_PATH="changeme" 22 | DUNE_API_KEY="changeme" 23 | 24 | # Safe Defaults 25 | POSTGRES_TIMEOUT_SECS=60 26 | 27 | GO_EIGENTRUST_URL='http://localhost:8080' 28 | GO_EIGENTRUST_TIMEOUT_MS=600000 29 | GO_EIGENTRUST_BIND_SRC='/tmp' 30 | GO_EIGENTRUST_BIND_TARGET='/tmp' 31 | GO_EIGENTRUST_FILE_MODE='false' 32 | EIGENTRUST_ALPHA=0.5 33 | EIGENTRUST_EPSILON=1.0 34 | EIGENTRUST_MAX_ITER=50 35 | EIGENTRUST_FLAT_TAIL=2 36 | 37 | FRAMES_NAP_SECS=10 38 | FRAMES_SLEEP_SECS=300 39 | FRAMES_BATCH_SIZE=1000 40 | FRAMES_SCRAPE_CONCURRENCY=10 41 | FRAMES_SCRAPE_CONNECT_TIMEOUT_SECS=5 42 | FRAMES_SCRAPE_READ_TIMEOUT_SECS=10 43 | 44 | CASTS_SLEEP_SECS=10 45 | CASTS_BATCH_LIMIT=100000 46 | 47 | WARPCAST_CHANNELS_TIMEOUT_SECS=5 48 | CHANNEL_SLEEP_SECS=1 49 | 50 | 51 | LOG_LEVEL='INFO' 52 | LOG_FORMAT='[%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(funcName)s ] %(message)s' 53 | LOGURU_FORMAT='{time:YYYY-MM-DD HH:mm:ss} | {module}:{file}:{function}:{line} | {level} | {message}' 54 | LOG_PATH='/tmp/' 55 | -------------------------------------------------------------------------------- /pipeline/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:latest 2 | # Switch to root to install additional packages 3 | USER root 4 | 5 | # Fix potential permission issues and update package list 6 | RUN chmod -R a+rX /var/lib/apt/lists /var/cache/apt/archives && \ 7 | apt-get clean && \ 8 | rm -rf /var/lib/apt/lists/* && \ 9 | mkdir -p /var/lib/apt/lists/partial && \ 10 | apt-get update && \ 11 | apt-get -y install zip 12 | 13 | RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" 14 | RUN unzip awscliv2.zip 15 | 16 | RUN ./aws/install 17 | 18 | # Switch back to the airflow user 19 | USER airflow 20 | 21 | # Set working directory 22 | WORKDIR /pipeline 23 | 24 | # Copy only the necessary files for initial setup 25 | COPY requirements.txt /pipeline/requirements.txt 26 | COPY .env /pipeline/.env 27 | 28 | # Source environment variables 29 | RUN /bin/bash -c "source /pipeline/.env" 30 | 31 | RUN pip install --upgrade pip 32 | 33 | RUN pip install -r /pipeline/requirements.txt 34 | RUN pip install apache-airflow-providers-pagerduty==3.7.2 discord-webhook==1.3.1 apache-airflow-providers-ssh==3.11.2 35 | 36 | 37 | -------------------------------------------------------------------------------- /pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Pre-requisites 2 | 1. 
Install [psql](https://www.timescale.com/blog/how-to-install-psql-on-mac-ubuntu-debian-windows/) on your local machine. 3 | 2. Run an instance of Postgres DB with data from Farcaster (installed locally or on a remote server) 4 | 3. Install [Python 3.12](https://www.python.org/downloads/) 5 | 4. Create a Python [virtualenv](https://docs.python.org/3/library/venv.html) somewhere on your machine - for example,`python3 -m venv .venv` will create a virtualenv in your current directory. 6 | 5. Copy/rename the `.env.sample` file into `.env` and update the details of the Postgres DB from step 2 and the virtualenv from step 4. 7 | 6. If creating `.venv` fails, remove any partial environment with `rm -rf .venv`, install the venv module with `sudo apt install python3.12-venv`, and retry step 4. 8 | 9 | # Run the pipeline 10 | `sh run_pipeline.sh -w . -o /tmp/fc_graph` 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /pipeline/casts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/casts/__init__.py -------------------------------------------------------------------------------- /pipeline/casts/main_fetch_top_casters.py: -------------------------------------------------------------------------------- 1 | # standard dependencies 2 | import sys 3 | from datetime import date 4 | 5 | # local dependencies 6 | from config import settings 7 | import utils 8 | from . import cast_db_utils 9 | 10 | # 3rd party dependencies 11 | from dotenv import load_dotenv 12 | from loguru import logger 13 | from sqlalchemy import create_engine 14 | 15 | logger.remove() 16 | level_per_module = { 17 | "": settings.LOG_LEVEL, 18 | "silentlib": False 19 | } 20 | logger.add(sys.stdout, 21 | colorize=True, 22 | format=settings.LOGURU_FORMAT, 23 | filter=level_per_module, 24 | level=0) 25 | 26 | def main(): 27 | pg_dsn = settings.ALT_POSTGRES_DSN.get_secret_value() 28 | df = cast_db_utils.fetch_top_casters_df(logger, pg_dsn) 29 | # top_casters = [] 30 | # for caster in casters: 31 | # top_casters.append({'i': caster['i'], 'v': caster['v']}) 32 | 33 | # df = pd.DataFrame(data=top_casters) 34 | df["date_iso"] = date.today() 35 | logger.info(utils.df_info_to_string(df, with_sample=True)) 36 | 37 | postgres_engine = create_engine( 38 | settings.ALT_POSTGRES_URL.get_secret_value(), 39 | connect_args={"connect_timeout": settings.POSTGRES_TIMEOUT_SECS * 1_000}, 40 | ) 41 | logger.info(postgres_engine) 42 | with postgres_engine.connect() as connection: 43 | df.to_sql('k3l_top_casters', con=connection, if_exists='append', index=False) 44 | 45 | # cast_db_utils.insert_dune_table(settings.DUNE_API_KEY, 'openrank', 'top_caster', df) 46 | 47 | logger.info('top casters data updated to DB') 48 | 49 | # end while loop 50 | 51 | 52 | if __name__ == "__main__": 53 | load_dotenv() 54 | print(settings) 55 | 56 | # parser = argparse.ArgumentParser(description='Fetch top casters, persist the dataframe to db') 57 | # 58 | # parser.add_argument('-u', '--user') 59 | # parser.add_argument('-p', '--password') 60 | # parser.add_argument('-e', '--endpoint') 61 | # 62 | # args = parser.parse_args() 63 | 64 | logger.info('hello hello') 65 | main() 66 | -------------------------------------------------------------------------------- /pipeline/channels/Bot_Fids.csv: -------------------------------------------------------------------------------- 1 | FID,Username 2 | 262301,roundsbot 3 | 862591,cura-bot 4 | 
864314,curabot 5 | 396644,hyperbot 6 | 861203,modbot 7 | 368422,automod 8 | 364927,paybot -------------------------------------------------------------------------------- /pipeline/channels/Seed_Fids.csv: -------------------------------------------------------------------------------- 1 | channel id,Seed Peers FIDs 2 | superrare,"9480,9480, 190045, 12299, 346769, 374498, 513681, 270678, 368422,12299, 190045, 270678, 346769, 374498, 513681, 9480" 3 | build,"8446, 195255, 221216, 6730, 9856, 4461, 1214, 9816, 15732, 399485, 16085, 14351, 99" 4 | memes,"576, 3, 2, 3621, 239, 457, 347, 557, 4407, 1287, 1325" 5 | dev,"191, 6841" 6 | louder,"238853,15696, 206, 403020, 395131, 508334, 477292" 7 | wildcardclub,"4914, 7791" 8 | -------------------------------------------------------------------------------- /pipeline/channels/Trending_Channels.csv: -------------------------------------------------------------------------------- 1 | ChannelID 2 | zora 3 | farcaster 4 | itookaphoto 5 | memes 6 | replyguys 7 | farville 8 | degen 9 | nature 10 | sense 11 | food 12 | jobs 13 | lifeisgood 14 | anime-manga 15 | football 16 | higher 17 | dickbutt 18 | art 19 | talent 20 | brypto 21 | dickbutt 22 | six 23 | vibely 24 | screens 25 | nba -------------------------------------------------------------------------------- /pipeline/channels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/channels/__init__.py -------------------------------------------------------------------------------- /pipeline/channels/main_metrics.py: -------------------------------------------------------------------------------- 1 | # standard dependencies 2 | import sys 3 | import argparse 4 | import datetime 5 | 6 | # local dependencies 7 | from config import settings 8 | from . 
import channel_db_utils 9 | from .channel_db_utils import Metric 10 | 11 | # 3rd party dependencies 12 | from dotenv import load_dotenv 13 | from loguru import logger 14 | 15 | # Configure logger 16 | logger.remove() 17 | level_per_module = { 18 | "": settings.LOG_LEVEL, 19 | "silentlib": False 20 | } 21 | logger.add(sys.stdout, 22 | colorize=True, 23 | format=settings.LOGURU_FORMAT, 24 | filter=level_per_module, 25 | level=0) 26 | 27 | load_dotenv() 28 | 29 | def main(): 30 | # Metrics only available in Eigen 8 31 | pg_dsn = settings.ALT_POSTGRES_DSN.get_secret_value() 32 | sql_timeout_ms = 120_000 33 | channel_db_utils.upsert_weekly_metrics(logger, pg_dsn, sql_timeout_ms, Metric.WEEKLY_NUM_CASTS) 34 | channel_db_utils.upsert_weekly_metrics(logger, pg_dsn, sql_timeout_ms, Metric.WEEKLY_UNIQUE_CASTERS) 35 | 36 | if __name__ == "__main__": 37 | 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument( 40 | "--run", 41 | action="store_true", 42 | help="dummy arg to prevent accidental execution", 43 | required=True 44 | ) 45 | parser.add_argument( 46 | "--dry-run", 47 | help="indicate dry-run mode", 48 | action="store_true" 49 | ) 50 | args = parser.parse_args() 51 | print(args) 52 | logger.info(settings) 53 | 54 | if args.dry_run: 55 | settings.IS_TEST = True 56 | 57 | main() -------------------------------------------------------------------------------- /pipeline/channels/openrank_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import subprocess 3 | import os 4 | import tempfile 5 | 6 | from config import settings 7 | 8 | from loguru import logger 9 | 10 | def download_results(req_id: str, toml_file: Path, out_dir:Path, out_file: Path): 11 | new_env = os.environ.copy() 12 | new_env['SECRET_KEY'] = settings.OPENRANK_REQ_SECRET_KEY.get_secret_value() 13 | get_cmd = subprocess.run( 14 | ["openrank-sdk", "get-results", str(req_id), str(toml_file), str(out_file)], 15 | stdout=subprocess.DEVNULL, 16 | stderr=subprocess.PIPE, 17 | text=True, 18 | timeout=settings.OPENRANK_TIMEOUT_SECS, 19 | env=new_env, 20 | check=True, 21 | ) 22 | if get_cmd.returncode != 0: 23 | logger.error(f"OpenRank get-results failed for {req_id}: {get_cmd.stderr}") 24 | raise Exception("OpenRank get-results failed") 25 | logger.info(f"OpenRank get-results for {req_id} downloaded to: {out_file}") 26 | 27 | def update_and_compute(lt_file: Path, pt_file: Path, toml_file: Path) -> str: 28 | new_env = os.environ.copy() 29 | new_env['SECRET_KEY'] = settings.OPENRANK_REQ_SECRET_KEY.get_secret_value() 30 | 31 | lt_cmd = subprocess.run( 32 | ["openrank-sdk", "trust-update", str(lt_file), str(toml_file)], 33 | stdout=subprocess.PIPE, 34 | stderr=subprocess.STDOUT, 35 | text=True, 36 | # check=True, # we don't want to throw error until we have a chance to print the output 37 | timeout=settings.OPENRANK_TIMEOUT_SECS, 38 | env=new_env, 39 | ) 40 | logger.info(f"OpenRank trust-update output: {lt_cmd}") 41 | if lt_cmd.returncode != 0: 42 | logger.error(f"OpenRank trust-update failed: {lt_cmd.stdout}") 43 | raise Exception("OpenRank trust-update failed") 44 | pt_cmd = subprocess.run( 45 | ["openrank-sdk", "seed-update", str(pt_file), str(toml_file)], 46 | stdout=subprocess.PIPE, 47 | stderr=subprocess.STDOUT, 48 | text=True, 49 | timeout=settings.OPENRANK_TIMEOUT_SECS, 50 | env=new_env, 51 | ) 52 | logger.info(f"OpenRank seed-update output: {pt_cmd}") 53 | if pt_cmd.returncode != 0: 54 | logger.error(f"OpenRank seed-update failed: {pt_cmd.stdout}") 55 | 
raise Exception("OpenRank seed-update failed") 56 | compute_cmd = subprocess.run( 57 | ["openrank-sdk", "compute-request", str(toml_file)], 58 | stdout=subprocess.PIPE, 59 | stderr=subprocess.STDOUT, 60 | text=True, 61 | timeout=settings.OPENRANK_TIMEOUT_SECS, 62 | env=new_env, 63 | ) 64 | logger.info(f"OpenRank compute output: {compute_cmd}") 65 | if compute_cmd.returncode != 0: 66 | logger.error(f"OpenRank compute failed: {compute_cmd.stdout}") 67 | raise Exception("OpenRank compute failed") 68 | req_id = compute_cmd.stdout.strip() 69 | logger.info(f"OpenRank request id: {req_id}") 70 | return req_id 71 | -------------------------------------------------------------------------------- /pipeline/dag_utils/clear_task_instance.py: -------------------------------------------------------------------------------- 1 | from airflow import settings 2 | from airflow.models import DagRun, TaskInstance 3 | from airflow.utils.state import State 4 | 5 | # Define your variables 6 | dag_id = "gen_personal_graph_replica_v1" 7 | task_id = "process_channel_chunk" 8 | run_id = "manual__2024-07-22T06:46:15.813325+00:00" 9 | map_index_start = 908 # 908 430 10 | map_index_end = 939 # 939 907 11 | 12 | # Get the session 13 | session = settings.Session() 14 | 15 | # Query the DagRun 16 | dag_run = session.query(DagRun).filter(DagRun.dag_id == dag_id, DagRun.run_id == run_id).one() 17 | 18 | # Loop through the range of map indexes and clear each task instance 19 | for map_index in range(map_index_start, map_index_end + 1): 20 | try: 21 | # Query the TaskInstance 22 | task_instance = session.query(TaskInstance).filter( 23 | TaskInstance.dag_id == dag_id, 24 | TaskInstance.task_id == task_id, 25 | TaskInstance.run_id == run_id, 26 | TaskInstance.map_index == map_index 27 | ).one() 28 | 29 | # Clear the task instance 30 | task_instance.set_state(State.SUCCESS, session=session) 31 | print(f"Cleared task {task_id} with map index {map_index} for DAG {dag_id} and run ID {run_id}") 32 | except Exception as e: 33 | print(f"Could not clear task {task_id} with map index {map_index}: {e}") 34 | 35 | # Commit the changes 36 | session.commit() 37 | print(f"Cleared tasks {task_id} with map indexes from {map_index_start} to {map_index_end} for DAG {dag_id} and run ID {run_id}") 38 | 39 | -------------------------------------------------------------------------------- /pipeline/dag_utils/combine_csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import re 4 | 5 | # Specify the directory containing the CSV files 6 | directory = 'backup/' 7 | 8 | # Specify the output file 9 | output_file = 'combined_dataset.csv' 10 | 11 | # Function to extract numeric offset from filename 12 | def extract_offset(filename): 13 | match = re.search(r'offset_(\d+)', filename) 14 | return int(match.group(1)) if match else 0 15 | 16 | # Get list of files sorted by numeric offset 17 | files = sorted( 18 | (f for f in os.listdir(directory) if f.startswith('karma3-labs.dataset_k3l_cast_localtrust_offset_') and f.endswith('.csv')), 19 | key=extract_offset 20 | ) 21 | 22 | # Initialize a flag to handle headers 23 | header_saved = False 24 | 25 | # Open the output file in write mode 26 | with open(output_file, 'w', newline='') as outfile: 27 | csv_writer = csv.writer(outfile) 28 | 29 | # Iterate over each sorted file 30 | for filename in files: 31 | file_path = os.path.join(directory, filename) 32 | 33 | # Open each CSV file in read mode 34 | with open(file_path, 'r') as infile: 35 | 
csv_reader = csv.reader(infile) 36 | 37 | # Iterate over the rows in the input file 38 | for i, row in enumerate(csv_reader): 39 | # Write the header only once 40 | if i == 0: 41 | if not header_saved: 42 | csv_writer.writerow(row) 43 | header_saved = True 44 | else: 45 | # Skip empty rows 46 | if any(cell.strip() for cell in row): 47 | csv_writer.writerow(row) 48 | 49 | print(f'Combined CSV file saved as {output_file}') -------------------------------------------------------------------------------- /pipeline/dag_utils/dune_backup.py: -------------------------------------------------------------------------------- 1 | import urllib3 2 | from concurrent.futures import ThreadPoolExecutor, as_completed 3 | 4 | import time 5 | import random 6 | 7 | http = urllib3.PoolManager() 8 | 9 | def download_csv(limit: int, offset: int, table_name: str): 10 | """ 11 | Download one CSV chunk of Dune query results and write it to a local backup file. 12 | 13 | Args: 14 | limit (int): Maximum number of rows to fetch. offset (int): Row offset into the query results. table_name (str): Prefix for the output filename. 15 | 16 | Returns: 17 | None: the chunk is written to backup/{table_name}_offset_{offset}_limit_{limit}.csv. 18 | 19 | Example: 20 | download_csv(30000, 0, 'karma3-labs.dataset_k3l_cast_localtrust') 21 | """ 22 | print(f'limit={limit}, offset={offset}') 23 | jitter = random.uniform(0.01, 1) 24 | time.sleep(jitter) 25 | 26 | response = http.request( 27 | 'GET', 28 | f'https://api.dune.com/api/v1/query/3832819/results/csv?limit={limit}&offset={offset}', 29 | headers={ 30 | 'Accept': 'text/csv', 31 | 'Content-Type':'text/csv', 32 | 'X-DUNE-API-KEY': '7QYqrqNvGVJJuwMybzxfh1sbR8qXFbDI', 33 | }, 34 | preload_content=False 35 | ) 36 | if response.status != 200: 37 | raise Exception(f"Failed to download CSV: {response.data.decode('utf-8')}") 38 | 39 | # data = response.data.decode('utf-8') 40 | # print(data) 41 | filename = f'backup/{table_name}_offset_{offset}_limit_{limit}.csv' 42 | with open(filename, 'wb') as out_file: 43 | # print(data) 44 | # data = response.read() # a `bytes` object 45 | out_file.write(response.data) 46 | 47 | # shutil.copyfileobj(response, out_file) 48 | # out_file.write(response) 49 | print(f'wrote {filename}') 50 | 51 | 52 | 53 | limit = 30000 54 | # next = limit 55 | offset = 0 56 | 57 | start = 0 58 | stop = 382500000 59 | step = limit 60 | incremental_array = list(range(start, stop + step, step)) 61 | 62 | # print(incremental_array[:100]) 63 | num_workers = 25 64 | table_name = "karma3-labs.dataset_k3l_cast_localtrust" 65 | # Use ThreadPoolExecutor to make parallel HTTP requests 66 | with ThreadPoolExecutor(max_workers=num_workers) as executor: 67 | future_to_value = {executor.submit(download_csv, limit, value, table_name): value for value in incremental_array} 68 | 69 | for future in as_completed(future_to_value): 70 | value = future_to_value[future] 71 | try: 72 | future.result() 73 | except Exception as exc: 74 | print(f'Value {value} generated an exception: {exc}') 75 | 76 | print("All requests completed.") -------------------------------------------------------------------------------- /pipeline/dags/archived/dag_automod.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.bash_operator import BashOperator 3 | from airflow.models import Variable 4 | from hooks.discord import send_alert_discord 5 | from hooks.pagerduty import send_alert_pagerduty 6 | from datetime import datetime, timedelta 7 | 8 | 9 | api_key = Variable.get("API_KEY", default_var="api_key") 10 | db_endpoint = Variable.get('DB_ENDPOINT', default_var="test") 11 | db_user = Variable.get('DB_USER', default_var="test") 12 | 
db_password = Variable.get('DB_PASSWORD', default_var="test") 13 | 14 | 15 | default_args = { 16 | 'owner': 'coder2j', 17 | 'retries': 1, 18 | 'retry_delay': timedelta(minutes=5) 19 | } 20 | 21 | with DAG( 22 | 'extract_automod_api_to_db', 23 | default_args=default_args, 24 | description='Fetch data from AUTOMOD API and load into DB daily', 25 | # schedule_interval=timedelta(days=1), 26 | schedule_interval=None, 27 | start_date=datetime(2024, 9, 4), 28 | is_paused_upon_creation=True, 29 | max_active_runs=1, 30 | catchup=False, 31 | ) as dag: 32 | fetch_data_from_automod = BashOperator( 33 | task_id='fetch_automod_data_from_api', 34 | bash_command=f"cd /pipeline/extractors ; python3 automod_extractor.py {api_key} { db_user } { db_password } { db_endpoint }" 35 | ) 36 | 37 | fetch_data_from_automod 38 | -------------------------------------------------------------------------------- /pipeline/dags/archived/dag_backup_sandbox_db.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.contrib.operators.ssh_operator import SSHOperator 6 | from airflow.contrib.hooks.ssh_hook import SSHHook 7 | from airflow.operators.bash import BashOperator 8 | 9 | from hooks.discord import send_alert_discord 10 | from hooks.pagerduty import send_alert_pagerduty 11 | 12 | default_args = { 13 | 'owner': 'coder2j', 14 | 'retries': 5, 15 | 'retry_delay': timedelta(minutes=2), 16 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 17 | } 18 | 19 | sandbox_db_sync_path = Variable.get("sandbox_db_sync_path") 20 | dev_sandbox_db_sync_path = Variable.get("dev_sandbox_db_sync_path") 21 | 22 | with DAG( 23 | dag_id='dag_backup_sandbox_db_v0', 24 | default_args=default_args, 25 | description='sync the db table of the sandboxed read replica', 26 | start_date=datetime(2024, 8, 10, 18), 27 | # schedule_interval='0 0 * * *', # backup everyday 28 | schedule_interval=None, # backup everyday 29 | catchup=False, 30 | ) as dag: 31 | 32 | 33 | # ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None) 34 | 35 | # run_sandbox_backup = SSHOperator( 36 | # task_id="run_sandbox_backup_v0", 37 | # command=f"cd {sandbox_db_sync_path}; ./run-backup.sh ", 38 | # ssh_hook=ssh_hook, 39 | # dag=dag) 40 | 41 | run_sandbox_backup = BashOperator( 42 | task_id='run_sandbox_backup', 43 | bash_command="cd /pipeline && ./run_sandbox_backup.sh " 44 | ) 45 | 46 | run_sandbox_backup 47 | -------------------------------------------------------------------------------- /pipeline/dags/archived/dag_copy_graph_files_to_sandbox_dev_v1.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.operators.bash import BashOperator 6 | from airflow.providers.ssh.operators.ssh import SSHHook 7 | from airflow.providers.ssh.operators.ssh import SSHOperator 8 | from airflow.sensors.external_task import ExternalTaskSensor 9 | 10 | from hooks.discord import send_alert_discord 11 | from hooks.pagerduty import send_alert_pagerduty 12 | 13 | default_args = { 14 | "owner": "coder2j", 15 | "retries": 5, 16 | "retry_delay": timedelta(minutes=2), 17 | "on_failure_callback": [send_alert_discord, send_alert_pagerduty], 18 | } 19 | 20 | dev_sandbox_pipeline_path = Variable.get("dev_sandbox_pipeline_path") 21 | 
data_backup_s3_bucket = Variable.get("data_backup_s3_bucket") 22 | 23 | with DAG( 24 | dag_id="copy_graph_files_to_sandbox_dev_v2", 25 | default_args=default_args, 26 | description="re-generate graph for farcaster-graph API server. copy re-generated all graph files to dev sandbox from backup s3", 27 | start_date=datetime(2024, 7, 9, 18), 28 | # schedule_interval="0 0 * * *", 29 | schedule_interval=None, 30 | is_paused_upon_creation=True, 31 | max_active_runs=1, 32 | catchup=False, 33 | ) as dag: 34 | 35 | ssh_hook = SSHHook(ssh_conn_id='sandbox_staging', keepalive_interval=60, cmd_timeout=None) 36 | 37 | download_pqt_file = SSHOperator( 38 | task_id="download_pqt_file_v1", 39 | command=f"cd {dev_sandbox_pipeline_path}; ./run_graph_pipeline.sh -o /data/serve_files -s {data_backup_s3_bucket} ", 40 | ssh_hook=ssh_hook, 41 | dag=dag, 42 | ) 43 | 44 | download_pqt_file -------------------------------------------------------------------------------- /pipeline/dags/archived/dag_degen_tips_processing.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.bash import BashOperator 4 | from airflow.operators.python import PythonOperator 5 | from hooks.discord import send_alert_discord 6 | from hooks.pagerduty import send_alert_pagerduty 7 | 8 | default_args = { 9 | 'owner': 'coder2j', 10 | 'retries': 5, 11 | 'retry_delay': timedelta(minutes=2), 12 | # 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 13 | } 14 | 15 | with DAG( 16 | dag_id='dag_degen_tips_processing_v0', 17 | default_args=default_args, 18 | description='Process DEGEN tips from casts', 19 | start_date=datetime(2024, 7, 9, 18), 20 | # schedule_interval='*/10 * * * *', # Run every 10 minutes 21 | schedule_interval=None, 22 | catchup=False, 23 | ) as dag: 24 | task_update_degen_tips = BashOperator( 25 | task_id='update_degen_tips_v0', 26 | bash_command='''cd /pipeline/ && ./run_create_degen_db_functions.sh -v .venv -t extract 27 | ''' 28 | ) 29 | 30 | task_analyze_degen_tips = BashOperator( 31 | task_id='analyze_degen_tips_v0', 32 | bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . " 33 | ANALYZE k3l_degen_tips; 34 | ANALYZE k3l_cast_action;" 35 | ''' 36 | ) 37 | 38 | # Set up the task dependencies 39 | task_update_degen_tips >> task_analyze_degen_tips -------------------------------------------------------------------------------- /pipeline/dags/archived/dag_gen_personal_graph_replica_v0.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.contrib.operators.ssh_operator import SSHOperator 6 | from airflow.contrib.hooks.ssh_hook import SSHHook 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | default_args = { 12 | 'owner': 'coder2j', 13 | 'retries': 5, 14 | 'retry_delay': timedelta(minutes=2), 15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 16 | } 17 | 18 | with DAG( 19 | dag_id='gen_personal_graph_replica_v0', 20 | default_args=default_args, 21 | description='Every hour, try running personal graph script on eigen7 replica. 
Script has internal check for 36 hours', 22 | start_date=datetime(2024, 7, 9, 18), 23 | # schedule_interval='0 * * * *', 24 | schedule_interval=None, 25 | catchup=False, 26 | ) as dag: 27 | ssh_hook = SSHHook(ssh_conn_id='eigen7', keepalive_interval=60, cmd_timeout=None) 28 | 29 | eigen7_copy_localtrust_csv_files = SSHOperator( 30 | task_id="eigen7_gen_personal_graph", 31 | command=f"cd ~/farcaster-graph/pipeline; ./run_personal_graph_pipeline.sh -i ~/serve_files/lt_l1rep6rec3m12enhancedConnections_fid.csv -o ~/wip_files/ -w . -v .venv -s k3l-openrank-farcaster -l /var/log/farcaster-graph/ ", 32 | ssh_hook=ssh_hook, 33 | dag=dag) 34 | 35 | eigen7_copy_localtrust_csv_files 36 | -------------------------------------------------------------------------------- /pipeline/dags/archived/dag_insert_degen_ranking_v0.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.bash import BashOperator 4 | from airflow.operators.python import PythonOperator 5 | from hooks.discord import send_alert_discord 6 | from hooks.pagerduty import send_alert_pagerduty 7 | 8 | default_args = { 9 | 'owner': 'coder2j', 10 | 'retries': 5, 11 | 'retry_delay': timedelta(minutes=2), 12 | # 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 13 | } 14 | 15 | with DAG( 16 | dag_id='dag_degen_insert_ranking_v0', 17 | default_args=default_args, 18 | description='Process DEGEN tips from casts', 19 | start_date=datetime(2024, 7, 9, 18), 20 | # schedule_interval='10 */6 * * *', 21 | schedule_interval=None, 22 | catchup=False, 23 | ) as dag: 24 | 25 | task_update_degen_tips = BashOperator( 26 | task_id='update_degen_tips_v0', 27 | bash_command='''cd /pipeline/ && ./run_create_degen_db_functions.sh -v .venv -t insert_scores 28 | ''' 29 | ) 30 | 31 | task_analyze_degen_tips = BashOperator( 32 | task_id='analyze_degen_tips_v0', 33 | bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . " 34 | ANALYZE k3l_degen_tips; 35 | ANALYZE k3l_cast_action;" 36 | ''' 37 | ) 38 | 39 | # Set up the task dependencies 40 | task_update_degen_tips >> task_analyze_degen_tips -------------------------------------------------------------------------------- /pipeline/dags/archived/dag_run_frame_pipeline_v0.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | 6 | from hooks.discord import send_alert_discord 7 | from hooks.pagerduty import send_alert_pagerduty 8 | 9 | default_args = { 10 | 'owner': 'coder2j', 11 | 'retries': 5, 12 | 'retry_delay': timedelta(minutes=2), 13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 14 | } 15 | 16 | with DAG( 17 | dag_id='extract_frame_url_v0', 18 | default_args=default_args, 19 | description='Extract urls from cast embeds for frames and refresh pg statistics', 20 | start_date=datetime(2024, 7, 9, 18), 21 | # schedule_interval='1-59/20 * * * *', 22 | # Decommission Frames ranking due to lack of usage 23 | # ... and relevance with the introduction of Frames V2 by Warpcast 24 | # schedule_interval=timedelta(minutes=20), 25 | schedule_interval=None, 26 | is_paused_upon_creation=True, 27 | max_active_runs=1, 28 | catchup=False, 29 | ) as dag: 30 | task1 = BashOperator( 31 | task_id='run_urlextract_pipeline', 32 | bash_command='cd /pipeline/ && ./run_urlextract_pipeline.sh -w . 
' 33 | ) 34 | 35 | task2 = BashOperator( 36 | task_id='run_frame_scraper', 37 | bash_command='cd /pipeline/ && ./run_frame_scraper.sh -v ./.venv/ ' 38 | ) 39 | 40 | task3 = BashOperator( 41 | task_id='analyze_url_labels_and_mapping', 42 | bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . " 43 | ANALYZE k3l_url_labels; ANALYZE k3l_cast_embed_url_mapping;" 44 | ''' 45 | ) 46 | 47 | task4 = BashOperator( 48 | task_id='refresh_k3l_frame_interaction', 49 | bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . " 50 | REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_recent_frame_interaction;" 51 | ''' 52 | ) 53 | 54 | # task5 = BashOperator( 55 | # task_id='vacuum_k3l_frame_interaction', 56 | # bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . " 57 | # VACUUM ANALYZE k3l_recent_frame_interaction;" 58 | # ''' 59 | # ) 60 | 61 | # task1 >> task2 >> task3 >> task4 >> task5 62 | task1 >> task2 >> task3 >> task4 63 | 64 | -------------------------------------------------------------------------------- /pipeline/dags/archived/extractors/dag_warpcast_channel_followers.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | 6 | from hooks.discord import send_alert_discord 7 | from hooks.pagerduty import send_alert_pagerduty 8 | 9 | default_args = { 10 | "owner": "karma3labs", 11 | "retries": 1, 12 | "retry_delay": timedelta(minutes=5), 13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 14 | } 15 | 16 | with DAG( 17 | "extract_warpcast_followers", 18 | default_args=default_args, 19 | description="Fetch channel followers from WARPCAST API and load into DB daily", 20 | schedule_interval=timedelta(days=1), 21 | start_date=datetime(2024, 8, 1), 22 | is_paused_upon_creation=True, 23 | max_active_runs=1, 24 | catchup=False, 25 | ) as dag: 26 | 27 | prep_task = BashOperator( 28 | task_id='prep_warpcast_followers', 29 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t prep" 30 | " -w . -v .venv -j followers", 31 | dag=dag 32 | ) 33 | 34 | fetch_task = BashOperator( 35 | task_id='extract_channel_followers', 36 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t fetch" 37 | " -w . -v .venv -c channels/Top_Channels.csv -s top -j followers", 38 | dag=dag 39 | ) 40 | 41 | cleanup_task = BashOperator( 42 | task_id='cleanup_warpcast_followers', 43 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t cleanup" 44 | " -w . 
-v .venv -j followers", 45 | dag=dag 46 | ) 47 | 48 | prep_task >> fetch_task >> cleanup_task -------------------------------------------------------------------------------- /pipeline/dags/archived/extractors/dag_warpcast_channel_members.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | 6 | from hooks.discord import send_alert_discord 7 | from hooks.pagerduty import send_alert_pagerduty 8 | 9 | default_args = { 10 | "owner": "karma3labs", 11 | "retries": 1, 12 | "retry_delay": timedelta(minutes=5), 13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 14 | } 15 | 16 | with DAG( 17 | "extract_warpcast_members", 18 | default_args=default_args, 19 | description="Fetch channel members from WARPCAST API and load into DB daily", 20 | schedule_interval=timedelta(hours=1), 21 | start_date=datetime(2024, 8, 1), 22 | is_paused_upon_creation=True, 23 | max_active_runs=1, 24 | catchup=False, 25 | ) as dag: 26 | 27 | prep_task = BashOperator( 28 | task_id='prep_warpcast_members', 29 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t prep" 30 | " -w . -v .venv -j members", 31 | dag=dag 32 | ) 33 | 34 | fetch_task = BashOperator( 35 | task_id='fetch_warpcast_members', 36 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t fetch" 37 | " -w . -v .venv -c channels/Top_Channels.csv -s top -j members", 38 | dag=dag 39 | ) 40 | 41 | cleanup_task = BashOperator( 42 | task_id='cleanup_warpcast_members', 43 | bash_command="cd /pipeline; extractors/extract_channel_fids.sh -t cleanup" 44 | " -w . -v .venv -j members", 45 | dag=dag 46 | ) 47 | 48 | prep_task >> fetch_task >> cleanup_task -------------------------------------------------------------------------------- /pipeline/dags/archived/extractors/dag_warpcast_channels.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.bash_operator import BashOperator 3 | from airflow.models import Variable 4 | from hooks.discord import send_alert_discord 5 | from hooks.pagerduty import send_alert_pagerduty 6 | from datetime import datetime, timedelta 7 | 8 | db_endpoint = Variable.get('DB_ENDPOINT', default_var="test") 9 | db_user = Variable.get('DB_USER', default_var="test") 10 | db_password = Variable.get('DB_PASSWORD', default_var="test") 11 | 12 | 13 | default_args = { 14 | 'owner': 'coder2j', 15 | 'retries': 1, 16 | 'retry_delay': timedelta(minutes=5) 17 | } 18 | 19 | with DAG( 20 | 'extract_warpcast_channels', 21 | default_args=default_args, 22 | description='Fetch channels metadata from WARPCAST API and load into DB daily', 23 | schedule_interval=timedelta(days=1), 24 | start_date=datetime(2024, 8, 19), 25 | is_paused_upon_creation=True, 26 | max_active_runs=1, 27 | catchup=False, 28 | ) as dag: 29 | fetch_data_from_warpcast = BashOperator( 30 | task_id='fetch_warpcast_data_from_api', 31 | bash_command="cd /pipeline; extractors/extract_channel_data.sh" 32 | " -w . 
-v .venv ", 33 | dag=dag 34 | ) 35 | 36 | fetch_data_from_warpcast 37 | -------------------------------------------------------------------------------- /pipeline/dags/archived/sandbox/dag_sync_sandbox_casts.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.contrib.operators.ssh_operator import SSHOperator 6 | from airflow.contrib.hooks.ssh_hook import SSHHook 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | default_args = { 12 | 'owner': 'coder2j', 13 | 'retries': 5, 14 | 'retry_delay': timedelta(minutes=2), 15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 16 | } 17 | 18 | sandbox_db_sync_path = Variable.get("sandbox_db_sync_path") 19 | dev_sandbox_db_sync_path = Variable.get("dev_sandbox_db_sync_path") 20 | 21 | with DAG( 22 | dag_id='sync_sandbox_db_casts', 23 | default_args=default_args, 24 | description='sync cast actions and parent casts to the sandbox', 25 | start_date=datetime(2024, 7, 10, 18), 26 | # schedule_interval='*/10 * * * *', 27 | # schedule_interval=timedelta(minutes=5), 28 | schedule=None, 29 | is_paused_upon_creation=True, 30 | max_active_runs=1, 31 | catchup=False, 32 | ) as dag: 33 | ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None) 34 | 35 | run_append = SSHOperator( 36 | task_id="run_append_v1", 37 | command=f"cd {sandbox_db_sync_path}; ./1-run-append_v1.sh ", 38 | ssh_hook=ssh_hook, 39 | dag=dag) 40 | 41 | run_remove = SSHOperator( 42 | task_id="run_remove_v0", 43 | command=f"cd {sandbox_db_sync_path}; ./2-run-remove.sh ", 44 | ssh_hook=ssh_hook, 45 | dag=dag) 46 | 47 | run_append >> run_remove 48 | 49 | -------------------------------------------------------------------------------- /pipeline/dags/archived/sandbox/dag_sync_sandbox_channel_fids.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.contrib.operators.ssh_operator import SSHOperator 6 | from airflow.contrib.hooks.ssh_hook import SSHHook 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | default_args = { 12 | 'owner': 'coder2j', 13 | 'retries': 5, 14 | 'retry_delay': timedelta(minutes=2), 15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 16 | } 17 | 18 | sandbox_db_sync_path = Variable.get("sandbox_db_sync_path") 19 | 20 | with DAG( 21 | dag_id='sync_sandbox_channel_fids', 22 | default_args=default_args, 23 | description='sync globaltrust to the sandbox', 24 | start_date=datetime(2024, 7, 10, 18), 25 | # schedule_interval='*/10 * * * *', 26 | schedule=None, 27 | is_paused_upon_creation=True, 28 | max_active_runs=1, 29 | catchup=False, 30 | ) as dag: 31 | ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None) 32 | 33 | run_append = SSHOperator( 34 | task_id="run_append_v1", 35 | command=f"cd {sandbox_db_sync_path}; ./1-run-append_v1.sh -c ", 36 | ssh_hook=ssh_hook, 37 | dag=dag) 38 | 39 | run_refresh = SSHOperator( 40 | task_id="run_refresh_v0", 41 | command=f"cd {sandbox_db_sync_path}; ./4-run-refresh.sh -c ", 42 | ssh_hook=ssh_hook, 43 | dag=dag) 44 | 45 | run_append >> run_refresh 46 | 47 | 
-------------------------------------------------------------------------------- /pipeline/dags/archived/sandbox/dag_sync_sandbox_db_dev.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.contrib.operators.ssh_operator import SSHOperator 6 | from airflow.contrib.hooks.ssh_hook import SSHHook 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | default_args = { 12 | 'owner': 'coder2j', 13 | 'retries': 5, 14 | 'retry_delay': timedelta(minutes=2), 15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 16 | } 17 | 18 | dev_sandbox_db_sync_path = Variable.get("dev_sandbox_db_sync_path") 19 | 20 | with DAG( 21 | dag_id='dag_sync_sandbox_db_dev_v0', 22 | default_args=default_args, 23 | description='sync the db table of the sandboxed read replica', 24 | start_date=datetime(2024, 7, 10, 18), 25 | # schedule_interval='*/10 * * * *', 26 | schedule_interval=None, 27 | catchup=False, 28 | ) as dag: 29 | ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None) 30 | 31 | run_append_dev = SSHOperator( 32 | task_id="run_append_dev_v0", 33 | command=f"cd {dev_sandbox_db_sync_path}; ./1-run-append.sh -d 5 ", 34 | ssh_hook=ssh_hook, 35 | dag=dag) 36 | 37 | run_remove_dev = SSHOperator( 38 | task_id="run_remove_dev_v0", 39 | command=f"cd {dev_sandbox_db_sync_path}; ./2-run-remove.sh ", 40 | ssh_hook=ssh_hook, 41 | dag=dag) 42 | 43 | run_append_dev >> run_remove_dev 44 | 45 | -------------------------------------------------------------------------------- /pipeline/dags/archived/sandbox/dag_sync_sandbox_globaltrust.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.contrib.operators.ssh_operator import SSHOperator 6 | from airflow.contrib.hooks.ssh_hook import SSHHook 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | default_args = { 12 | 'owner': 'coder2j', 13 | 'retries': 5, 14 | 'retry_delay': timedelta(minutes=2), 15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 16 | } 17 | 18 | sandbox_db_sync_path = Variable.get("sandbox_db_sync_path") 19 | 20 | with DAG( 21 | dag_id='sync_sandbox_globaltrust', 22 | default_args=default_args, 23 | description='sync globaltrust to the sandbox', 24 | start_date=datetime(2024, 7, 10, 18), 25 | # schedule_interval='*/10 * * * *', 26 | schedule=None, 27 | is_paused_upon_creation=True, 28 | max_active_runs=1, 29 | catchup=False, 30 | ) as dag: 31 | ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None) 32 | 33 | run_append = SSHOperator( 34 | task_id="run_append_v1", 35 | command=f"cd {sandbox_db_sync_path}; ./1-run-append_v1.sh -g ", 36 | ssh_hook=ssh_hook, 37 | dag=dag) 38 | 39 | run_refresh = SSHOperator( 40 | task_id="run_refresh_v0", 41 | command=f"cd {sandbox_db_sync_path}; ./4-run-refresh.sh -g ", 42 | ssh_hook=ssh_hook, 43 | dag=dag) 44 | 45 | run_append >> run_refresh 46 | 47 | -------------------------------------------------------------------------------- /pipeline/dags/archived/sandbox/dag_sync_sandbox_labels.py: -------------------------------------------------------------------------------- 1 | from datetime import 
datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.contrib.operators.ssh_operator import SSHOperator 6 | from airflow.contrib.hooks.ssh_hook import SSHHook 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | default_args = { 12 | 'owner': 'coder2j', 13 | 'retries': 5, 14 | 'retry_delay': timedelta(minutes=2), 15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 16 | } 17 | 18 | sandbox_db_sync_path = Variable.get("sandbox_db_sync_path") 19 | dev_sandbox_db_sync_path = Variable.get("dev_sandbox_db_sync_path") 20 | 21 | with DAG( 22 | dag_id='sync_sandbox_db_labels', 23 | default_args=default_args, 24 | description='sync labels to the sandbox', 25 | start_date=datetime(2024, 7, 10, 18), 26 | # schedule_interval='*/10 * * * *', 27 | schedule=None, 28 | is_paused_upon_creation=True, 29 | max_active_runs=1, 30 | catchup=False, 31 | ) as dag: 32 | ssh_hook = SSHHook(ssh_conn_id='eigen2', keepalive_interval=60, cmd_timeout=None) 33 | 34 | run_append = SSHOperator( 35 | task_id="run_append_v1", 36 | command=f"cd {sandbox_db_sync_path}; ./1-run-append_v1.sh -l", 37 | ssh_hook=ssh_hook, 38 | dag=dag) 39 | 40 | run_append 41 | 42 | -------------------------------------------------------------------------------- /pipeline/dags/cura/dag_direct_cast_join_requests.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.operators.bash import BashOperator 6 | from airflow.providers.ssh.operators.ssh import SSHHook 7 | from airflow.providers.ssh.operators.ssh import SSHOperator 8 | from airflow.decorators import task_group 9 | 10 | from hooks.discord import send_alert_discord 11 | from hooks.pagerduty import send_alert_pagerduty 12 | 13 | default_args = { 14 | "owner": "coder2j", 15 | "retries": 5, 16 | "retry_delay": timedelta(minutes=2), 17 | "on_failure_callback": [send_alert_discord, send_alert_pagerduty], 18 | } 19 | 20 | HOST_REPO_URL='cura-bot-2' 21 | 22 | with DAG( 23 | dag_id="cura_direct_cast_join_requests", 24 | default_args=default_args, 25 | description="Direct cast join requests from curabot", 26 | start_date=datetime(2024, 11, 7), 27 | schedule_interval='0 * * * *', 28 | is_paused_upon_creation=True, 29 | max_active_runs=1, 30 | catchup=False, 31 | ) as dag: 32 | 33 | ssh_hook = SSHHook(ssh_conn_id='eigen1', keepalive_interval=60, cmd_timeout=None) 34 | 35 | eigen1_install_dependencies = SSHOperator( 36 | task_id="cura_eigen1_install_deps", 37 | command=f"cd {HOST_REPO_URL} && git reset --hard HEAD && git pull origin main && pnpm i", 38 | ssh_hook=ssh_hook, 39 | dag=dag, 40 | ) 41 | 42 | eigen1_direct_cast_join_requests = SSHOperator( 43 | task_id="cura_eigen1_direct_cast_join_requests", 44 | command=f"cd {HOST_REPO_URL} && npm run script:direct_cast_join_requests", 45 | ssh_hook=ssh_hook, 46 | dag=dag, 47 | ) 48 | 49 | eigen1_install_dependencies >> eigen1_direct_cast_join_requests 50 | 51 | -------------------------------------------------------------------------------- /pipeline/dags/cura/dag_run_autoinvite_rules.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.operators.bash import BashOperator 6 | from 
airflow.providers.ssh.operators.ssh import SSHHook 7 | from airflow.providers.ssh.operators.ssh import SSHOperator 8 | from airflow.decorators import task_group 9 | 10 | from hooks.discord import send_alert_discord 11 | from hooks.pagerduty import send_alert_pagerduty 12 | 13 | default_args = { 14 | "owner": "coder2j", 15 | "retries": 5, 16 | "retry_delay": timedelta(minutes=2), 17 | "on_failure_callback": [send_alert_discord, send_alert_pagerduty], 18 | } 19 | 20 | HOST_REPO_URL='cura-bot-3' 21 | 22 | with DAG( 23 | dag_id="cura_run_autoinvite_rules", 24 | default_args=default_args, 25 | description="Run all the autoinvite rules", 26 | start_date=datetime(2024, 11, 7), 27 | schedule_interval='0 */4 * * *', 28 | is_paused_upon_creation=True, 29 | max_active_runs=1, 30 | catchup=False, 31 | ) as dag: 32 | 33 | ssh_hook = SSHHook(ssh_conn_id='eigen1', keepalive_interval=60, cmd_timeout=None) 34 | 35 | eigen1_install_dependencies = SSHOperator( 36 | task_id="cura_eigen1_install_deps", 37 | command=f"cd {HOST_REPO_URL} && git reset --hard HEAD && git pull origin main && pnpm i", 38 | ssh_hook=ssh_hook, 39 | dag=dag, 40 | ) 41 | 42 | eigen1_run_autoinvite = SSHOperator( 43 | task_id="cura_eigen1_run_autoinvite", 44 | command=f"cd {HOST_REPO_URL} && npm run script:autoinvite", 45 | ssh_hook=ssh_hook, 46 | dag=dag, 47 | ) 48 | 49 | eigen1_install_dependencies >> eigen1_run_autoinvite 50 | -------------------------------------------------------------------------------- /pipeline/dags/cura/dag_run_quote_casts.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.models import Variable 5 | from airflow.operators.bash import BashOperator 6 | from airflow.providers.ssh.operators.ssh import SSHHook 7 | from airflow.providers.ssh.operators.ssh import SSHOperator 8 | from airflow.decorators import task_group 9 | 10 | from hooks.discord import send_alert_discord 11 | from hooks.pagerduty import send_alert_pagerduty 12 | 13 | default_args = { 14 | "owner": "coder2j", 15 | "retries": 5, 16 | "retry_delay": timedelta(minutes=2), 17 | "on_failure_callback": [send_alert_discord, send_alert_pagerduty], 18 | } 19 | 20 | HOST_REPO_URL='cura-bot-1' 21 | 22 | with DAG( 23 | dag_id="cura_run_quote_casts", 24 | default_args=default_args, 25 | description="Quote a cast and post it from curabot", 26 | start_date=datetime(2024, 11, 7), 27 | schedule_interval='0 0 * * 5', 28 | is_paused_upon_creation=True, 29 | max_active_runs=1, 30 | catchup=False, 31 | ) as dag: 32 | 33 | ssh_hook = SSHHook(ssh_conn_id='eigen1', keepalive_interval=60, cmd_timeout=None) 34 | 35 | eigen1_install_dependencies = SSHOperator( 36 | task_id="cura_eigen1_install_deps", 37 | command=f"cd {HOST_REPO_URL} && git reset --hard HEAD && git pull origin main && pnpm i", 38 | ssh_hook=ssh_hook, 39 | dag=dag, 40 | ) 41 | 42 | eigen1_run_quote_casts = SSHOperator( 43 | task_id="cura_eigen1_run_quote_casts", 44 | command=f"cd {HOST_REPO_URL} && npm run script:quote_casts", 45 | ssh_hook=ssh_hook, 46 | dag=dag, 47 | ) 48 | 49 | eigen1_install_dependencies >> eigen1_run_quote_casts 50 | 51 | -------------------------------------------------------------------------------- /pipeline/dags/dag_backup_to_s3_v1.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | from 
airflow.sensors.external_task import ExternalTaskSensor 6 | 7 | from hooks.discord import send_alert_discord 8 | from hooks.pagerduty import send_alert_pagerduty 9 | 10 | default_args = { 11 | 'owner': 'coder2j', 12 | 'retries': 5, 13 | 'retry_delay': timedelta(minutes=2), 14 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 15 | } 16 | 17 | 18 | with DAG( 19 | dag_id='backup_to_s3_v1', 20 | default_args=default_args, 21 | description='This backs up globaltrust, localtrust and channel_ranking into s3', 22 | start_date=datetime(2024, 8, 15), 23 | schedule_interval='30 20 * * *', 24 | catchup=False, 25 | ) as dag: 26 | 27 | task1 = BashOperator( 28 | task_id='backup_globaltrust', 29 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh globaltrust" 30 | ) 31 | 32 | task2 = BashOperator( 33 | task_id='backup_globaltrust_config', 34 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh globaltrust_config" 35 | ) 36 | 37 | task3 = BashOperator( 38 | task_id='backup_localtrust', 39 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh localtrust_v1 /pipeline/tmp/graph_files" 40 | ) 41 | 42 | [task1, task2, task3] 43 | 44 | -------------------------------------------------------------------------------- /pipeline/dags/dag_notify_channel_daily_trending.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | import pytz 3 | 4 | from airflow import DAG 5 | from airflow.operators.bash import BashOperator 6 | from airflow.operators.empty import EmptyOperator 7 | from airflow.decorators import task 8 | 9 | from hooks.discord import send_alert_discord 10 | from hooks.pagerduty import send_alert_pagerduty 11 | 12 | default_args = { 13 | 'owner': 'karma3labs', 14 | 'retries': 5, 15 | 'retry_delay': timedelta(minutes=2), 16 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 17 | } 18 | 19 | def _9ampacific_in_utc_time(): 20 | pacific_tz = pytz.timezone('US/Pacific') 21 | pacific_9am_str = ' '.join([datetime.now(pacific_tz).strftime("%Y-%m-%d"),'09:00:00']) 22 | pacific_time = pacific_tz.localize(datetime.strptime(pacific_9am_str, '%Y-%m-%d %H:%M:%S')) 23 | utc_time = pacific_time.astimezone(pytz.utc) 24 | return utc_time 25 | 26 | with DAG( 27 | dag_id='notify_channel_daily_trending', 28 | default_args=default_args, 29 | description='daily notifications for trending channels', 30 | start_date=datetime(2024, 7, 10, 18), 31 | schedule_interval='30 16 * * *', # every day at 16:30/17:30 UTC / 09:30 Pacific 32 | is_paused_upon_creation=True, 33 | max_active_runs=1, 34 | catchup=False, 35 | ) as dag: 36 | 37 | skip_notify = EmptyOperator(task_id="skip_notify") 38 | 39 | notify = BashOperator( 40 | task_id="notify", 41 | bash_command=( 42 | "cd /pipeline && ./run_notify_channel_daily_trending.sh " 43 | " -w . 
-v .venv -c channels/Trending_Channels.csv "), 44 | dag=dag) 45 | 46 | @task.branch(task_id="check_last_successful") 47 | def check_last_successful(**context) -> bool: 48 | now = datetime.now(pytz.utc) 49 | prev_run_date = context['prev_data_interval_end_success'] 50 | daily_run = _9ampacific_in_utc_time() 51 | print(f"now: {now}, prev_run_date: {prev_run_date}, daily_run: {daily_run}") 52 | if ( 53 | now > daily_run 54 | and (prev_run_date is None or prev_run_date < daily_run) 55 | ): 56 | # Last successful run was before today, so we should run 57 | print(f"Last run {prev_run_date} was before {daily_run}, so we should run") 58 | return "notify" 59 | return "skip_notify" 60 | 61 | check_last_successful = check_last_successful() 62 | 63 | check_last_successful >> skip_notify 64 | 65 | check_last_successful >> notify 66 | 67 | -------------------------------------------------------------------------------- /pipeline/dags/dag_notify_channel_leaderboard.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | import pytz 3 | 4 | from airflow import DAG 5 | from airflow.operators.bash import BashOperator 6 | from airflow.operators.empty import EmptyOperator 7 | from airflow.decorators import task 8 | 9 | from hooks.discord import send_alert_discord 10 | from hooks.pagerduty import send_alert_pagerduty 11 | 12 | default_args = { 13 | 'owner': 'karma3labs', 14 | 'retries': 5, 15 | 'retry_delay': timedelta(minutes=2), 16 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 17 | } 18 | 19 | def _monday_9ampacific_in_utc_time(): 20 | pacific_tz = pytz.timezone('US/Pacific') 21 | pacific_9am_str = ' '.join([datetime.now(pacific_tz).strftime("%Y-%m-%d"),'09:00:00']) 22 | pacific_time = pacific_tz.localize(datetime.strptime(pacific_9am_str, '%Y-%m-%d %H:%M:%S')) 23 | utc_time = pacific_time.astimezone(pytz.utc) 24 | monday_utc_time = utc_time - timedelta(days=utc_time.weekday() - 0) 25 | return monday_utc_time 26 | 27 | with DAG( 28 | dag_id='notify_channel_leaderboard', 29 | default_args=default_args, 30 | description='channel notifications started by trigger dag or manually', 31 | start_date=datetime(2024, 7, 10, 18), 32 | schedule_interval=None, 33 | is_paused_upon_creation=True, 34 | max_active_runs=1, 35 | catchup=False, 36 | ) as dag: 37 | 38 | skip_notify = EmptyOperator(task_id="skip_notify") 39 | 40 | notify = BashOperator( 41 | task_id="notify", 42 | bash_command="cd /pipeline && ./run_notify_channel_leaderboard.sh -w . 
-v .venv -r ", 43 | dag=dag) 44 | 45 | @task.branch(task_id="check_last_successful") 46 | def check_last_successful(**context) -> bool: 47 | now = datetime.now(pytz.utc) 48 | prev_run_date = context['prev_data_interval_start_success'] 49 | weekly_run = _monday_9ampacific_in_utc_time() 50 | print(f"now: {now}, prev_run_date: {prev_run_date}, weekly_run: {weekly_run}") 51 | if ( 52 | now > weekly_run 53 | and (prev_run_date is None or prev_run_date < weekly_run) 54 | ): 55 | # Last successful run was before 9am on Monday, so we should run 56 | print(f"Last run {prev_run_date} was before {weekly_run}, so we should run") 57 | return "notify" 58 | return "skip_notify" 59 | 60 | check_last_successful = check_last_successful() 61 | 62 | check_last_successful >> skip_notify 63 | 64 | check_last_successful >> notify 65 | 66 | -------------------------------------------------------------------------------- /pipeline/dags/dag_notify_channel_weekly_mods.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import pytz 3 | 4 | from airflow import DAG 5 | from airflow.operators.bash import BashOperator 6 | from airflow.operators.empty import EmptyOperator 7 | from airflow.decorators import task 8 | 9 | from hooks.discord import send_alert_discord 10 | from hooks.pagerduty import send_alert_pagerduty 11 | 12 | default_args = { 13 | 'owner': 'karma3labs', 14 | 'retries': 5, 15 | 'retry_delay': timedelta(minutes=2), 16 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 17 | } 18 | 19 | def wed_9ampacific_in_utc_time(): 20 | wednesday_dow = 2 21 | pacific_tz = pytz.timezone('US/Pacific') 22 | pacific_9am_str = ' '.join([datetime.now(pacific_tz).strftime("%Y-%m-%d"),'09:00:00']) 23 | pacific_time = pacific_tz.localize(datetime.strptime(pacific_9am_str, '%Y-%m-%d %H:%M:%S')) 24 | utc_time = pacific_time.astimezone(pytz.utc) 25 | return utc_time - timedelta(days=utc_time.weekday() - wednesday_dow) 26 | 27 | with DAG( 28 | dag_id='notify_channel_weekly_mods', 29 | default_args=default_args, 30 | description='weekly notifications to mods', 31 | start_date=datetime(2024, 7, 10, 18), 32 | schedule_interval='30 16 * * 3', # every Wednesday at 16:30/17:30 UTC / 09:30 Pacific 33 | is_paused_upon_creation=True, 34 | max_active_runs=1, 35 | catchup=False, 36 | ) as dag: 37 | 38 | skip_notify = EmptyOperator(task_id="skip_notify") 39 | 40 | notify = BashOperator( 41 | task_id="notify", 42 | bash_command=( 43 | "cd /pipeline && ./run_notify_channel_weekly_mods.sh " 44 | " -w . 
-v .venv -b channels/Bot_Fids.csv -s '{{ prev_data_interval_end_success }}'"), 45 | dag=dag) 46 | 47 | @task.branch(task_id="check_last_successful") 48 | def check_last_successful(**context) -> bool: 49 | now = datetime.now(pytz.utc) 50 | prev_run_date = context['prev_data_interval_end_success'] 51 | weekly_run = wed_9ampacific_in_utc_time() 52 | print(f"now: {now}, prev_run_date: {prev_run_date}, weekly_run: {weekly_run}") 53 | if ( 54 | now > weekly_run 55 | and (prev_run_date is None or prev_run_date < weekly_run) 56 | ): 57 | # Last successful run was before today, so we should run 58 | print(f"Last run {prev_run_date} was before {weekly_run}, so we should run") 59 | return "notify" 60 | return "skip_notify" 61 | 62 | check_last_successful = check_last_successful() 63 | 64 | check_last_successful >> skip_notify 65 | 66 | check_last_successful >> notify 67 | 68 | -------------------------------------------------------------------------------- /pipeline/dags/dag_refresh_rank_view_v0.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | from airflow.sensors.external_task import ExternalTaskSensor 6 | 7 | from hooks.discord import send_alert_discord 8 | from hooks.pagerduty import send_alert_pagerduty 9 | 10 | default_args = { 11 | 'owner': 'coder2j', 12 | 'retries': 5, 13 | 'retry_delay': timedelta(minutes=2), 14 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 15 | } 16 | 17 | with DAG( 18 | dag_id='refresh_rank_view_v0', 19 | default_args=default_args, 20 | description='This refreshes k3l_rank materialized view and vacuums k3l_rank table', 21 | start_date=datetime(2024, 7, 9, 18), 22 | # schedule_interval='0 1-23/6 * * *', 23 | schedule=None, 24 | catchup=False, 25 | ) as dag: 26 | 27 | task1 = BashOperator( 28 | task_id='refresh_view_k3l_rank_e8', 29 | bash_command='''cd /pipeline/ && ./run_eigen8_postgres_sql.sh -w . " 30 | REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_rank; " 31 | ''' 32 | ) 33 | 34 | task2 = BashOperator( 35 | task_id='vacuum_k3l_rank_e8', 36 | bash_command='''cd /pipeline/ && ./run_eigen8_postgres_sql.sh -w . 
" 37 | VACUUM ANALYZE k3l_rank; " 38 | ''' 39 | ) 40 | 41 | task1 >> task2 42 | -------------------------------------------------------------------------------- /pipeline/dags/dag_run_cast_pipeline_v0.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | from airflow.decorators import task 6 | 7 | from hooks.discord import send_alert_discord 8 | from hooks.pagerduty import send_alert_pagerduty 9 | 10 | default_args = { 11 | 'owner': 'coder2j', 12 | 'retries': 5, 13 | 'retry_delay': timedelta(minutes=2), 14 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 15 | } 16 | 17 | with DAG( 18 | dag_id='run_cast_pipeline_v0', 19 | default_args=default_args, 20 | description='extract cast interactions and refresh pg statistics', 21 | start_date=datetime(2024, 7, 9, 18), 22 | # schedule_interval='*/10 * * * *', 23 | schedule_interval=timedelta(minutes=5), 24 | max_active_runs=1, 25 | is_paused_upon_creation=True, 26 | catchup=False, 27 | ) as dag: 28 | 29 | insert = BashOperator( 30 | task_id='insert_cast_actions', 31 | bash_command='cd /pipeline/ && ./run_cast_pipeline.sh -v ./.venv/ ' 32 | ) 33 | 34 | insert8 = BashOperator( 35 | task_id='insert_cast_actions_e8', 36 | bash_command='cd /pipeline/ && ./run_cast_pipeline.sh -v ./.venv/ -p eigen8 ' 37 | ) 38 | 39 | refresh = BashOperator( 40 | task_id='refresh_parent_casts_view', 41 | bash_command='''cd /pipeline/ && ./run_eigen2_postgres_sql.sh -w . " 42 | REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_recent_parent_casts;" 43 | ''' 44 | ) 45 | 46 | refresh8 = BashOperator( 47 | task_id='refresh_parent_casts_view_e8', 48 | bash_command='''cd /pipeline/ && ./run_eigen8_postgres_sql.sh -w . 
" 49 | REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_recent_parent_casts;" 50 | ''' 51 | ) 52 | 53 | @task.bash 54 | def gapfill_task(db: str) -> str: 55 | yesterday = datetime.now(timezone.utc) - timedelta(hours=25) 56 | return f"cd /pipeline/ && ./run_cast_pipeline.sh -v ./.venv/"\ 57 | f" -f gapfill -p {db} -t '{yesterday.strftime('%Y-%m-%d %H:%M:%S')}'" 58 | 59 | gapfill = gapfill_task.override(task_id='gapfill_cast_actions')('eigen2') 60 | gapfill8 = gapfill_task.override(task_id='gapfill_cast_actions_e8')('eigen8') 61 | 62 | insert >> refresh >> gapfill 63 | insert8 >> refresh8 >> gapfill8 64 | -------------------------------------------------------------------------------- /pipeline/dags/dag_update_channel_points.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | 6 | from hooks.discord import send_alert_discord 7 | from hooks.pagerduty import send_alert_pagerduty 8 | 9 | default_args = { 10 | 'owner': 'karma3labs', 11 | 'retries': 5, 12 | 'retry_delay': timedelta(minutes=2), 13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 14 | } 15 | 16 | with DAG( 17 | dag_id='update_channel_points_v2', 18 | default_args=default_args, 19 | description='update channel points triggered by update_channel_tokens dag', 20 | start_date=datetime(2024, 7, 10, 18), 21 | schedule_interval='0 16 * * *', # every day at 17:00 UTC / 09:00 Pacific 22 | # schedule_interval=timedelta(days=1), 23 | # schedule=None, 24 | is_paused_upon_creation=True, 25 | max_active_runs=1, 26 | catchup=False, 27 | ) as dag: 28 | 29 | # run_genesis = BashOperator( 30 | # task_id="run_genesis", 31 | # bash_command="cd /pipeline && ./run_update_channel_points.sh -w . -v .venv -t genesis", 32 | # dag=dag) 33 | 34 | # daily_calc = BashOperator( 35 | # task_id="daily_calc", 36 | # bash_command="cd /pipeline && ./run_update_channel_points.sh -w . -v .venv -t compute", 37 | # dag=dag) 38 | 39 | # balance_update = BashOperator( 40 | # task_id="balance_update", 41 | # bash_command="cd /pipeline && ./run_update_channel_points.sh -w . -v .venv -t update", 42 | # dag=dag) 43 | 44 | # run_genesis8 = BashOperator( 45 | # task_id="run_genesis8", 46 | # bash_command="cd /pipeline && ./run_update_channel_points.sh -w . -v .venv -t genesis -p eigen8", 47 | # dag=dag) 48 | 49 | daily_calc8 = BashOperator( 50 | task_id="daily_calc8", 51 | bash_command="cd /pipeline && ./run_update_channel_points.sh -w . -v .venv -t compute -p eigen8", 52 | dag=dag) 53 | 54 | balance_update8 = BashOperator( 55 | task_id="balance_update8", 56 | bash_command="cd /pipeline && ./run_update_channel_points.sh -w . 
-v .venv -t update -p eigen8", 57 | dag=dag) 58 | 59 | backup_to_s3 = BashOperator( 60 | task_id='backup_channel_points_bal', 61 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh backup_channel_points_bal " 62 | ) 63 | 64 | # run_genesis >> daily_calc >> balance_update >> backup_to_s3 65 | # run_genesis8 >> daily_calc8 >> balance_update8 66 | daily_calc8 >> balance_update8 >> backup_to_s3 67 | -------------------------------------------------------------------------------- /pipeline/dags/extractors/dag_cura_mod.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | 6 | from hooks.discord import send_alert_discord 7 | from hooks.pagerduty import send_alert_pagerduty 8 | 9 | default_args = { 10 | "owner": "karma3labs", 11 | "retries": 1, 12 | "retry_delay": timedelta(minutes=5), 13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 14 | } 15 | 16 | with DAG( 17 | "extract_cura_mod", 18 | default_args=default_args, 19 | description="Fetch hidden fids from CURA API and load into DB daily", 20 | schedule_interval=timedelta(minutes=5), 21 | # schedule_interval=None, 22 | start_date=datetime(2024, 8, 1), 23 | is_paused_upon_creation=True, 24 | max_active_runs=1, 25 | catchup=False, 26 | ) as dag: 27 | 28 | fetch_task = BashOperator( 29 | task_id='extract_cura_hidden_fids', 30 | bash_command="cd /pipeline; extractors/extract_cura_mod.sh -w . -v .venv -r ", 31 | dag=dag 32 | ) 33 | 34 | fetch_task -------------------------------------------------------------------------------- /pipeline/dags/monitoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/dags/monitoring/__init__.py -------------------------------------------------------------------------------- /pipeline/dags/one_off/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/dags/one_off/.placeholder -------------------------------------------------------------------------------- /pipeline/dags/one_off/dag_gen_globaltrust_by_date_v0.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | from hooks.discord import send_alert_discord 6 | 7 | 8 | default_args = { 9 | 'owner': 'coder2j', 10 | 'retries': 5, 11 | 'retry_delay': timedelta(minutes=2), 12 | # 'on_failure_callback': send_alert_discord, 13 | } 14 | 15 | # 2024-06-04 00:00 16 | # 875822 17 | # 2024-06-05 00:00 18 | # 875822 19 | # 2024-06-11 00:00 20 | # 921037 21 | # 2024-06-12 00:00 22 | # 921037 23 | # 2024-06-15 00:00 24 | # 960387 25 | # 2024-06-16 00:00 26 | # 960387 27 | with DAG( 28 | dag_id='one_off_gen_globaltrust_by_date_v0', 29 | default_args=default_args, 30 | description='This runs run_globaltrust_pipeline.sh without any optimization', 31 | schedule_interval=None, 32 | start_date=None, 33 | is_paused_upon_creation=True, 34 | max_active_runs=1, 35 | catchup=False, 36 | ) as dag: 37 | push_to_dune = BashOperator( 38 | task_id='push_to_dune', 39 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh 
overwrite_globaltrust_in_dune_v3 " 40 | ) 41 | 42 | task1 = BashOperator( 43 | task_id='06-05', 44 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-05" 45 | ) 46 | 47 | task2 = BashOperator( 48 | task_id='06-12', 49 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-12" 50 | ) 51 | 52 | task3 = BashOperator( 53 | task_id='06-16', 54 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-16" 55 | ) 56 | 57 | task5 = BashOperator( 58 | task_id='06-04', 59 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-04" 60 | ) 61 | 62 | task6 = BashOperator( 63 | task_id='06-11', 64 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-11" 65 | ) 66 | 67 | task7 = BashOperator( 68 | task_id='06-15', 69 | bash_command="cd /pipeline && ./run_globaltrust_pipeline.sh -w . -v ./.venv -d 2024-06-15 " 70 | ) 71 | 72 | task1 >> task2 >> task3 >> push_to_dune >> task5 >> task6 >> task7 73 | 74 | -------------------------------------------------------------------------------- /pipeline/dags/one_off/dag_gen_globaltrust_by_date_v1.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.utils.trigger_rule import TriggerRule 5 | from airflow.operators.trigger_dagrun import TriggerDagRunOperator 6 | from airflow.operators.bash import BashOperator 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | 12 | default_args = { 13 | 'owner': 'karma3labs', 14 | 'retries': 5, 15 | 'retry_delay': timedelta(minutes=2), 16 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 17 | } 18 | 19 | with DAG( 20 | dag_id='one_off_gen_globaltrust_by_date_v1', 21 | default_args=default_args, 22 | description='This runs run_globaltrust_pipeline.sh without any optimization', 23 | start_date=datetime(2024, 8, 16), 24 | schedule_interval=None, 25 | is_paused_upon_creation=True, 26 | max_active_runs=1, 27 | catchup=False, 28 | ) as dag: 29 | 30 | mkdir_tmp = BashOperator( 31 | task_id="mkdir_tmp", 32 | bash_command= "cd /pipeline; mkdir -p tmp/{{ run_id }}; mkdir -p tmp/graph_files", 33 | dag=dag) 34 | 35 | prep_globaltrust = BashOperator( 36 | task_id="prep_globaltrust", 37 | bash_command= "cd /pipeline; ./run_globaltrust_pipeline.sh -s prep" 38 | " -w . -v ./.venv -t tmp/{{ run_id }} -o tmp/graph_files/ -d 2024-10-26", 39 | dag=dag) 40 | 41 | compute_engagement = BashOperator( 42 | task_id="compute_engagement", 43 | bash_command= "cd /pipeline; ./run_globaltrust_pipeline.sh -s compute_engagement" 44 | " -w . -v ./.venv -t tmp/{{ run_id }} -o tmp/graph_files/ -d 2024-10-26", 45 | dag=dag) 46 | 47 | 48 | insert_db = BashOperator( 49 | task_id="insert_db", 50 | bash_command= "cd /pipeline; ./run_globaltrust_pipeline.sh -s insert_db" 51 | " -w . 
-v ./.venv -t tmp/{{ run_id }} -o tmp/graph_files/ -d 2024-10-26", 52 | dag=dag) 53 | 54 | upload_to_dune = BashOperator( 55 | task_id="upload_to_dune", 56 | bash_command= "cd /pipeline/dags/pg_to_dune; ./upload_to_dune.sh overwrite_globaltrust_in_dune_v3", 57 | dag=dag) 58 | 59 | trigger_refresh_views = TriggerDagRunOperator( 60 | task_id="trigger_refresh_views", 61 | trigger_dag_id="refresh_rank_view_v0", 62 | conf={"trigger": "gen_globaltrust_v1"}, 63 | ) 64 | 65 | # trigger_sync_sandbox = TriggerDagRunOperator( 66 | # task_id="trigger_sync_sandbox", 67 | # trigger_dag_id="sync_sandbox_globaltrust", 68 | # conf={"trigger": "gen_globaltrust_v1"}, 69 | # ) 70 | 71 | ( 72 | mkdir_tmp 73 | >> prep_globaltrust 74 | >> compute_engagement 75 | >> insert_db 76 | >> upload_to_dune 77 | >> trigger_refresh_views 78 | # >> trigger_sync_sandbox 79 | ) 80 | -------------------------------------------------------------------------------- /pipeline/dags/one_off/dag_insert_to_dune_table.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | from hooks.discord import send_alert_discord 6 | from hooks.pagerduty import send_alert_pagerduty 7 | 8 | 9 | default_args = { 10 | 'owner': 'coder2j', 11 | 'retries': 5, 12 | 'retry_delay': timedelta(minutes=2), 13 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 14 | } 15 | 16 | 17 | with DAG( 18 | dag_id='one_off_insert_to_dune_tables', 19 | default_args=default_args, 20 | description='This inserts globaltrust and channel_ranking into dune', 21 | schedule_interval=None, 22 | start_date=None, 23 | is_paused_upon_creation=True, 24 | max_active_runs=1, 25 | catchup=False, 26 | ) as dag: 27 | task4 = BashOperator( 28 | task_id='overwrite_globaltrust_in_dune_v3', 29 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh overwrite_globaltrust_in_dune_v3" 30 | ) 31 | 32 | task5 = BashOperator( 33 | task_id='overwrite_channel_rank_in_dune_v3', 34 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh overwrite_channel_rank_in_dune_v3" 35 | ) 36 | 37 | [task4, task5] 38 | 39 | -------------------------------------------------------------------------------- /pipeline/dags/one_off/dag_migrate_dune_table.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | 6 | 7 | default_args = { 8 | 'owner': 'coder2j', 9 | 'retries': 5, 10 | 'retry_delay': timedelta(minutes=2) 11 | } 12 | 13 | 14 | with DAG( 15 | dag_id='one_off_migrate_dune_table', 16 | default_args=default_args, 17 | description='This backs up globaltrust, localtrust and channel_ranking into s3', 18 | schedule_interval=None, 19 | start_date=None, 20 | is_paused_upon_creation=True, 21 | max_active_runs=1, 22 | catchup=False, 23 | ) as dag: 24 | task1 = BashOperator( 25 | task_id='create_dune_globaltrust_table', 26 | bash_command="cd /pipeline/dags/pg_to_dune && ./upload_to_dune.sh create_dune_globaltrust_table dataset_k3l_cast_globaltrust_v2" 27 | ) 28 | 29 | [task1] 30 | 31 | -------------------------------------------------------------------------------- /pipeline/dags/one_off/dag_trial_branch.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | import pytz 4 | import datetime 5 | 
from airflow import DAG 6 | from airflow.utils.trigger_rule import TriggerRule 7 | from airflow.operators.empty import EmptyOperator 8 | from airflow.operators.python import PythonOperator 9 | 10 | from airflow.decorators import task, task_group 11 | 12 | default_args = { 13 | 'owner': 'karma3labs', 14 | 'retries': 5, 15 | 'retry_delay': timedelta(minutes=2), 16 | } 17 | 18 | def _monday_9ampacific_in_utc_time(): 19 | pacific_tz = pytz.timezone('US/Pacific') 20 | pacific_9am_str = ' '.join([datetime.datetime.now(pacific_tz).strftime("%Y-%m-%d"),'09:00:00']) 21 | pacific_time = pacific_tz.localize(datetime.datetime.strptime(pacific_9am_str, '%Y-%m-%d %H:%M:%S')) 22 | utc_time = pacific_time.astimezone(pytz.utc) 23 | monday = utc_time - timedelta(days=utc_time.weekday()) 24 | return monday 25 | 26 | with DAG( 27 | dag_id='one_off_trial_branch', 28 | default_args=default_args, 29 | description="One off dag to test new features", 30 | schedule_interval=None, 31 | start_date=None, 32 | is_paused_upon_creation=True, 33 | max_active_runs=1, 34 | catchup=False, 35 | ) as dag: 36 | 37 | @task.branch(task_id="branch") 38 | def branch_fn(**context): 39 | print(f"context: {context}") 40 | prev = context['prev_execution_date_success'] 41 | print(f"prev_execution_date_success: {prev}") 42 | if prev > _monday_9ampacific_in_utc_time(): 43 | return "t2" 44 | return "t1" 45 | 46 | def empty_fn(*args, **kwargs): 47 | pass 48 | 49 | branch = branch_fn() 50 | t1 = EmptyOperator(task_id="t1") 51 | t2 = EmptyOperator(task_id="t2") 52 | 53 | 54 | @task_group(group_id='all_group') 55 | def tg_all(): 56 | always = PythonOperator(task_id="always", 57 | python_callable=empty_fn, 58 | op_args=[], 59 | op_kwargs={}, 60 | trigger_rule=TriggerRule.ALL_SUCCESS) 61 | t3 = EmptyOperator(task_id="t3") 62 | 63 | always >> t3 64 | 65 | @task_group(group_id='some_group') 66 | def tg_some(): 67 | always = PythonOperator(task_id="always", 68 | python_callable=empty_fn, 69 | op_args=[], 70 | op_kwargs={}, 71 | trigger_rule=TriggerRule.ALL_SUCCESS) 72 | sometimes = EmptyOperator(task_id="sometimes") 73 | t3 = EmptyOperator(task_id="t3") 74 | 75 | always >> sometimes >> t3 76 | 77 | branch >> t1 >> tg_all() 78 | branch >> t2 >> tg_some() 79 | 80 | -------------------------------------------------------------------------------- /pipeline/dags/one_off/dag_trial_sql.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.empty import EmptyOperator 5 | from airflow.providers.common.sql.operators.sql import SQLCheckOperator 6 | 7 | default_args = { 8 | "owner": "karma3labs", 9 | "retries": 0, 10 | "retry_delay": timedelta(minutes=5), 11 | } 12 | 13 | _CONN_ID = "eig2_readonly_user" 14 | CHECK_QUERY = """ 15 | WITH 16 | channel_rank_stats AS ( 17 | SELECT 18 | COUNT(*) AS tot_rows, 19 | COUNT(DISTINCT channel_id) AS tot_channels, 20 | strategy_name 21 | FROM k3l_channel_rank 22 | GROUP BY strategy_name 23 | ), 24 | channel_fids_stats as ( 25 | SELECT 26 | COUNT(*) AS tot_rows, 27 | COUNT(DISTINCT channel_id) AS tot_channels, 28 | strategy_name 29 | -- TODO change table name to k3l_channel_fids 30 | FROM k3l_channel_rank 31 | GROUP BY strategy_name 32 | ) 33 | SELECT 34 | BOOL_AND( 35 | t2.tot_rows >= t1.tot_rows 36 | AND t2.tot_channels >= t1.tot_channels 37 | AND t2.strategy_name IS NOT NULL 38 | ) 39 | FROM channel_rank_stats as t1 40 | LEFT JOIN channel_fids_stats as t2 ON (t2.strategy_name = t1.strategy_name) 41 | 
""" 42 | 43 | with DAG( 44 | "one_off_trial_sql", 45 | default_args=default_args, 46 | description="One off dag to test new features", 47 | schedule_interval=None, 48 | start_date=None, 49 | is_paused_upon_creation=True, 50 | max_active_runs=1, 51 | catchup=False, 52 | ) as dag: 53 | 54 | start = EmptyOperator(task_id="start") 55 | 56 | sql_check = SQLCheckOperator( 57 | task_id='sql_check', 58 | sql=CHECK_QUERY, 59 | conn_id=_CONN_ID 60 | ) 61 | 62 | end = EmptyOperator(task_id="end") 63 | 64 | start >> sql_check >> end 65 | -------------------------------------------------------------------------------- /pipeline/dags/one_off/dag_trial_task_groups.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.empty import EmptyOperator 5 | from airflow.operators.bash import BashOperator 6 | from airflow.decorators import task_group 7 | 8 | default_args = { 9 | "owner": "karma3labs", 10 | "retries": 1, 11 | "retry_delay": timedelta(minutes=5), 12 | } 13 | 14 | with DAG( 15 | "one_off_trial_task_groups", 16 | default_args=default_args, 17 | description="One off dag to test new features", 18 | schedule_interval=None, 19 | start_date=None, 20 | is_paused_upon_creation=True, 21 | max_active_runs=1, 22 | catchup=False, 23 | ) as dag: 24 | 25 | @task_group(group_id='my_start_group') 26 | def tg_start(): 27 | start = EmptyOperator(task_id="start") 28 | 29 | echo1 = BashOperator( 30 | task_id="echo1", 31 | bash_command= "echo {{ (logical_date - macros.timedelta(days=90)) | ds }}", 32 | dag=dag 33 | ) 34 | 35 | echo2 = BashOperator( 36 | task_id="echo2", 37 | bash_command= "echo '{{ prev_data_interval_end_success }}'", 38 | dag=dag 39 | ) 40 | 41 | start >> echo1 >> echo2 42 | 43 | @task_group(group_id='my_echo_group') 44 | def tg_echo(): 45 | 46 | echo3 = BashOperator( 47 | task_id="echo3", 48 | bash_command= "echo {{ macros.ds_add(ds, -90) }}", 49 | dag=dag 50 | ) 51 | 52 | echo4 = BashOperator( 53 | task_id="echo4", 54 | bash_command= "echo {{ ds }}", 55 | dag=dag 56 | ) 57 | 58 | echo5 = BashOperator( 59 | task_id="echo5", 60 | bash_command= "echo {{ logical_date }}", 61 | dag=dag 62 | ) 63 | echo3 >> echo4 64 | echo5 65 | 66 | end = EmptyOperator(task_id="end") 67 | 68 | tg_start() >> tg_echo() >> end 69 | 70 | -------------------------------------------------------------------------------- /pipeline/dags/pg_to_dune/.env.sample: -------------------------------------------------------------------------------- 1 | DB_HOST=localhost 2 | DB_PORT=5432 3 | DB_NAME=farcaster 4 | DB_SSLMODE=allow 5 | DB_USERNAME=k3l_user 6 | DB_PASSWORD=changeme 7 | AWS_ACCESS_KEY_ID="changeme" 8 | AWS_SECRET_ACCESS_KEY="changeme" 9 | AWS_REGION="eu-central-1" 10 | GCP_TASK_ACCT="changeme" 11 | GCS_BUCKET_NAME="changeme" 12 | S3_BUCKET_NAME_CONSTANT="changeme" 13 | DUNE_API_KEY="changeme" -------------------------------------------------------------------------------- /pipeline/dags/pg_to_dune/app/check_last_timestamp.py: -------------------------------------------------------------------------------- 1 | 2 | import os, json 3 | from dune_client.types import QueryParameter 4 | from dune_client.client import DuneClient 5 | from dune_client.query import QueryBase 6 | 7 | # change the current working directory where .env file lives 8 | # os.chdir("/Users/abc/local-Workspace/python-notebook-examples") 9 | # load .env file 10 | # dotenv.load_dotenv(".env") 11 | # setup Dune Python client 12 | dune = 
DuneClient(os.environ["DUNE_API_KEY"]) 13 | 14 | query = QueryBase( 15 | name="fetch last date of globaltrust_v2", 16 | query_id=int(os.environ["QUERY_ID"]), 17 | ) 18 | 19 | result = dune.run_query( 20 | query = query, 21 | # performance = 'large' # optionally define which tier to run the execution on (default is "medium") 22 | ) 23 | 24 | if len(result.result.rows) != 1: 25 | raise ValueError(f"Expected exactly 1 row from Dune query, got {len(result.result.rows)}") 26 | 27 | last_date = result.result.rows[0][os.environ["FILTER_COLUMN"]] 28 | print(last_date) 29 | # # go over the results returned 30 | # for row in result.result.rows: 31 | # print('hell') 32 | # print (row) # as an example we print the rows 33 | -------------------------------------------------------------------------------- /pipeline/dags/reports/dag_gen_channel_metrics.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | # from airflow.operators.empty import EmptyOperator 5 | from airflow.operators.bash import BashOperator 6 | 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | default_args = { 12 | 'owner': 'karma3labs', 13 | 'retries': 5, 14 | 'retry_delay': timedelta(minutes=2), 15 | 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 16 | } 17 | 18 | 19 | with DAG( 20 | dag_id='report_gen_metrics', 21 | default_args=default_args, 22 | description='this generates channel metrics', 23 | start_date=datetime(2024, 8, 15), 24 | schedule_interval='0 */6 * * *', 25 | is_paused_upon_creation=True, 26 | max_active_runs=1, 27 | catchup=False, 28 | ) as dag: 29 | 30 | # gen_channel_metrics = EmptyOperator(task_id="gen_channel_metrics") 31 | 32 | gen_channel_metrics = BashOperator( 33 | task_id='gen_channel_metrics', 34 | bash_command='cd /pipeline/ && ./run_channel_metrics.sh -w .
-v ./.venv/ -r ' 35 | ) -------------------------------------------------------------------------------- /pipeline/dags/reports/dag_gen_labels.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | from airflow.operators.trigger_dagrun import TriggerDagRunOperator 6 | 7 | 8 | from hooks.discord import send_alert_discord 9 | from hooks.pagerduty import send_alert_pagerduty 10 | 11 | default_args = { 12 | 'owner': 'karma3labs', 13 | 'retries': 5, 14 | 'retry_delay': timedelta(minutes=2), 15 | # 'on_failure_callback': [send_alert_discord, send_alert_pagerduty], 16 | } 17 | 18 | 19 | with DAG( 20 | dag_id='report_gen_labels', 21 | default_args=default_args, 22 | description='This fetches spammers and save the list into s3', 23 | start_date=datetime(2024, 8, 15), 24 | schedule_interval='0 0 * * *', 25 | is_paused_upon_creation=True, 26 | max_active_runs=1, 27 | catchup=False, 28 | ) as dag: 29 | 30 | gen_top_spammers = BashOperator( 31 | task_id='gen_top_spammers', 32 | bash_command="cd /pipeline && ./run_fetch_top_spammers.sh -v ./.venv" 33 | ) 34 | 35 | gen_top_casters = BashOperator( 36 | task_id='gen_top_casters', 37 | bash_command="cd /pipeline && ./run_fetch_top_caster.sh -v ./.venv" 38 | ) 39 | 40 | trigger_sync_sandbox = TriggerDagRunOperator( 41 | task_id="trigger_sync_sandbox", 42 | trigger_dag_id="sync_sandbox_db_labels", 43 | conf={"trigger": "report_gen_labels"}, 44 | ) 45 | 46 | gen_top_spammers >> gen_top_casters >> trigger_sync_sandbox 47 | 48 | -------------------------------------------------------------------------------- /pipeline/dags/triggers/trigger_gen_channel_ranking_v3.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | from airflow.operators.empty import EmptyOperator 3 | from airflow.operators.trigger_dagrun import TriggerDagRunOperator 4 | from airflow.decorators import task, dag 5 | from airflow.models import DagRun 6 | from airflow.utils.state import DagRunState 7 | 8 | default_args = { 9 | 'owner': 'karma3labs', 10 | 'retries': 5, 11 | 'retry_delay': timedelta(minutes=2), 12 | } 13 | 14 | N_CHUNKS = 100 # Define the number of chunks 15 | FREQUENCY_H = 12 # Define the frequency in hours 16 | 17 | @dag( 18 | dag_id='trigger_gen_channel_ranking_v3', 19 | default_args=default_args, 20 | start_date=datetime(2024, 10, 1), 21 | schedule_interval=timedelta(hours=6), 22 | is_paused_upon_creation=True, 23 | max_active_runs=1, 24 | catchup=False # To avoid backfilling if not required 25 | ) 26 | def create_trigger_dag(): 27 | skip_main_dag = EmptyOperator(task_id="skip_main_dag") 28 | 29 | trigger_main_dag = TriggerDagRunOperator( 30 | task_id='trigger_main_dag', 31 | trigger_dag_id='gen_channel_ranking_v3', 32 | execution_date='{{ macros.datetime.now() }}', 33 | conf={"trigger": "trigger_gen_channel_ranking_v3"}, 34 | ) 35 | 36 | @task.branch(task_id="check_last_successful_run") 37 | def check_last_successful_run(**context) -> bool: 38 | dag_runs = DagRun.find(dag_id="gen_channel_ranking_v3", state=DagRunState.SUCCESS) 39 | if not dag_runs or len(dag_runs) == 0: 40 | # No previous runs 41 | print("No previous runs") 42 | return "trigger_main_dag" 43 | print(f"Found {len(dag_runs)} previous runs") 44 | dag_runs.sort(key=lambda x: x.execution_date, reverse=True) 45 | print("Last run: ", dag_runs[0]) 46 | # Query the last 
successful DAG run 47 | last_run = dag_runs[0] 48 | print("Last run: ", last_run) 49 | current_time = datetime.now(timezone.utc) 50 | delta = FREQUENCY_H 51 | if last_run: 52 | print("Last run end_date: ", last_run.end_date) 53 | print("Last run start_date: ", last_run.start_date) 54 | if last_run.end_date: 55 | delta_last = (current_time - last_run.end_date).total_seconds() / 3600 56 | delta = min(delta_last, delta) 57 | if last_run.start_date: 58 | delta_last = (current_time - last_run.start_date).total_seconds() / 3600 59 | delta = min(delta_last, delta) 60 | print(f"Delta: {delta}") 61 | if delta >= FREQUENCY_H: 62 | # Last run was more than FREQUENCY_H hours ago, so we should run 63 | print(f"Last run was more than {FREQUENCY_H} hours ago, so we should run") 64 | return "trigger_main_dag" 65 | return "skip_main_dag" 66 | 67 | check_last_successful_run = check_last_successful_run() 68 | 69 | check_last_successful_run >> trigger_main_dag 70 | 71 | check_last_successful_run >> skip_main_dag 72 | 73 | trigger_dag = create_trigger_dag() 74 | 75 | -------------------------------------------------------------------------------- /pipeline/dags/triggers/trigger_gen_channel_ranking_v4.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | from airflow.operators.empty import EmptyOperator 3 | from airflow.operators.trigger_dagrun import TriggerDagRunOperator 4 | from airflow.decorators import task, dag 5 | from airflow.models import DagRun 6 | from airflow.utils.state import DagRunState 7 | 8 | default_args = { 9 | 'owner': 'karma3labs', 10 | 'retries': 5, 11 | 'retry_delay': timedelta(minutes=2), 12 | } 13 | 14 | N_CHUNKS = 100 # Define the number of chunks 15 | FREQUENCY_H = 24 # Define the frequency in hours 16 | 17 | @dag( 18 | dag_id='trigger_gen_channel_ranking_v4', 19 | default_args=default_args, 20 | start_date=datetime(2024, 10, 1), 21 | schedule_interval=timedelta(hours=24), 22 | is_paused_upon_creation=True, 23 | max_active_runs=1, 24 | catchup=False # To avoid backfilling if not required 25 | ) 26 | def create_trigger_dag(): 27 | skip_main_dag = EmptyOperator(task_id="skip_main_dag") 28 | 29 | trigger_main_dag = TriggerDagRunOperator( 30 | task_id='trigger_main_dag', 31 | trigger_dag_id='gen_channel_ranking_v4', 32 | execution_date='{{ macros.datetime.now() }}', 33 | conf={"trigger": "trigger_gen_channel_ranking_v4"}, 34 | ) 35 | 36 | @task.branch(task_id="check_last_successful_run") 37 | def check_last_successful_run(**context) -> bool: 38 | dag_runs = DagRun.find(dag_id="gen_channel_ranking_v4", state=DagRunState.SUCCESS) 39 | if not dag_runs or len(dag_runs) == 0: 40 | # No previous runs 41 | print("No previous runs") 42 | return "trigger_main_dag" 43 | print(f"Found {len(dag_runs)} previous runs") 44 | dag_runs.sort(key=lambda x: x.execution_date, reverse=True) 45 | print("Last run: ", dag_runs[0]) 46 | # Query the last successful DAG run 47 | last_run = dag_runs[0] 48 | print("Last run: ", last_run) 49 | current_time = datetime.now(timezone.utc) 50 | delta = FREQUENCY_H 51 | if last_run: 52 | print("Last run end_date: ", last_run.end_date) 53 | print("Last run start_date: ", last_run.start_date) 54 | if last_run.end_date: 55 | delta_last = (current_time - last_run.end_date).total_seconds() / 3600 56 | delta = min(delta_last, delta) 57 | if last_run.start_date: 58 | delta_last = (current_time - last_run.start_date).total_seconds() / 3600 59 | delta = min(delta_last, delta) 60 | print(f"Delta: 
{delta}") 61 | if delta >= FREQUENCY_H: 62 | # Last run was more than FREQUENCY_H hours ago, so we should run 63 | print(f"Last run was more than {FREQUENCY_H} hours ago, so we should run") 64 | return "trigger_main_dag" 65 | return "skip_main_dag" 66 | 67 | check_last_successful_run = check_last_successful_run() 68 | 69 | check_last_successful_run >> trigger_main_dag 70 | 71 | check_last_successful_run >> skip_main_dag 72 | 73 | trigger_dag = create_trigger_dag() 74 | 75 | -------------------------------------------------------------------------------- /pipeline/extractors/automod_extractor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datetime import date 3 | import requests 4 | from sqlalchemy import create_engine 5 | from sqlalchemy import text 6 | import io 7 | from loguru import logger 8 | import sys 9 | 10 | 11 | def fetch_data_from_api(api_key, db_user, db_password, db_endpoint): 12 | params = {'start': '2024-01-01', 'end': '2024-12-31'} 13 | headers = {'api-key': f"{api_key}"} 14 | df_automod = pd.DataFrame() 15 | for channel in ["degen", "dev", "memes"]: 16 | initial_url = f"https://automod.sh/api/partners/channels/{channel}/activity/export?" 17 | response = requests.get(initial_url, params=params, headers=headers) 18 | print(response.url) 19 | if response.status_code == 200: 20 | # Read the response content into a pandas DataFrame 21 | data = pd.read_csv(io.StringIO(response.content.decode('utf-8'))) 22 | data["channel_id"] = channel 23 | print(len(data)) 24 | df_automod = pd.concat([df_automod, data], axis=0) 25 | else: 26 | raise Exception(f"Failed to fetch data from automod. Status code: {response.status_code}") 27 | 28 | if len(df_automod) == 0: 29 | raise Exception("Failed to fetch data from automod. 
No data found.") 30 | 31 | rename_dict = { 32 | 'createdAt': 'created_at', 33 | 'affectedUsername': 'affected_username', 34 | 'affectedUserFid': 'affected_userid', 35 | 'castHash': 'cast_hash', 36 | 'castText': 'cast_text' 37 | } 38 | 39 | df_automod.rename(columns=rename_dict, inplace=True) 40 | df_automod = df_automod[ 41 | ["created_at", "action", "actor", "affected_username", "affected_userid", "cast_hash", "channel_id"]] 42 | df_automod['created_at'] = pd.to_datetime(df_automod['created_at'], unit='ms') 43 | df_automod["date_iso"] = date.today() 44 | 45 | logger.info(df_automod.head()) 46 | engine_string = "postgresql+psycopg2://%s:%s@%s:%d/%s" \ 47 | % (db_user, db_password, db_endpoint, 9541, 'farcaster') 48 | 49 | postgres_engine = create_engine(engine_string, connect_args={"connect_timeout": 1000}) 50 | with postgres_engine.begin() as conn: 51 | conn.execute(text("TRUNCATE TABLE automod_data")) 52 | df_automod.to_sql('automod_data', con=conn, if_exists='append', index=False) 53 | return None 54 | 55 | 56 | if __name__ == "__main__": 57 | # Get the parameters from the command line arguments 58 | if len(sys.argv) != 5: 59 | raise ValueError("Please provide db_user, db_password, and db_endpoint as arguments.") 60 | 61 | api_key = sys.argv[1] 62 | db_user = sys.argv[2] 63 | db_password = sys.argv[3] 64 | db_endpoint = sys.argv[4] 65 | 66 | fetch_data_from_api(api_key, db_user, db_password, db_endpoint) -------------------------------------------------------------------------------- /pipeline/extractors/extract_channel_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts w:v:c:s:d flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | esac 9 | done 10 | 11 | if [ -z "$VENV" ] || [ -z "$WORK_DIR" ]; then 12 | echo "Usage: $0 -w [work_dir] -v [venv] " 13 | echo "" 14 | echo "Example: $0 -w . -v /home/ubuntu/farcaster-graph/publisher/.venv " 15 | echo "" 16 | echo "Params:" 17 | echo " [work_dir] The working directory to read .env file and execute scripts from." 18 | echo " [venv] The path where a python3 virtualenv has been created." 19 | echo "" 20 | exit 21 | fi 22 | 23 | # Setup environment variables 24 | echo "Setting up environment variables" 25 | source $WORK_DIR/.env 26 | 27 | # Activate 28 | echo "Activating Python 3.12 environment" 29 | source $VENV/bin/activate 30 | 31 | # Install 32 | echo "Installing requirements" 33 | #pip install -r requirements.txt 34 | 35 | # Run 36 | echo "Running channel data import" 37 | /usr/bin/env python3 -m extractors.main_channel_data 38 | 39 | if [ $? -ne 0 ]; then 40 | echo "Failed to run script" 41 | exit 1 42 | fi 43 | 44 | # Teardown 45 | echo "Deactivating Python 3.12 environment" 46 | deactivate 47 | -------------------------------------------------------------------------------- /pipeline/extractors/extract_cura_mod.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts w:v:rd flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | r) RUN_FLAG="--run";; 9 | d) DRYRUN_FLAG="--dry-run";; 10 | esac 11 | done 12 | 13 | if [ -z "$VENV" ] || [ -z "$WORK_DIR" ] || [ -z "$RUN_FLAG" ]; then 14 | echo "Usage: $0 -w [work_dir] -v [venv] -r -d " 15 | echo "" 16 | echo "Example: $0 -w . -v /home/ubuntu/farcaster-graph/publisher/.venv -r" 17 | echo "Example: $0 -w . 
-v /home/ubuntu/farcaster-graph/publisher/.venv -r -d" 18 | echo "" 19 | echo "Params:" 20 | echo " [work_dir] The working directory to read .env file and execute scripts from." 21 | echo " [venv] The path where a python3 virtualenv has been created." 22 | echo " [run] Flag to run the script." 23 | echo " [dryrun] Flag to run the script in dry-run mode." 24 | echo "" 25 | exit 26 | fi 27 | 28 | set -e 29 | set -o pipefail 30 | 31 | # Setup environment variables 32 | echo "Setting up environment variables" 33 | source $WORK_DIR/.env 34 | 35 | # Activate 36 | echo "Activating Python 3.12 environment" 37 | source $VENV/bin/activate 38 | 39 | # Install 40 | echo "Installing requirements" 41 | #pip install -r requirements.txt 42 | 43 | # Run 44 | echo "Running cura channel mod data extractor with flags" 45 | /usr/bin/env python3 -m extractors.cura_mod_extractor $RUN_FLAG $DRYRUN_FLAG 46 | 47 | if [ $? -ne 0 ]; then 48 | echo "Failed to run script" 49 | exit 1 50 | fi 51 | 52 | # Teardown 53 | echo "Deactivating Python 3.12 environment" 54 | deactivate 55 | -------------------------------------------------------------------------------- /pipeline/extractors/main_channel_data.py: -------------------------------------------------------------------------------- 1 | from config import settings 2 | import utils 3 | 4 | import requests 5 | import pandas as pd 6 | from sqlalchemy import create_engine 7 | from sqlalchemy import text 8 | from loguru import logger 9 | 10 | 11 | def fetch_data_from_api(): 12 | initial_url = "https://api.warpcast.com/v2/all-channels" 13 | response = requests.get(initial_url) 14 | 15 | df_warpcast_channels = pd.DataFrame(response.json()["result"]["channels"]) 16 | df_warpcast_channels['createdAt'] = pd.to_datetime(df_warpcast_channels['createdAt'], unit='ms') 17 | df_warpcast_channels.columns = df_warpcast_channels.columns.str.lower() 18 | db_column_names = [ 19 | "id", 20 | "url", 21 | "name", 22 | "description", 23 | "imageurl", 24 | "headerimageurl", 25 | "leadfid", 26 | "moderatorfids", 27 | "createdat", 28 | "followercount", 29 | "membercount", 30 | "pinnedcasthash", 31 | ] 32 | df_warpcast_channels = df_warpcast_channels.filter(items=db_column_names, axis=1) 33 | logger.info(utils.df_info_to_string(df_warpcast_channels, with_sample=True)) 34 | 35 | if len(df_warpcast_channels) == 0: 36 | raise Exception("Failed to fetch data from warpcast. 
No data found.") 37 | 38 | postgres_engine = create_engine(settings.POSTGRES_URL.get_secret_value(), connect_args={"connect_timeout": 1000}) 39 | try: 40 | with postgres_engine.begin() as conn: 41 | conn.execute(text("TRUNCATE TABLE warpcast_channels_data")) 42 | df_warpcast_channels.to_sql('warpcast_channels_data', con=conn, if_exists='append', index=False) 43 | except Exception as e: 44 | logger.error(f"Failed to insert data into postgres: {e}") 45 | raise e 46 | 47 | 48 | if __name__ == "__main__": 49 | fetch_data_from_api() 50 | -------------------------------------------------------------------------------- /pipeline/frames/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/frames/__init__.py -------------------------------------------------------------------------------- /pipeline/frames/frames_db_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from timer import Timer 4 | 5 | import psycopg2 6 | import psycopg2.extras 7 | 8 | 9 | @Timer(name="fetch_unprocessed_urls") 10 | def fetch_unprocessed_urls(logger: logging.Logger, pg_dsn: str, limit: int) -> list[tuple]: 11 | """return will be of the form [(url_id, url)]""" 12 | fetch_sql = f""" 13 | SELECT url_id, url 14 | FROM k3l_url_labels 15 | WHERE processed_ts IS NULL 16 | ORDER BY earliest_cast_dt ASC 17 | LIMIT {limit} 18 | """ 19 | with psycopg2.connect(pg_dsn) as conn: 20 | with conn.cursor() as cursor: 21 | logger.info(f"Executing: {fetch_sql}") 22 | cursor.execute(fetch_sql) 23 | url_records = cursor.fetchall() 24 | return url_records 25 | 26 | @Timer(name="update_url_categories") 27 | def update_url_categories(logger: logging.Logger, pg_dsn: str, url_categories: list[tuple]): 28 | """url_categories should be of the form [(url_id, category)]""" 29 | update_sql = """ 30 | UPDATE k3l_url_labels as k 31 | SET processed_ts=now(), category=v.cat 32 | FROM (VALUES %s) AS v(id, cat) 33 | WHERE url_id=v.id; 34 | """ 35 | with psycopg2.connect(pg_dsn) as conn: 36 | with conn.cursor() as cursor: 37 | logger.info(f"Executing: {update_sql}") 38 | psycopg2.extras.execute_values(cursor, 39 | update_sql, 40 | url_categories, 41 | template=None, 42 | page_size=100) 43 | 44 | @Timer(name="fetch_unparsed_urls") 45 | def fetch_unparsed_urls(logger: logging.Logger, pg_dsn: str, limit: int) -> list[tuple]: 46 | """return will be of the form [(url_id, url)]""" 47 | fetch_sql = f""" 48 | SELECT url_id, url 49 | FROM k3l_url_labels 50 | WHERE parsed_ts IS NULL 51 | ORDER BY earliest_cast_dt ASC 52 | LIMIT {limit} 53 | """ 54 | with psycopg2.connect(pg_dsn) as conn: 55 | with conn.cursor() as cursor: 56 | logger.info(f"Executing: {fetch_sql}") 57 | cursor.execute(fetch_sql) 58 | url_records = cursor.fetchall() 59 | return url_records 60 | 61 | @Timer(name="update_url_parts") 62 | def update_url_parts(logger: logging.Logger, pg_dsn: str, url_parts: list[tuple]): 63 | """url_parts should be of the form [(url_id, scheme, domain, subdomain, tld, path)]""" 64 | update_sql = f""" 65 | UPDATE k3l_url_labels as k 66 | SET parsed_ts=now(), scheme=v.scheme, domain=v.domain, subdomain=v.subdomain, tld=v.tld, path=v.path 67 | FROM (VALUES %s) AS v(id, scheme, domain, subdomain, tld, path) 68 | WHERE url_id=v.id; 69 | """ 70 | with psycopg2.connect(pg_dsn) as conn: 71 | with conn.cursor() as cursor: 72 | logger.info(f"Executing: {update_sql}") 73 
| psycopg2.extras.execute_values(cursor, 74 | update_sql, 75 | url_parts, 76 | template=None, 77 | page_size=100) 78 | 79 | -------------------------------------------------------------------------------- /pipeline/frames/incremental_load_cast_mapping.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO k3l_cast_embed_url_mapping(url_id, cast_id) 2 | WITH max_cast_dt AS ( 3 | select 4 | max(latest_cast_dt) as dt 5 | from k3l_url_labels as labels 6 | inner join k3l_cast_embed_url_mapping as url_map on (labels.url_id = url_map.url_id) 7 | ) 8 | SELECT 9 | labels.url_id as url_id, 10 | casts.id as cast_id 11 | FROM casts 12 | cross join lateral jsonb_array_elements(casts.embeds) as ems 13 | inner join max_cast_dt on (casts.created_at >= max_cast_dt.dt AND casts.deleted_at IS NULL) 14 | inner join 15 | k3l_url_labels as labels 16 | on (labels.url = ems->>'url' 17 | AND jsonb_array_length(embeds) > 0 18 | AND ems->'url' IS NOT NULL 19 | AND ems->>'url' NOT LIKE ALL(ARRAY[ 20 | 'https://i.imgur.com/%', 21 | 'https://youtu.be/%', 22 | 'https://www.youtube.com/%', 23 | 'https://imagedelivery.net/%', 24 | '%.png', '%.gif', '%.pdf', '%.jpg', '%.jpeg', '%.mp4', '%.m3u8']) 25 | AND created_at >= max_cast_dt.dt 26 | ) -------------------------------------------------------------------------------- /pipeline/frames/incremental_load_labels.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO k3l_url_labels(url, latest_cast_dt, earliest_cast_dt) 2 | WITH max_cast_dt AS ( 3 | select 4 | max(latest_cast_dt) as dt 5 | from k3l_url_labels as labels 6 | inner join k3l_cast_embed_url_mapping as url_map on (labels.url_id = url_map.url_id) 7 | ) 8 | SELECT 9 | ems->>'url' as url, 10 | max(created_at) as latest_cast_dt, 11 | min(created_at) as earliest_cast_dt 12 | FROM 13 | casts 14 | cross join lateral jsonb_array_elements(casts.embeds) as ems 15 | inner join max_cast_dt on (casts.created_at >= max_cast_dt.dt AND casts.deleted_at IS NULL) 16 | left join 17 | k3l_url_labels as labels 18 | on (labels.url = ems->>'url' 19 | and casts.created_at >= max_cast_dt.dt 20 | ) 21 | WHERE 22 | labels.url_id IS NULL 23 | AND jsonb_array_length(embeds) > 0 24 | AND ems->'url' IS NOT NULL 25 | AND ems->>'url' NOT LIKE ALL(ARRAY[ 26 | 'https://i.imgur.com/%', 27 | 'https://youtu.be/%', 28 | 'https://www.youtube.com/%', 29 | 'https://imagedelivery.net/%', 30 | '%.png', '%.gif', '%.pdf', '%.jpg', '%.jpeg', '%.mp4', '%.m3u8']) 31 | GROUP BY ems->>'url' -------------------------------------------------------------------------------- /pipeline/frames/scrape_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from enum import Enum 3 | from typing import NamedTuple 4 | import asyncio 5 | from urllib.parse import urlparse 6 | 7 | import tldextract 8 | from bs4 import BeautifulSoup 9 | import aiohttp as aiohttp 10 | 11 | class URLCategory(Enum): 12 | FRAME = 'frame' 13 | TIMEOUT = 'timeout' 14 | BAD = 'bad' 15 | UNKNOWN = 'unknown' 16 | ERROR = 'error' 17 | 18 | async def categorize_url( 19 | logger: logging.Logger, 20 | url_id: int, url:str, 21 | session: aiohttp.ClientSession, 22 | timeout: aiohttp.ClientTimeout 23 | ) -> tuple[int, str]: 24 | logger.debug(f"Fetching {url_id} - {url}") 25 | try: 26 | if urlparse(url).scheme not in ['http','https']: 27 | logger.error(f"bad url {url_id} - {url}") 28 | return (url_id, URLCategory.BAD.value) 29 | async with session.get(url, 
timeout=timeout) as resp: 30 | body = await resp.text() 31 | soup = BeautifulSoup(body, 'html.parser') 32 | frame_meta = soup.find('meta', attrs={"property":"fc:frame"}) 33 | return (url_id, URLCategory.FRAME.value) if frame_meta \ 34 | else (url_id, URLCategory.UNKNOWN.value) 35 | except asyncio.TimeoutError as e: 36 | logger.error(f"{url_id} - {url} timed out: {e}") 37 | return (url_id, URLCategory.TIMEOUT.value) 38 | except aiohttp.InvalidURL as e: 39 | logger.error(f"bad url {url_id} - {url}: {e}") 40 | return (url_id, URLCategory.BAD.value) 41 | except aiohttp.ClientError as e: 42 | logger.error(f"error {url_id} - {url}: {e}") 43 | return (url_id, URLCategory.ERROR.value) 44 | except aiohttp.ClientError as e: 45 | logger.error(f"error {url_id} - {url}: {e}") 46 | return (url_id, URLCategory.ERROR.value) 47 | except ValueError as e: 48 | logger.error(f"error {url_id} - {url}: {e}") 49 | return (url_id, URLCategory.ERROR.value) 50 | except Exception as e: 51 | logger.error(f"error {url_id} - {url}: {e}") 52 | return (url_id, URLCategory.ERROR.value) 53 | 54 | class URL_parts(NamedTuple): 55 | url_id: int 56 | scheme: str 57 | domain: str 58 | subdomain: str 59 | tld: str 60 | path: str 61 | 62 | def parse_url( 63 | logger: logging.Logger, 64 | url_id: int, 65 | url:str 66 | ) -> tuple[int, str, str, str, str, str]: 67 | logger.debug(f"parsing {url_id} - {url}") 68 | try: 69 | parse_result = urlparse(url) 70 | extract = tldextract.extract(url) 71 | path = parse_result.path 72 | if path.endswith(':'): 73 | path = path[:-1] 74 | return tuple(URL_parts(url_id, 75 | parse_result.scheme, 76 | extract.domain, 77 | extract.subdomain, 78 | extract.suffix, 79 | path)) 80 | except Exception as e: 81 | logger.error(f"error {url_id} - {url}: {e}") 82 | return (url_id, '', '', '', '', '') -------------------------------------------------------------------------------- /pipeline/frames/test_urls.py: -------------------------------------------------------------------------------- 1 | # standard dependencies 2 | import sys 3 | 4 | # local dependencies 5 | from config import settings 6 | from . import scrape_utils 7 | 8 | # 3rd party dependencies 9 | from dotenv import load_dotenv 10 | from loguru import logger 11 | 12 | logger.remove() 13 | level_per_module = { 14 | "": settings.LOG_LEVEL, 15 | "silentlib": False 16 | } 17 | logger.add(sys.stdout, 18 | colorize=True, 19 | format=settings.LOGURU_FORMAT, 20 | filter=level_per_module, 21 | level=0) 22 | 23 | def test(): 24 | url = 'https://apis.cast.k3l.io' 25 | url_category = scrape_utils.categorize_url(logger, -1, url, timeout=1) 26 | logger.debug(f"{url} category ? {url_category}") 27 | 28 | url = 'https://cast.k3l.io/apis123' 29 | url_category = scrape_utils.categorize_url(logger, -1, url, timeout=1) 30 | logger.debug(f"{url} category ? {url_category}") 31 | 32 | url = 'https://cast.k3l.io' 33 | url_category = scrape_utils.categorize_url(logger, -1, url, timeout=1) 34 | logger.debug(f"{url} category ? {url_category}") 35 | 36 | url = 'https://dune-frames.vercel.app/api' 37 | url_category = scrape_utils.categorize_url(logger, -1, url, settings.FRAMES_SCRAPE_TIMEOUT_SECS) 38 | logger.debug(f"{url} category ? {url_category}") 39 | 40 | url = 'https://www.youtube.com' 41 | url_category = scrape_utils.categorize_url(logger, -1, url, settings.FRAMES_SCRAPE_TIMEOUT_SECS) 42 | logger.debug(f"{url} category ? 
{url_category}") 43 | 44 | url = 'https://www.youttube.com' 45 | url_category = scrape_utils.categorize_url(logger, -1, url, settings.FRAMES_SCRAPE_TIMEOUT_SECS) 46 | logger.debug(f"{url} category ? {url_category}") 47 | 48 | url = 'abc' 49 | url_category = scrape_utils.categorize_url(logger, -1, url, settings.FRAMES_SCRAPE_TIMEOUT_SECS) 50 | logger.debug(f"{url} category ? {url_category}") 51 | 52 | url = 'http://1' 53 | url_category = scrape_utils.categorize_url(logger, -1, url, settings.FRAMES_SCRAPE_TIMEOUT_SECS) 54 | logger.debug(f"{url} category ? {url_category}") 55 | 56 | 57 | if __name__ == "__main__": 58 | load_dotenv() 59 | print(settings) 60 | 61 | logger.debug('####### TODO use pytest ########') 62 | test() 63 | -------------------------------------------------------------------------------- /pipeline/globaltrust/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/globaltrust/__init__.py -------------------------------------------------------------------------------- /pipeline/globaltrust/export_localtrust_daily_stats.sql: -------------------------------------------------------------------------------- 1 | with stats_per_strategy_per_date as (SELECT 2 | max(date) AS date, 3 | COUNT(CASE WHEN strategy_id = 1 THEN 1 END) AS strategy_id_1_row_count, 4 | AVG(CASE WHEN strategy_id = 1 THEN v END) AS strategy_id_1_mean, 5 | STDDEV(CASE WHEN strategy_id = 1 THEN v END) AS strategy_id_1_stddev, 6 | MAX(CASE WHEN strategy_id = 1 THEN v END) - MIN(CASE WHEN strategy_id = 1 THEN v END) AS strategy_id_1_range, 7 | COUNT(CASE WHEN strategy_id = 3 THEN 1 END) AS strategy_id_3_row_count, 8 | AVG(CASE WHEN strategy_id = 3 THEN v END) AS strategy_id_3_mean, 9 | STDDEV(CASE WHEN strategy_id = 3 THEN v END) AS strategy_id_3_stddev, 10 | MAX(CASE WHEN strategy_id = 3 THEN v END) - MIN(CASE WHEN strategy_id = 3 THEN v END) AS strategy_id_3_range 11 | FROM 12 | localtrust 13 | -- GROUP BY 14 | -- date 15 | ) 16 | 17 | INSERT INTO localtrust_stats ( 18 | date, 19 | strategy_id_1_row_count, 20 | strategy_id_1_mean, 21 | strategy_id_1_stddev, 22 | strategy_id_1_range, 23 | strategy_id_3_row_count, 24 | strategy_id_3_mean, 25 | strategy_id_3_stddev, 26 | strategy_id_3_range 27 | ) 28 | SELECT 29 | date, 30 | strategy_id_1_row_count, 31 | strategy_id_1_mean, 32 | strategy_id_1_stddev, 33 | strategy_id_1_range, 34 | strategy_id_3_row_count, 35 | strategy_id_3_mean, 36 | strategy_id_3_stddev, 37 | strategy_id_3_range 38 | FROM 39 | stats_per_strategy_per_date; 40 | -------------------------------------------------------------------------------- /pipeline/globaltrust/queries.py: -------------------------------------------------------------------------------- 1 | from db_utils import SQL 2 | 3 | class IJVSql: 4 | LIKES = SQL("LIKES", """ 5 | SELECT reactions.fid as i, reactions.target_fid as j, count(1) as likes_v 6 | FROM reactions 7 | INNER JOIN fids ON fids.fid = reactions.target_fid 8 | WHERE reaction_type=1 9 | AND reactions.target_fid IS NOT NULL 10 | {condition} 11 | GROUP BY i, j 12 | """) 13 | REPLIES = SQL("REPLIES", """ 14 | SELECT fid as i, parent_fid as j, count(1) as replies_v 15 | FROM casts 16 | WHERE parent_hash IS NOT NULL 17 | {condition} 18 | GROUP by i, j 19 | """) 20 | MENTIONS = SQL("MENTIONS", """ 21 | WITH mention AS ( 22 | SELECT fid as author_fid, mention as mention_fid, timestamp 23 | FROM casts, unnest(casts.mentions) as mention 
24 | ) 25 | SELECT 26 | author_fid as i, mention_fid as j, count(1) as mentions_v 27 | FROM mention 28 | INNER JOIN fids ON fids.fid = mention.mention_fid 29 | {condition} 30 | GROUP BY i, j 31 | """) 32 | RECASTS = SQL("RECASTS", """ 33 | SELECT reactions.fid as i, reactions.target_fid as j, count(1) as recasts_v 34 | FROM reactions 35 | INNER JOIN fids ON fids.fid = reactions.target_fid 36 | WHERE reaction_type=2 37 | AND reactions.target_fid IS NOT NULL 38 | {condition} 39 | GROUP BY i, j 40 | """) 41 | FOLLOWS = SQL("FOLLOWS", """ 42 | SELECT 43 | links.fid as i, 44 | links.target_fid as j, 45 | 1 as follows_v 46 | FROM links 47 | INNER JOIN fids ON fids.fid = links.target_fid 48 | WHERE type = 'follow'::text 49 | {condition} 50 | ORDER BY i, j, follows_v desc 51 | """) 52 | 53 | class IVSql: 54 | PRETRUST_TOP_TIER = SQL("PRETRUST_TOP_TIER", """ 55 | WITH pt_size AS ( 56 | select count(*) as ct from pretrust_v2 57 | where insert_ts=(select max(insert_ts) from pretrust_v2 where strategy_id = {strategy}) 58 | and strategy_id = {strategy} 59 | ) 60 | SELECT fid as i, 1/ct::numeric as v 61 | FROM pretrust_v2, pt_size 62 | WHERE insert_ts=(select max(insert_ts) from pretrust_v2 where strategy_id = {strategy}) 63 | AND strategy_id = {strategy} 64 | """) 65 | PRETRUST_POPULAR = SQL("PRETRUST_POPULAR", """ 66 | SELECT 67 | c.fid AS i, 68 | 1/20::numeric as v 69 | FROM 70 | reactions r 71 | INNER JOIN casts c ON c.hash = r.target_cast_hash 72 | INNER JOIN user_data u ON c.fid = u.fid AND u.type = 6 73 | WHERE 74 | r.created_at >= current_timestamp - interval '7' day 75 | GROUP BY 76 | c.fid 77 | ORDER BY 78 | COUNT(*) DESC 79 | LIMIT 20 80 | """) 81 | PRETRUST_OG = SQL("PRETRUST_OG", """ 82 | SELECT 83 | distinct fid as i, 84 | 1/11::numeric as v 85 | FROM user_data 86 | WHERE 87 | value in ('dwr.eth', 'varunsrin.eth', 'balajis.eth', 88 | 'vitalik.eth','ccarella.eth','tim', 89 | 'lesgreys.eth','linda','ace', 90 | 'vm','cdixon.eth') 91 | AND type=6 92 | """) -------------------------------------------------------------------------------- /pipeline/globaltrust/test_data.py: -------------------------------------------------------------------------------- 1 | # standard dependencies 2 | import logging 3 | 4 | # local dependencies 5 | import utils 6 | from config import settings 7 | from . 
import compute 8 | from .queries import IJVSql 9 | 10 | # 3rd party dependencies 11 | from dotenv import load_dotenv 12 | import pandas as pd 13 | 14 | if __name__ == '__main__': 15 | load_dotenv() 16 | print(settings) 17 | 18 | logger = logging.getLogger() 19 | utils.setup_filelogger(logger, __file__) 20 | logger.setLevel(logging.DEBUG) 21 | utils.setup_consolelogger(logger) 22 | 23 | pg_dsn = settings.ALT_POSTGRES_DSN.get_secret_value() 24 | 25 | df = compute._fetch_interactions_df(logger, pg_dsn) 26 | logger.info(utils.df_info_to_string(df, with_sample=True)) 27 | 28 | pkl_file = '/tmp/fc_interactions_df.pkl' 29 | logger.info(f"Pickling interactions dataframe to {pkl_file}") 30 | df.to_pickle(pkl_file) 31 | logger.info(f"Done pickling interactions dataframe to {pkl_file}") 32 | 33 | num_ij_pairs = df[df['follows_v'].notna()].groupby(['i', 'j']).ngroups 34 | logger.info(f"Unique i,j follow pairs: {num_ij_pairs}") 35 | 36 | num_selfies = len(df[df['i']==df['j']]) 37 | logger.info(f"Number of self followers: {num_selfies}") 38 | -------------------------------------------------------------------------------- /pipeline/graph/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/graph/__init__.py -------------------------------------------------------------------------------- /pipeline/graph/export_existingConnections_addr.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | '0x'||encode(coalesce(v1.signer_address, f1.custody_address),'hex') as i, 3 | '0x'||encode(coalesce(v2.signer_address, f2.custody_address),'hex') as j, 4 | lt.v 5 | FROM localtrust as lt 6 | INNER JOIN fids as f1 on (f1.fid = cast(lt.i as int8)) 7 | INNER JOIN fids as f2 on (f2.fid = cast(lt.j as int8)) 8 | LEFT JOIN verifications as v1 on (v1.fid = f1.fid) 9 | LEFT JOIN verifications as v2 on (v2.fid = f2.fid) 10 | WHERE 11 | lt.strategy_id=1 12 | AND lt.date=(select max(date) from localtrust where strategy_id=1) 13 | 14 | -------------------------------------------------------------------------------- /pipeline/graph/export_existingConnections_fid.sql: -------------------------------------------------------------------------------- 1 | select 2 | i, 3 | j, 4 | v 5 | from 6 | localtrust 7 | where 8 | strategy_id=1 9 | and date=(select max(date) from localtrust where strategy_id=1) 10 | -- comment out below code for local testing 11 | -- AND i::integer < 10 12 | -- ORDER BY random() 13 | -- LIMIT 1000 -------------------------------------------------------------------------------- /pipeline/graph/export_l1rep6rec3m12enhancedConnections_addr.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | '0x'||encode(coalesce(v1.signer_address, f1.custody_address),'hex') as i, 3 | '0x'||encode(coalesce(v2.signer_address, f2.custody_address),'hex') as j, 4 | lt.v 5 | FROM localtrust as lt 6 | INNER JOIN fids as f1 on (f1.fid = cast(lt.i as int8)) 7 | INNER JOIN fids as f2 on (f2.fid = cast(lt.j as int8)) 8 | LEFT JOIN verifications as v1 on (v1.fid = f1.fid) 9 | LEFT JOIN verifications as v2 on (v2.fid = f2.fid) 10 | WHERE 11 | lt.strategy_id=3 12 | AND lt.date=(select max(date) from localtrust where strategy_id=3) -------------------------------------------------------------------------------- /pipeline/graph/export_l1rep6rec3m12enhancedConnections_fid.sql: 
-------------------------------------------------------------------------------- 1 | select 2 | i, 3 | j, 4 | v 5 | from 6 | localtrust 7 | where 8 | strategy_id=3 9 | and date=(select max(date) from localtrust where strategy_id=3) 10 | -- comment out below code for local testing 11 | -- AND i::integer < 10 12 | -- ORDER BY random() 13 | -- LIMIT 1000 -------------------------------------------------------------------------------- /pipeline/graph/rechunk_graph_pqt.py: -------------------------------------------------------------------------------- 1 | # standard dependencies 2 | from pathlib import Path 3 | import argparse 4 | import sys 5 | import os 6 | 7 | # local dependencies 8 | 9 | # 3rd party dependencies 10 | from loguru import logger 11 | import polars as pl 12 | 13 | def main(indir: Path, outfile: Path): 14 | 15 | logger.info(f"reading parquet files {indir}/*.pqt") 16 | pq_files = [os.path.join(indir, f) for f in os.listdir(indir) if f.endswith('.pqt')] 17 | if not pq_files: 18 | raise FileNotFoundError(f"No parquet files found in {indir}") 19 | 20 | # Read all parquet files into a list of DataFrames 21 | dfs = [] 22 | for file in pq_files: 23 | try: 24 | df = pl.read_parquet(file, rechunk=True, low_memory=False) 25 | dfs.append(df) 26 | logger.debug(f"Successfully read {file}") 27 | except Exception as e: 28 | logger.error(f"Error reading {file}: {e}") 29 | 30 | if not dfs: 31 | raise ValueError("No valid parquet files could be read") 32 | 33 | # Concatenate all DataFrames into a single DataFrame 34 | pq_df = pl.concat(dfs) 35 | 36 | logger.info(f"df estimated_size: {pq_df.estimated_size('mb')}") 37 | logger.info(f"df describe: {pq_df.describe()}") 38 | logger.info(f"df sample: {pq_df.sample(n=min(5, len(pq_df)))}") 39 | 40 | logger.info(f"writing to parquet file {outfile}") 41 | pq_df.write_parquet(outfile, 42 | use_pyarrow=True, 43 | statistics=True, 44 | pyarrow_options={ 45 | "write_statistics": True, 46 | "row_group_size": 100_000}) 47 | 48 | if __name__ == '__main__': 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument("-i", "--indir", 51 | help="input directory with all pqt files", 52 | required=True, 53 | type=lambda f: Path(f).expanduser().resolve()) 54 | parser.add_argument("-o", "--outfile", 55 | help="output filename", 56 | required=True, 57 | type=lambda f: Path(f).expanduser().resolve()) 58 | 59 | args = parser.parse_args() 60 | print(args) 61 | 62 | logger.remove() 63 | logger.add(sys.stderr, level='INFO') 64 | 65 | if os.path.isdir(args.outfile): 66 | logger.error("-o / --outfile should be a file not a directory") 67 | sys.exit(1) 68 | main(args.indir, args.outfile) 69 | -------------------------------------------------------------------------------- /pipeline/igraph-docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | igraph: 3 | build: 4 | dockerfile: igraph.Dockerfile 5 | deploy: 6 | mode: replicated 7 | replicas: 2 8 | image: igraph:latest 9 | volumes: 10 | - /home/ubuntu/serve_files:/home/ubuntu/serve_files:z 11 | expose: 12 | - '8000' 13 | restart: "on-failure" 14 | networks: 15 | - farcaster-network 16 | nginx: 17 | image: nginx:latest 18 | volumes: 19 | - ./igraph.nginx.conf:/etc/nginx/nginx.conf:ro 20 | depends_on: 21 | - igraph 22 | ports: 23 | - "4000:4000" 24 | networks: 25 | - farcaster-network 26 | 27 | networks: 28 | farcaster-network: 29 | name: farcaster-network 30 | external: true 31 | -------------------------------------------------------------------------------- 
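The compose file above fronts two replicated igraph containers with nginx so that only port 4000 is exposed on the shared farcaster-network. A minimal local smoke test is sketched below; it assumes Docker Compose v2 is installed, that the `/home/ubuntu/serve_files` volume path exists on the host, and the probe path `/` is purely illustrative since the actual routes are defined by `graph.serve_igraph:app` rather than shown here.
```
# the compose file declares farcaster-network as external, so create it once
docker network create farcaster-network || true

# build the igraph image and start the replicated service plus nginx
docker compose -f igraph-docker-compose.yml up -d --build

# nginx listens on host port 4000 and proxies to the igraph upstream on 8000;
# replace "/" with whichever route graph.serve_igraph:app actually serves
curl -i http://localhost:4000/
```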
/pipeline/igraph.Dockerfile: -------------------------------------------------------------------------------- 1 | # FROM python:3.12-alpine 2 | # not taking the alpine route because packages like psutil don't install without gcc 3 | FROM python:3.12-slim 4 | 5 | RUN pip install --upgrade pip 6 | 7 | WORKDIR /server 8 | 9 | # don't copy code yet otherwise docker layers will get invalidated every code push 10 | COPY ./requirements.txt /server 11 | 12 | RUN python -m ensurepip --upgrade 13 | RUN python -m pip install --no-cache-dir --upgrade -r requirements.txt 14 | 15 | # copy rest of the code 16 | COPY . /server 17 | 18 | CMD ["uvicorn", "graph.serve_igraph:app", "--host", "0.0.0.0", "--port", "8000", "--timeout-keep-alive", "300"] -------------------------------------------------------------------------------- /pipeline/igraph.nginx.conf: -------------------------------------------------------------------------------- 1 | user nginx; 2 | worker_processes auto; 3 | worker_rlimit_nofile 30000; 4 | 5 | events { 6 | worker_connections 4096; 7 | } 8 | 9 | http { 10 | keepalive_timeout 65; 11 | keepalive_requests 100000; 12 | tcp_nopush on; 13 | tcp_nodelay on; 14 | 15 | upstream igraph_servers { 16 | server igraph:8000; 17 | } 18 | 19 | server { 20 | listen 4000; 21 | 22 | location / { 23 | proxy_pass http://igraph_servers; 24 | proxy_connect_timeout 300s; 25 | proxy_send_timeout 300s; 26 | proxy_read_timeout 300s; 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /pipeline/logs/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/logs/.placeholder -------------------------------------------------------------------------------- /pipeline/plugins/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/plugins/.placeholder -------------------------------------------------------------------------------- /pipeline/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/plugins/__init__.py -------------------------------------------------------------------------------- /pipeline/plugins/hooks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/plugins/hooks/__init__.py -------------------------------------------------------------------------------- /pipeline/plugins/hooks/common.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse, urlunparse 2 | from airflow.models import Variable 3 | 4 | def convert_hostname(url: str): 5 | # Parse the original URL 6 | parsed_url = urlparse(url) 7 | 8 | # Replace the scheme and netloc with the new hostname 9 | new_netloc = Variable.get("airflow_hostname") 10 | new_scheme = "https" 11 | 12 | # Construct the new URL 13 | return urlunparse((new_scheme, new_netloc) + parsed_url[2:]) -------------------------------------------------------------------------------- /pipeline/plugins/hooks/discord.py: 
-------------------------------------------------------------------------------- 1 | # copied from https://medium.com/@artur.aacs/airflow-send-alerts-with-discord-69f343dfa8dd 2 | import re 3 | from typing import Optional 4 | from datetime import datetime 5 | 6 | from airflow.models import Variable, TaskInstance 7 | from discord_webhook import DiscordWebhook, DiscordEmbed 8 | from hooks.common import convert_hostname 9 | 10 | TI = TaskInstance 11 | 12 | def send_alert_discord(context): 13 | # Get Task Instances variables 14 | last_task: Optional[TaskInstance] = context.get('task_instance') 15 | task_name = last_task.task_id 16 | dag_name = last_task.dag_id 17 | log_link = convert_hostname(last_task.log_url) 18 | execution_date = datetime.fromisoformat(str(context.get('execution_date'))) 19 | 20 | # Extract reason for the exception 21 | # try: 22 | # error_message = str(context["exception"]) 23 | # error_message = error_message[:1000] + (error_message[1000:] and '...') 24 | # str_start = re.escape("{'reason': ") 25 | # str_end = re.escape('"}.') 26 | # error_message = re.search('%s(.*)%s' % (str_start, str_end), error_message).group(1) 27 | # error_message = "{'reason': " + error_message + ',}' 28 | # except: 29 | # error_message = "Some error that cannot be extracted has occurred. Visit the logs!" 30 | 31 | print('Sending discord alert') 32 | 33 | # Send Alert 34 | webhook = DiscordWebhook(url=Variable.get("discord_webhook")) # Update variable name with your change 35 | print('execution_date', execution_date) 36 | embed = DiscordEmbed(title="Airflow Alert - Task has failed!", color='CC0000', url=log_link, timestamp=execution_date) 37 | embed.add_embed_field(name="DAG", value=dag_name, inline=True) 38 | embed.add_embed_field(name="PRIORITY", value="HIGH", inline=True) 39 | embed.add_embed_field(name="TASK", value=task_name, inline=False) 40 | embed.add_embed_field(name="ERROR", value=str(context["exception"])) 41 | webhook.add_embed(embed) 42 | response = webhook.execute() 43 | 44 | return response -------------------------------------------------------------------------------- /pipeline/plugins/hooks/pagerduty.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from airflow.providers.pagerduty.notifications.pagerduty import send_pagerduty_notification 4 | from airflow.providers.pagerduty.hooks.pagerduty_events import PagerdutyEventsHook 5 | from airflow.providers.pagerduty.hooks.pagerduty import PagerdutyHook 6 | 7 | from hooks.common import convert_hostname 8 | from airflow.models import Variable, TaskInstance 9 | 10 | # refer to https://github.com/astronomer/pagerduty_airflow_integration_benefits/blob/main/README.md 11 | def send_alert_pagerduty(context): 12 | # Get Task Instances variables 13 | last_task: Optional[TaskInstance] = context.get('task_instance') 14 | log_link = convert_hostname(last_task.log_url) 15 | print('log_link', log_link) 16 | 17 | task_id = last_task.task_id 18 | dag_id = last_task.dag_id 19 | # pagerduty_default needs to be saved on Admin->Variable on the console with Pagerduty Events 20 | integration_key=Variable.get("pagerduty_default") 21 | 22 | print('Sending pagerduty alert') 23 | return PagerdutyEventsHook(integration_key).send_event( 24 | summary=f"Airflow Alert - {dag_id}-{task_id} failed", 25 | severity="critical", 26 | source=f"airflow dag_id: {dag_id}", 27 | dedup_key=f"{dag_id}-{task_id}", 28 | group=f"{dag_id}", 29 | component="airflow", 30 | class_type="Prod Data Pipeline", 31 | 
custom_details=str(context["exception"]), 32 | links=[{ 33 | 'href': log_link, 34 | 'text': 'Link to errored task log' 35 | }], 36 | ) -------------------------------------------------------------------------------- /pipeline/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==2.1.3 2 | python-dotenv==1.0.0 3 | igraph==0.11.3 4 | psutil==5.9.8 5 | psycopg2-binary==2.9.9 6 | pydantic-settings==2.2.1 7 | sqlalchemy==1.4.52 8 | requests==2.31.0 9 | loguru==0.7.2 10 | beautifulsoup4==4.12.3 11 | aiohttp==3.9.3 12 | tldextract==5.1.1 13 | niquests==3.5.5 14 | polars==0.20.27 15 | pyarrow==16.1.0 16 | fastapi==0.111.0 17 | apache-airflow==2.9.2 18 | dune-client==1.7.4 19 | openrank-sdk==0.2.2 20 | apache-airflow-providers-ssh==3.12.0 21 | asyncpg==0.29.0 22 | tomlkit==0.13.2 -------------------------------------------------------------------------------- /pipeline/run_cast_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DT_FORMAT='%Y-%m-%d %H:%M:%S' 4 | 5 | # Function to validate date format 6 | function validate_date() { 7 | date_to_check="$1" 8 | date_format="$2" 9 | 10 | # Check if the date matches the format YYYY-mm-dd 11 | if [[ $(uname) == "Darwin" ]]; then 12 | if ! date -j -f "$date_format" "$date_to_check" >/dev/null 2>&1; then 13 | echo "Invalid date format. Use YYYY-mm-dd." 14 | exit 1 15 | fi 16 | else 17 | if ! date -d "$date_to_check" +"$date_format" >/dev/null 2>&1; then 18 | echo "Invalid date format. Use YYYY-mm-dd." 19 | exit 1 20 | fi 21 | fi 22 | 23 | # Check if the date is in the past 24 | today=$(date +"$date_format") 25 | if [ "$date_to_check" \> "$today" ] || [ "$date_to_check" == "$today" ]; then 26 | echo "The date must be in the past and not include today." 27 | exit 1 28 | fi 29 | } 30 | 31 | while getopts dv:f:t:p:m: flag 32 | do 33 | case "${flag}" in 34 | d) DAEMON_FLAG="--daemon";; 35 | v) VENV=${OPTARG};; 36 | f) FILL_TYPE=${OPTARG};; 37 | t) TARGET_DATE=${OPTARG};; 38 | m) TARGET_MONTH=${OPTARG};; 39 | p) POSTGRES=${OPTARG};; 40 | esac 41 | done 42 | 43 | if [ -z "$VENV" ]; then 44 | echo "Usage: $0 -v [venv] -p [postgres] -d -t [fill_type]" 45 | echo "" 46 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/" 47 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/ -p eigen8 -d -t backfill" 48 | echo "" 49 | echo "Params:" 50 | echo " [venv] The path where a python3 virtualenv has been created." 51 | echo " [postgres] 'eigen2' or 'eigen8'" 52 | echo " [daemon] Run in daemon mode." 53 | echo " [fill_type] Run in 'default' or 'backfill' or 'gapfill' mode." 54 | echo "" 55 | exit 56 | fi 57 | 58 | if [ ! -z "$POSTGRES" ]; then 59 | PG_OPTION="--postgres $POSTGRES" 60 | fi 61 | 62 | FILL_TYPE=${FILL_TYPE:-default} 63 | 64 | if [ ! -z "$TARGET_DATE" ]; then 65 | validate_date "$TARGET_DATE" "$DT_FORMAT" 66 | DATE_OPTION=(--target-date "$TARGET_DATE") 67 | fi 68 | 69 | # validating TARGET_MONTH in bash is a bit of a pain 70 | # ... let the python script validate it 71 | if [ ! 
-z "$TARGET_MONTH" ]; then 72 | MONTH_OPTION="--target-month $TARGET_MONTH" 73 | fi 74 | 75 | 76 | # set -x 77 | set -e 78 | set -o pipefail 79 | 80 | function log() { 81 | echo "`date` - $1" 82 | } 83 | 84 | source $VENV/bin/activate 85 | # pip install -r requirements.txt 86 | python3 -m casts.main $PG_OPTION $DAEMON_FLAG -f $FILL_TYPE "${DATE_OPTION[@]}" $MONTH_OPTION 87 | deactivate 88 | 89 | log "Done" -------------------------------------------------------------------------------- /pipeline/run_channel_metrics.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts "w:v:rd" flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | r) RUN_FLAG="--run";; 9 | d) DRYRUN_FLAG="--dry-run";; 10 | esac 11 | done 12 | 13 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$RUN_FLAG" ]; then 14 | echo "Usage: $0 -w [work_dir] -v [venv] -r -d" 15 | echo "" 16 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -r" 17 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -r -d" 18 | echo "" 19 | echo "Params:" 20 | echo " [work_dir] The working directory to read .env file and execute scripts from." 21 | echo " [venv] The path where a python3 virtualenv has been created." 22 | echo " [run] Flag to run the script." 23 | echo " [dryrun] Flag to run the script in dry-run mode." 24 | echo "" 25 | exit 26 | fi 27 | 28 | source $WORK_DIR/.env 29 | 30 | # set -x 31 | set -e 32 | set -o pipefail 33 | 34 | function log() { 35 | echo "`date` - $1" 36 | } 37 | 38 | source $VENV/bin/activate 39 | #pip install -r requirements.txt 40 | python3 -m channels.main_metrics $RUN_FLAG $DRYRUN_FLAG 41 | deactivate 42 | -------------------------------------------------------------------------------- /pipeline/run_download_pqt_files_v1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # dayOfYear=`date '+%j'` 4 | # hourOfDay=`date '+%H'` 5 | # hourOfYear="$((dayOfYear * 24 + hourOfDay))" 6 | # echo $dayOfYear $hourOfDay $hourOfYear 7 | # hour_interval=48 8 | 9 | # # TODO use the mtime of the existing parquet file and 10 | # # ..if current time - mtime > 1 hour, start compute 11 | # if [ `expr $hourOfYear % $hour_interval` -eq 0 ]; then 12 | # echo "This is hour $hour_interval. Continuing with script." 13 | # else 14 | # echo "This not hour $hour_interval. Exiting now." 15 | # exit 0 16 | # fi 17 | 18 | 19 | while getopts o:s: flag 20 | do 21 | case "${flag}" in 22 | o) OUT_DIR=${OPTARG};; 23 | s) S3_BKT=${OPTARG};; 24 | esac 25 | done 26 | 27 | if [ -z "$OUT_DIR" ] || [ -z "$S3_BKT" ]; then 28 | echo "Usage: $0 -o [out_dir] -s [s3_bkt]" 29 | echo "" 30 | echo "Example: $0 \ " 31 | echo " -i /home/ubuntu/serve_files/lt_engagement_fid.csv \ " 32 | echo " -w . \ " 33 | echo " -v .venv \ " 34 | echo " -o /tmp/personal-graph/ \ " 35 | echo " -s k3l-openrank-farcaster \ " 36 | echo "" 37 | echo "Params:" 38 | echo " [in_csv] The source file to read dataframe from." 39 | echo " [out_dir] The output directory to write the graph file." 40 | echo " [work_dir] The working directory to read .env file and execute scripts from." 41 | echo " [venv] The path where a python3 virtualenv has been created." 42 | echo " [s3_bkt] The S3 bucket to upload the graph file to." 43 | echo " [task] task to run. choose one: graph_reload, generate, fetch_fids, consolidate" 44 | echo " [fids] comma separated fids to run '1,2,3,420,69'" 45 | echo " [run_id] airflow run id. 
eg) 'manual__2024-07-22T06:46:15.813325+00:00' " 46 | echo " [map_index] airflow map index" 47 | echo "" 48 | exit 49 | fi 50 | 51 | source $WORK_DIR/.env 52 | 53 | set -x 54 | set -e 55 | set -o pipefail 56 | 57 | aws s3 cp s3://${S3_BKT}/personal_graph.parquet $OUT_DIR/personal_graph.parquet -------------------------------------------------------------------------------- /pipeline/run_eigen2_postgres_sql.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | while getopts w: flag 3 | do 4 | case "${flag}" in 5 | w) WORK_DIR=${OPTARG};; 6 | esac 7 | done 8 | 9 | shift $((OPTIND-1)) 10 | SQL_STATEMENT="$1" 11 | 12 | if [ -z "$WORK_DIR" ]; then 13 | echo "Usage: $0 -w [work_dir] [sql_statement]" 14 | echo "" 15 | echo "Example: $0 -w . -c 'REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_rank;'" 16 | echo "" 17 | echo "Params:" 18 | echo " [work_dir] The working directory to read .env file and execute scripts from." 19 | echo " [sql_statement] Optional sql statement to execute." 20 | echo "" 21 | exit 1 22 | fi 23 | 24 | source $WORK_DIR/.env 25 | 26 | DB_HOST=${DB_HOST:-127.0.0.1} 27 | DB_PORT=${DB_PORT:-5432} 28 | DB_USER=${DB_USER:-replicator} 29 | DB_NAME=${DB_NAME:-replicator} 30 | DB_PASSWORD=${DB_PASSWORD:-password} # psql requires PGPASSWORD to be set 31 | 32 | # set -x 33 | set -e 34 | set -o pipefail 35 | 36 | if hash psql 2>/dev/null; then 37 | echo "OK, you have psql in the path. We’ll use that." 38 | PSQL=psql 39 | else 40 | echo "You don't have psql is the path. Let's try /usr/bin" 41 | hash /usr/bin/psql 42 | PSQL=/usr/bin/psql 43 | fi 44 | 45 | PGPASSWORD=$DB_PASSWORD $PSQL -e -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME \ 46 | -c "$SQL_STATEMENT" -------------------------------------------------------------------------------- /pipeline/run_eigen8_postgres_sql.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | while getopts w: flag 3 | do 4 | case "${flag}" in 5 | w) WORK_DIR=${OPTARG};; 6 | esac 7 | done 8 | 9 | shift $((OPTIND-1)) 10 | SQL_STATEMENT="$1" 11 | 12 | if [ -z "$WORK_DIR" ]; then 13 | echo "Usage: $0 -w [work_dir] [sql_statement]" 14 | echo "" 15 | echo "Example: $0 -w . -c 'REFRESH MATERIALIZED VIEW CONCURRENTLY k3l_rank;'" 16 | echo "" 17 | echo "Params:" 18 | echo " [work_dir] The working directory to read .env file and execute scripts from." 19 | echo " [sql_statement] Optional sql statement to execute." 20 | echo "" 21 | exit 1 22 | fi 23 | 24 | source $WORK_DIR/.env 25 | 26 | ALT_REMOTE_DB_HOST=${ALT_REMOTE_DB_HOST:-127.0.0.1} 27 | ALT_REMOTE_DB_PORT=${ALT_REMOTE_DB_PORT:-5432} 28 | ALT_REMOTE_DB_USER=${ALT_REMOTE_DB_USER:-k3l_user} 29 | ALT_REMOTE_DB_NAME=${ALT_REMOTE_DB_NAME:-farcaster} 30 | ALT_REMOTE_DB_PASSWORD=${ALT_REMOTE_DB_PASSWORD:-password} # psql requires PGPASSWORD to be set 31 | 32 | # set -x 33 | set -e 34 | set -o pipefail 35 | 36 | if hash psql 2>/dev/null; then 37 | echo "OK, you have psql in the path. We’ll use that." 38 | PSQL=psql 39 | else 40 | echo "You don't have psql is the path. 
Let's try /usr/bin" 41 | hash /usr/bin/psql 42 | PSQL=/usr/bin/psql 43 | fi 44 | 45 | PGPASSWORD=$ALT_REMOTE_DB_PASSWORD $PSQL -e -h $ALT_REMOTE_DB_HOST \ 46 | -p $ALT_REMOTE_DB_PORT -U $ALT_REMOTE_DB_USER -d $ALT_REMOTE_DB_NAME \ 47 | -c "$SQL_STATEMENT" -------------------------------------------------------------------------------- /pipeline/run_fetch_channel_top_caster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts w:i:v:c: flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | c) CSV_PATH=${OPTARG};; 9 | esac 10 | done 11 | 12 | shift $((OPTIND-1)) 13 | CHANNEL_IDS="$1" 14 | 15 | if [ -z "$VENV" ] || [ -z "$CSV_PATH" ]; then 16 | echo "Usage: $0 -w [work_dir] -v [venv] -c [csv_path] [channel_ids]" 17 | echo "" 18 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -c channels/Top_Channels.csv" 19 | echo "" 20 | echo "Params:" 21 | echo " [work_dir] The working directory to read .env file and execute scripts from." 22 | echo " [venv] The path where a python3 virtualenv has been created." 23 | echo " [csv_path] The path to the CSV file." 24 | echo "" 25 | exit 1 26 | fi 27 | 28 | log() { 29 | echo "`date` - $1" 30 | } 31 | 32 | log "Starting script with parameters: WORK_DIR=${WORK_DIR}, VENV=${VENV}, CSV_PATH=${CSV_PATH}" 33 | 34 | source $WORK_DIR/.env 35 | 36 | set -e 37 | set -o pipefail 38 | 39 | function log() { 40 | echo "`date` - $1" 41 | } 42 | 43 | log "Activating virtual environment" 44 | source $VENV/bin/activate 45 | # pip install -r requirements.txt 46 | log "Executing task" 47 | python3 -m channels.main_fetch_channel_top_casters -c "$CSV_PATH" 48 | deactivate 49 | 50 | -------------------------------------------------------------------------------- /pipeline/run_fetch_top_caster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts v:s: flag 4 | do 5 | case "${flag}" in 6 | v) VENV=${OPTARG};; 7 | esac 8 | done 9 | 10 | if [ -z "$VENV" ]; then 11 | echo "Usage: $0 -v [venv]" 12 | echo "" 13 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/" 14 | echo "" 15 | echo "Params:" 16 | echo " [venv] The path where a python3 virtualenv has been created." 17 | echo "" 18 | exit 19 | fi 20 | 21 | # set -x 22 | set -e 23 | set -o pipefail 24 | 25 | function log() { 26 | echo "`date` - $1" 27 | } 28 | 29 | source $VENV/bin/activate 30 | # pip install -r requirements.txt 31 | python3 -m casts.main_fetch_top_casters 32 | deactivate 33 | -------------------------------------------------------------------------------- /pipeline/run_fetch_top_spammers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts v:s: flag 4 | do 5 | case "${flag}" in 6 | v) VENV=${OPTARG};; 7 | esac 8 | done 9 | 10 | if [ -z "$VENV" ] ; then 11 | echo "Usage: $0 -v [venv]" 12 | echo "" 13 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/" 14 | echo "" 15 | echo "Params:" 16 | echo " [venv] The path where a python3 virtualenv has been created." 
17 | echo "" 18 | exit 19 | fi 20 | 21 | # set -x 22 | set -e 23 | set -o pipefail 24 | 25 | function log() { 26 | echo "`date` - $1" 27 | } 28 | 29 | source $VENV/bin/activate 30 | # pip install -r requirements.txt 31 | python3 -m casts.main_fetch_top_spammers 32 | deactivate 33 | -------------------------------------------------------------------------------- /pipeline/run_frame_scraper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts d:v: flag 4 | do 5 | case "${flag}" in 6 | d) DAEMON=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | esac 9 | done 10 | 11 | if [ -z "$VENV" ]; then 12 | echo "Usage: $0 -v [venv]" 13 | echo "" 14 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/" 15 | echo "" 16 | echo "Params:" 17 | echo " [venv] The path where a python3 virtualenv has been created." 18 | echo "" 19 | exit 20 | fi 21 | 22 | # set -x 23 | set -e 24 | set -o pipefail 25 | 26 | function log() { 27 | echo "`date` - $1" 28 | } 29 | 30 | DAEMON=${DAEMON:-false} 31 | 32 | source $VENV/bin/activate 33 | # pip install -r requirements.txt 34 | mkdir -p tmp/tldcache 35 | export TLDEXTRACT_CACHE=tmp/tldcache 36 | python3 -m frames.main -d $DAEMON 37 | deactivate 38 | -------------------------------------------------------------------------------- /pipeline/run_graph_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts i:o:p:w:v: flag 4 | do 5 | case "${flag}" in 6 | i) IN_FILE=${OPTARG};; 7 | o) OUT_DIR=${OPTARG};; 8 | p) OUT_PREFIX=${OPTARG};; 9 | w) WORK_DIR=${OPTARG};; 10 | v) VENV=${OPTARG};; 11 | esac 12 | done 13 | 14 | if [ -z "$IN_FILE" ] || [ -z "$OUT_DIR" ] || [ -z "$OUT_PREFIX" ] || [ -z "$WORK_DIR" ] || [ -z "$VENV" ]; then 15 | echo "Usage: $0 -w [work_dir] -v [venv] -i [in_file] -o [out_dir] -p [out_prefix]" 16 | echo "" 17 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -i /tmp -o /tmp -p test" 18 | echo "" 19 | echo "Params:" 20 | echo " [work_dir] The working directory to read .env file and execute scripts from." 21 | echo " [venv] The path where a python3 virtualenv has been created." 22 | echo " [in_file] The input localtrust (i,j,v edge list) csv file." 23 | echo " [out_dir] The output directory to write the graph file." 24 | echo " [out_prefix] The prefix of the output graph files." 25 | echo "" 26 | exit 27 | fi 28 | 29 | 30 | source $WORK_DIR/.env 31 | 32 | # set -x 33 | set -e 34 | set -o pipefail 35 | 36 | function log() { 37 | echo "`date` - $1" 38 | } 39 | 40 | mkdir -p $OUT_DIR 41 | 42 | source $VENV/bin/activate 43 | #pip install -r requirements.txt 44 | python3 -m graph.gen_igraph -i $IN_FILE -o $OUT_DIR -p $OUT_PREFIX 45 | touch $OUT_DIR/${OUT_PREFIX}_SUCCESS 46 | deactivate 47 | -------------------------------------------------------------------------------- /pipeline/run_notify_channel_daily_trending.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts "w:v:c:d" flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | c) CSV_PATH=${OPTARG};; 9 | d) DRYRUN_FLAG="--dry-run";; 10 | esac 11 | done 12 | 13 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$CSV_PATH" ]; then 14 | echo "Usage: $0 -w [work_dir] -v [venv] -c [csv_path] -d" 15 | echo "" 16 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -c channels/Trending_Channels.csv" 17 | echo "Example: $0 -w . 
-v /home/ubuntu/venvs/fc-graph-env3/ -c channels/Trending_Channels.csv -d" 18 | echo "" 19 | echo "Params:" 20 | echo " [work_dir] The working directory to read .env file and execute scripts from." 21 | echo " [venv] The path where a python3 virtualenv has been created." 22 | echo " [csv_path] Path to CSV file." 23 | echo " [dryrun] Flag to run the script in dry-run mode." 24 | echo "" 25 | exit 26 | fi 27 | 28 | source $WORK_DIR/.env 29 | 30 | # set -x 31 | set -e 32 | set -o pipefail 33 | 34 | function log() { 35 | echo "`date` - $1" 36 | } 37 | 38 | source $VENV/bin/activate 39 | #pip install -r requirements.txt 40 | python3 -m channels.main_notify_daily_trending -c "$CSV_PATH" $DRYRUN_FLAG 41 | deactivate 42 | -------------------------------------------------------------------------------- /pipeline/run_notify_channel_leaderboard.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts "w:v:rd" flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | r) RUN_FLAG="--run";; 9 | d) DRYRUN_FLAG="--dry-run";; 10 | esac 11 | done 12 | 13 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$RUN_FLAG" ]; then 14 | echo "Usage: $0 -w [work_dir] -v [venv] -r -d" 15 | echo "" 16 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -r" 17 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -r -d" 18 | echo "" 19 | echo "Params:" 20 | echo " [work_dir] The working directory to read .env file and execute scripts from." 21 | echo " [venv] The path where a python3 virtualenv has been created." 22 | echo " [run] Flag to run the script." 23 | echo " [dryrun] Flag to run the script in dry-run mode." 24 | echo "" 25 | exit 26 | fi 27 | 28 | source $WORK_DIR/.env 29 | 30 | # set -x 31 | set -e 32 | set -o pipefail 33 | 34 | function log() { 35 | echo "`date` - $1" 36 | } 37 | 38 | source $VENV/bin/activate 39 | #pip install -r requirements.txt 40 | python3 -m channels.main_notify_leaderboard $RUN_FLAG $DRYRUN_FLAG 41 | deactivate 42 | -------------------------------------------------------------------------------- /pipeline/run_notify_channel_weekly_mods.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts "w:v:b:s:d" flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | b) BOTS_CSV=${OPTARG};; 9 | s) SINCE_DATETIME=${OPTARG};; 10 | d) DRYRUN_FLAG="--dry-run";; 11 | esac 12 | done 13 | 14 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$BOTS_CSV" ] || [ -z "$SINCE_DATETIME" ]; then 15 | echo "Usage: $0 -w [work_dir] -v [venv] -b [bots_csv] -s [since_datetime] -d" 16 | echo "" 17 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -b channels/Bot_Fids.csv -s '2025-04-23 16:30:00+00:00'" 18 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -b channels/Bot_Fids.csv -s '2025-04-23 16:30:00+00:00' -d" 19 | echo "" 20 | echo "Params:" 21 | echo " [work_dir] The working directory to read .env file and execute scripts from." 22 | echo " [venv] The path where a python3 virtualenv has been created." 23 | echo " [bots_csv] The path to the CSV file that has list of mod bots." 24 | echo " [since_datetime] The datetime to get notifications since." 25 | echo " [dryrun] Flag to run the script in dry-run mode." 
26 | echo "" 27 | exit 28 | fi 29 | 30 | source $WORK_DIR/.env 31 | 32 | # set -x 33 | set -e 34 | set -o pipefail 35 | 36 | function log() { 37 | echo "`date` - $1" 38 | } 39 | 40 | source $VENV/bin/activate 41 | #pip install -r requirements.txt 42 | python3 -m channels.main_notify_weekly_mods -b "$BOTS_CSV" -s "$SINCE_DATETIME" $DRYRUN_FLAG 43 | deactivate 44 | -------------------------------------------------------------------------------- /pipeline/run_update_channel_points.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts w:v:t:p:g: flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | t) TASK=${OPTARG};; 9 | p) POSTGRES=${OPTARG};; 10 | g) GAPFILL_DATE=${OPTARG};; 11 | esac 12 | done 13 | 14 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$TASK" ]; then 15 | echo "Usage: $0 -w [work_dir] -v [venv] -t [task]" 16 | echo "Usage: $0 -w [work_dir] -v [venv] -t [task] -p [postgres]" 17 | echo "Usage: $0 -w [work_dir] -v [venv] -t gapfill -p [postgres] -g [gapfill_date] " 18 | echo "" 19 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t genesis" 20 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t compute" 21 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t update" 22 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t update -p eigen8 -g 2025-04-01" 23 | echo "" 24 | echo "Params:" 25 | echo " [work_dir] The working directory to read .env file and execute scripts from." 26 | echo " [venv] The path where a python3 virtualenv has been created." 27 | echo " [task] The task to perform: daily or distrib." 28 | echo " [postgres] The name of the postgres database to connect to." 29 | echo " [gapfill_date] The date to use for gapfilling in YYYY-MM-DD format." 30 | echo "" 31 | exit 32 | fi 33 | 34 | if [ ! -z "$POSTGRES" ]; then 35 | PG_OPTION="--postgres $POSTGRES" 36 | fi 37 | 38 | if [ "$TASK" = "gapfill" ]; then 39 | if [ -z "$GAPFILL_DATE" ]; then 40 | echo "Please specify -g (gapfill_date) for the gapfill task." 41 | exit 1 42 | fi 43 | fi 44 | 45 | # validating TARGET_MONTH in bash is a bit of a pain 46 | # ... let the python script validate it 47 | if [ ! -z "$GAPFILL_DATE" ]; then 48 | GAPFILL_OPTION="--gapfill-date $GAPFILL_DATE" 49 | fi 50 | 51 | source $WORK_DIR/.env 52 | 53 | # set -x 54 | set -e 55 | set -o pipefail 56 | 57 | function log() { 58 | echo "`date` - $1" 59 | } 60 | 61 | source $VENV/bin/activate 62 | #pip install -r requirements.txt 63 | python3 -m channels.main_points -t "$TASK" $PG_OPTION $GAPFILL_OPTION 64 | deactivate 65 | -------------------------------------------------------------------------------- /pipeline/run_update_channel_tokens.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts w:v:t:s:r:p: flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | v) VENV=${OPTARG};; 8 | t) TASK=${OPTARG};; 9 | s) SCOPE=${OPTARG};; 10 | r) REASON=${OPTARG};; 11 | p) POSTGRES=${OPTARG};; 12 | esac 13 | done 14 | 15 | if [ -z "$WORK_DIR" ] || [ -z "$VENV" ] || [ -z "$TASK" ]; then 16 | echo "Usage: $0 -w [work_dir] -v [venv] -t [task]" 17 | echo "Usage: $0 -w [work_dir] -v [venv] -t [task] -s [scope] -r [reason] -p [postgres]" 18 | echo "" 19 | echo "Example: $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t prep -s weekly -r reason -p eigen8" 20 | echo " $0 -w . -v /home/ubuntu/venvs/fc-graph-env3/ -t distrib" 21 | echo " $0 -w . 
-v /home/ubuntu/venvs/fc-graph-env3/ -t verify" 22 | echo "" 23 | echo "Params:" 24 | echo " [work_dir] The working directory to read .env file and execute scripts from." 25 | echo " [venv] The path where a python3 virtualenv has been created." 26 | echo " [task] The task to perform: prep or distrib or verify." 27 | echo " [scope] The scope of channels to import: airdrop or daily calculation." 28 | echo " [reason] The reason for the distribution." 29 | echo " [postgres] The name of the postgres database to connect to." 30 | echo "" 31 | exit 32 | fi 33 | 34 | if [ "$TASK" = "prep" ]; then 35 | if [ -z "$SCOPE" ] || [ -z "$REASON" ]; then 36 | echo "Please specify -s (scope) and -r (reason) for the prep task." 37 | exit 1 38 | fi 39 | fi 40 | 41 | if [ ! -z "$POSTGRES" ]; then 42 | PG_OPTION="--postgres $POSTGRES" 43 | fi 44 | 45 | source $WORK_DIR/.env 46 | 47 | # set -x 48 | set -e 49 | set -o pipefail 50 | 51 | function log() { 52 | echo "`date` - $1" 53 | } 54 | 55 | source $VENV/bin/activate 56 | #pip install -r requirements.txt 57 | if [ "$TASK" = "prep" ]; then 58 | python3 -m channels.main_tokens -t prep -s "$SCOPE" -r "$REASON" $PG_OPTION 59 | deactivate 60 | elif [ "$TASK" = "distrib" ]; then 61 | python3 -m channels.main_tokens -t distrib $PG_OPTION 62 | deactivate 63 | elif [ "$TASK" = "verify" ]; then 64 | python3 -m channels.main_tokens -t verify $PG_OPTION 65 | deactivate 66 | else 67 | echo "Invalid task specified. Use 'prep', 'distrib' or 'verify'." 68 | exit 1 69 | fi 70 | -------------------------------------------------------------------------------- /pipeline/samples/pretrust.csv: -------------------------------------------------------------------------------- 1 | i,v 2 | 2,0.5 3 | 3,0.5 4 | -------------------------------------------------------------------------------- /pipeline/schema/globaltrust_config.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- PostgreSQL database dump 3 | -- 4 | 5 | -- Dumped from database version 16.2 6 | -- Dumped by pg_dump version 16.2 7 | 8 | SET statement_timeout = 0; 9 | SET lock_timeout = 0; 10 | SET idle_in_transaction_session_timeout = 0; 11 | SET client_encoding = 'UTF8'; 12 | SET standard_conforming_strings = on; 13 | SELECT pg_catalog.set_config('search_path', '', false); 14 | SET check_function_bodies = false; 15 | SET xmloption = content; 16 | SET client_min_messages = warning; 17 | SET row_security = off; 18 | 19 | SET default_tablespace = ''; 20 | 21 | SET default_table_access_method = heap; 22 | 23 | -- 24 | -- Name: globaltrust_config; Type: TABLE; Schema: public; Owner: k3l_user 25 | -- 26 | 27 | CREATE TABLE public.globaltrust_config ( 28 | strategy_id integer NOT NULL, 29 | strategy_name character varying(255) NOT NULL, 30 | pretrust text, 31 | localtrust text, 32 | alpha real, 33 | date date DEFAULT CURRENT_TIMESTAMP NOT NULL 34 | ); 35 | 36 | 37 | ALTER TABLE public.globaltrust_config OWNER TO k3l_user; 38 | 39 | -- 40 | -- Data for Name: globaltrust_config; Type: TABLE DATA; Schema: public; Owner: k3l_user 41 | -- 42 | 43 | COPY public.globaltrust_config (strategy_id, strategy_name, pretrust, localtrust, alpha, date) FROM stdin; 44 | 1 follows pretrustAllEqually existingConnections 0.5 2023-12-07 45 | 3 engagement pretrustAllEqually l1rep6rec3m12enhancedConnections 0.5 2023-12-07 46 | 5 activity pretrustAllEqually l1rep1rec1m1enhancedConnections 0.5 2023-12-07 47 | 7 OG circles pretrustSpecificUsernames existingConnections 0.5 2023-12-07 48 | 9 OG engagement 
pretrustSpecificUsernames l1rep6rec3m12enhancedConnections 0.5 2023-12-07 49 | 11 OG activity pretrustSpecificUsernames l1rep1rec1m1enhancedConnections 0.5 2023-12-07 50 | 1 follows pretrustTopTier existingConnections 0.5 2024-03-14 51 | 3 engagement pretrustTopTier l1rep6rec3m12enhancedConnections 0.5 2024-03-14 52 | 1 follows pretrustTopTier existingConnections 0.5 2024-09-27 53 | 3 engagement pretrustTopTier l1rep6rec3m12enhancedConnections 0.5 2024-09-27 54 | 9 v3engagement v2pretrustTopTier followsboostedl1rep3rec6m12 0.5 2024-09-27 55 | \. 56 | 57 | 58 | -- 59 | -- Name: globaltrust_config globaltrust_config_pkey; Type: CONSTRAINT; Schema: public; Owner: k3l_user 60 | -- 61 | 62 | ALTER TABLE ONLY public.globaltrust_config 63 | ADD CONSTRAINT globaltrust_config_pkey PRIMARY KEY (strategy_id, date); 64 | 65 | 66 | -- 67 | -- Name: TABLE globaltrust_config; Type: ACL; Schema: public; Owner: k3l_user 68 | -- 69 | 70 | GRANT SELECT,REFERENCES ON TABLE public.globaltrust_config TO k3l_readonly; 71 | 72 | 73 | -- 74 | -- PostgreSQL database dump complete 75 | -- 76 | 77 | -------------------------------------------------------------------------------- /pipeline/scripts/archived/run_create_degen_db_functions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | while getopts v:t: flag 7 | do 8 | case "${flag}" in 9 | v) VENV=${OPTARG};; 10 | t) TASK=${OPTARG};; 11 | esac 12 | done 13 | 14 | if [ -z "$VENV" ]; then 15 | echo "Usage: $0 -v [venv]" 16 | echo "" 17 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/" 18 | echo "" 19 | echo "Params:" 20 | echo " [venv] The path where a python3 virtualenv has been created." 21 | echo " [task] The task to perform: 'extract' or 'insert_scores'." 22 | echo "" 23 | exit 24 | fi 25 | 26 | # set -x 27 | set -e 28 | set -o pipefail 29 | 30 | source $VENV/bin/activate 31 | # pip install -r requirements.txt 32 | 33 | echo "Executing task: $TASK" 34 | if [ "$TASK" = "extract" ]; then 35 | python3 -m degen.create_degen_sql_functions 36 | elif [ "$TASK" = "insert_scores" ]; then 37 | python3 -m degen.calculate_rank 38 | else 39 | echo "Invalid task specified. Use 'extract' or 'insert_scores'." 40 | exit 1 41 | fi 42 | deactivate 43 | -------------------------------------------------------------------------------- /pipeline/scripts/archived/run_sandbox_backup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source .env 4 | 5 | set -x 6 | set -e # Exit immediately if a command exits with a non-zero status 7 | set -o pipefail # Ensure pipeline failures are propagated 8 | 9 | 10 | # TODO: move this to cli args 11 | DATE_SUFFIX=$(date +"%Y%m%d" ) 12 | BACKUP_DIR="/tmp/sandbox-backup-$DATE_SUFFIX" 13 | BACKUP_FILE="sandbox_pgdump" 14 | S3_BUCKET='k3l-farcaster-backups' 15 | S3_PREFIX='pg_dump/' 16 | 17 | #DB details 18 | DB_NAME=$SANDBOX_DB_NAME 19 | DB_USER=$SANDBOX_DB_USER 20 | DB_PASSWORD=$SANDBOX_DB_PASSWORD 21 | DB_HOST=$SANDBOX_DB_HOST 22 | DB_PORT=$SSH_LISTEN_PORT 23 | 24 | rm -rf "$BACKUP_DIR" 25 | mkdir -p "$BACKUP_DIR" 26 | 27 | # Perform the backup 28 | echo "Starting backup..." 29 | set +x # Disable command echoing 30 | export PGPASSWORD="$DB_PASSWORD" 31 | set -x # Re-enable command echoing 32 | pg_dump -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME \ 33 | -j 1 \ 34 | -Fd \ 35 | -f "$BACKUP_DIR/$BACKUP_FILE" 36 | unset PGPASSWORD 37 | 38 | # Check if backup was successful 39 | if [ $? 
-eq 0 ]; then 40 | echo "Backup completed successfully" 41 | 42 | # Compress the backup 43 | tar czf "$BACKUP_DIR/$BACKUP_FILE.tgz" -C "$BACKUP_DIR" $BACKUP_FILE 44 | echo "Backup compressed" 45 | 46 | # Upload to S3 47 | echo "Uploading backup to S3..." 48 | aws s3 cp "$BACKUP_DIR/$BACKUP_FILE.tgz" "s3://$S3_BUCKET/$S3_PREFIX$BACKUP_FILE.tgz" 49 | 50 | if [ $? -eq 0 ]; then 51 | echo "Backup successfully uploaded to S3" 52 | rm -rf "$BACKUP_DIR" 53 | else 54 | echo "Failed to upload backup to S3" 55 | exit 1 56 | fi 57 | else 58 | echo "Backup failed" 59 | exit 1 60 | fi 61 | 62 | exit 0 63 | -------------------------------------------------------------------------------- /pipeline/scripts/archived/run_urlextract_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts w: flag 4 | do 5 | case "${flag}" in 6 | w) WORK_DIR=${OPTARG};; 7 | esac 8 | done 9 | 10 | if [ -z "$WORK_DIR" ]; then 11 | echo "Usage: $0 -w [work_dir]" 12 | echo "" 13 | echo "Example: $0 -w ." 14 | echo "" 15 | echo "Params:" 16 | echo " [work_dir] The working directory to read .env file and execute scripts from." 17 | echo "" 18 | exit 19 | fi 20 | 21 | source $WORK_DIR/.env 22 | 23 | DB_HOST=${DB_HOST:-127.0.0.1} 24 | DB_PORT=${DB_PORT:-5432} 25 | DB_USER=${DB_USER:-replicator} 26 | DB_NAME=${DB_NAME:-replicator} 27 | DB_PASSWORD=${DB_PASSWORD:-password} # psql requires PGPASSWORD to be set 28 | 29 | # set -x 30 | set -e 31 | set -o pipefail 32 | 33 | if hash psql 2>/dev/null; then 34 | echo "OK, you have psql in the path. We’ll use that." 35 | PSQL=psql 36 | else 37 | echo "You don't have psql is the path. Let's try /usr/bin" 38 | hash /usr/bin/psql 39 | PSQL=/usr/bin/psql 40 | fi 41 | 42 | function log() { 43 | echo "`date` - $1" 44 | } 45 | 46 | log "Inserting into k3l_url_labels" 47 | PGPASSWORD=$DB_PASSWORD \ 48 | $PSQL -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME \ 49 | -f $WORK_DIR/frames/incremental_load_labels.sql 50 | 51 | wait $! 52 | 53 | log "Inserting into k3l_cast_embed_url_mapping" 54 | PGPASSWORD=$DB_PASSWORD \ 55 | $PSQL -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME \ 56 | -f $WORK_DIR/frames/incremental_load_cast_mapping.sql 57 | 58 | wait $! 59 | 60 | this_name=`basename "$0"` 61 | log "$this_name done!" -------------------------------------------------------------------------------- /pipeline/scripts/one_off/diff_db_table.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | print("Not implemented") 3 | pass -------------------------------------------------------------------------------- /pipeline/scripts/one_off/run_cast_pipeline_gapfills.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | date_format='%Y-%m-%d' 4 | 5 | # Function to validate date format 6 | function validate_date() { 7 | date_to_check=$1 8 | 9 | # Check if the date matches the format YYYY-mm-dd 10 | if [[ $(uname) == "Darwin" ]]; then 11 | if ! date -j -f "$date_format" "$date_to_check" >/dev/null 2>&1; then 12 | echo "Invalid date format. Use YYYY-mm-dd." 13 | exit 1 14 | fi 15 | else 16 | if ! date -d "$date_to_check" +"$date_format" >/dev/null 2>&1; then 17 | echo "Invalid date format. Use YYYY-mm-dd." 
18 | exit 1 19 | fi 20 | fi 21 | 22 | # Check if the date is in the past 23 | today=$(date +"$date_format") 24 | if [ "$date_to_check" \> "$today" ] || [ "$date_to_check" == "$today" ]; then 25 | echo "The date must be in the past and not include today." 26 | exit 1 27 | fi 28 | } 29 | 30 | while getopts v:s:p:e:l: flag 31 | do 32 | case "${flag}" in 33 | v) VENV=${OPTARG};; 34 | s) START_DATE=${OPTARG};; 35 | e) END_DATE=${OPTARG};; 36 | p) POSTGRES=${OPTARG};; 37 | l) SLEEP_TIME=${OPTARG};; 38 | esac 39 | done 40 | 41 | if [ -z "$VENV" ] || [ -z "$START_DATE" ] || [ -z "$END_DATE" ]; then 42 | echo "Usage: $0 -v [venv] -s [start_date] -e [end_date]" 43 | echo "Usage: $0 -v [venv] -s [start_date] -e [end_date] -p [postgres] -l [sleep_time]" 44 | echo "" 45 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/ -s 2025-02-01 -e 2025-02-05" 46 | echo "Example: $0 -v /home/ubuntu/venvs/fc-graph-env3/ -s 2025-02-01 -e 2025-02-05 -p eigen8" 47 | echo "" 48 | echo "Params:" 49 | echo " [venv] The path where a python3 virtualenv has been created." 50 | echo " [start_date] The date to start the gapfilling process." 51 | echo " [end_date] The date to end the gapfilling process." 52 | echo " [postgres] 'eigen2' or 'eigen8'" 53 | echo " [sleep_time] The amount of time to sleep between gapfill runs." 54 | echo "" 55 | exit 56 | fi 57 | 58 | if [ ! -z "$POSTGRES" ]; then 59 | PG_OPTION="--postgres $POSTGRES" 60 | fi 61 | 62 | validate_date $START_DATE 63 | validate_date $END_DATE 64 | 65 | SLEEP_TIME=${SLEEP_TIME:-30s} 66 | 67 | 68 | # set -x 69 | set -e 70 | set -o pipefail 71 | 72 | function log() { 73 | echo "`date` - $1" 74 | } 75 | 76 | source $VENV/bin/activate 77 | # pip install -r requirements.txt 78 | while [[ $START_DATE < $END_DATE ]]; do 79 | DATE_OPTION=(--target-date "$START_DATE 00:00:00") 80 | FILL_TYPE="gapfill" 81 | DAEMON_FLAG="" 82 | log "Running gapfill for $START_DATE" 83 | python3 -m casts.main $PG_OPTION $DAEMON_FLAG -f $FILL_TYPE "${DATE_OPTION[@]}" 84 | log "Sleeping for $SLEEP_TIME" 85 | sleep $SLEEP_TIME 86 | START_DATE=$(date -I -d "$START_DATE + 1 day") 87 | done 88 | deactivate 89 | 90 | log "Done" 91 | -------------------------------------------------------------------------------- /pipeline/sshtunnel.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:3.8 2 | 3 | RUN apk add --no-cache autossh libressl 4 | 5 | RUN mkdir -p ~/.ssh 6 | 7 | ENTRYPOINT ["/usr/bin/autossh", \ 8 | "-M", "0", "-T", "-N", "-g", "-v", \ 9 | "-oStrictHostKeyChecking=no", \ 10 | "-oServerAliveInterval=180", \ 11 | "-oUserKnownHostsFile=/dev/null", \ 12 | "-oGlobalKnownHostsFile=/dev/null", \ 13 | "-i/root/.ssh/id_rsa"] -------------------------------------------------------------------------------- /pipeline/timer.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://realpython.com/python-timer/#the-python-timer-code 2 | import time 3 | from contextlib import ContextDecorator 4 | from dataclasses import dataclass, field 5 | from typing import Any, Callable, ClassVar, Dict, Optional 6 | 7 | class TimerError(Exception): 8 | """A custom exception used to report errors in use of Timer class""" 9 | 10 | @dataclass 11 | class Timer(ContextDecorator): 12 | """Time your code using a class, context manager, or decorator 13 | Class: 14 | ====== 15 | t = Timer(name="class") 16 | t.start() 17 | # Do something 18 | t.stop() 19 | Context Manager: 20 | ================ 21 | with 
Timer(name="context manager"): 22 | # Do something 23 | Decorator: 24 | ========== 25 | @Timer(name="decorator") 26 | def stuff(): 27 | # Do something 28 | """ 29 | 30 | timers: ClassVar[Dict[str, float]] = {} 31 | name: Optional[str] = None 32 | text: str = "Elapsed time: {n} took {t:0.4f} seconds" 33 | logger: Optional[Callable[[str], None]] = print 34 | _start_time: Optional[float] = field(default=None, init=False, repr=False) 35 | 36 | def __post_init__(self) -> None: 37 | """Initialization: add timer to dict of timers""" 38 | if self.name: 39 | self.timers.setdefault(self.name, 0) 40 | 41 | def start(self) -> None: 42 | """Start a new timer""" 43 | if self._start_time is not None: 44 | raise TimerError(f"Timer is running. Use .stop() to stop it") 45 | self.logger("Start a new timer: {n}".format(n=self.name)) 46 | self._start_time = time.perf_counter() 47 | 48 | def stop(self) -> float: 49 | """Stop the timer, and report the elapsed time""" 50 | if self._start_time is None: 51 | raise TimerError(f"Timer is not running. Use .start() to start it") 52 | 53 | # Calculate elapsed time 54 | elapsed_time = time.perf_counter() - self._start_time 55 | self._start_time = None 56 | 57 | # Report elapsed time 58 | if self.logger: 59 | self.logger(self.text.format(n=self.name, t=elapsed_time)) 60 | if self.name: 61 | self.timers[self.name] += elapsed_time 62 | 63 | return elapsed_time 64 | 65 | def __enter__(self) -> "Timer": 66 | """Start a new timer as a context manager""" 67 | self.start() 68 | return self 69 | 70 | def __exit__(self, *exc_info: Any) -> None: 71 | """Stop the context manager timer""" 72 | self.stop() -------------------------------------------------------------------------------- /pipeline/tmp/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/pipeline/tmp/.placeholder -------------------------------------------------------------------------------- /scripts/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/scripts/.placeholder -------------------------------------------------------------------------------- /scripts/certs/graphcast_jobs/.env.sample: -------------------------------------------------------------------------------- 1 | NGINX_CONFIG="/etc/nginx/sites-enabled/graph.cast.k3l.io" 2 | WORK_DIR="/home/ubuntu/graphcast_jobs" 3 | REMOTE_USER="ubuntu" 4 | REMOTE_DIR="/home/ubuntu/graphcast_jobs/" 5 | SSH_PRIV_KEY="/home/ubuntu/.ssh/id_graphcast_jobs" 6 | 7 | -------------------------------------------------------------------------------- /scripts/certs/graphcast_jobs/README.md: -------------------------------------------------------------------------------- 1 | We use letsencrypt to issue SSL certs for our domains. 2 | 3 | # Step 1. graph.castN.k3l.io 4 | 5 | For example, graph.cast9.k3l.io. This sub-domain is not load-balanced but is very useful when we want to simulate a blue-green deployment. Setting up this sub-domain also makes the next step simple. 6 | 7 | A typical crontab to both **install** as well as **renew** certs looks like this: 8 | ``` 9 | 1 0 */7 * * sudo certbot run --nginx -d graph.cast9.k3l.io -m ops@karma3labs.com --agree-tos -n 10 | ``` 11 | This crontab assumes that `/etc/nginx/sites-available/` is already configured for the sub-domain name.
12 | 13 | This repo has a sample nginx file that you can use. **REMEMBER** to replace `N` with your preferred number. 14 | Also, **REMEMBER** to soft link the config file `sudo ln -s /etc/nginx/sites-available/graph.castN.k3l.io /etc/nginx/sites-enabled/` 15 | **NOTE** the sample file does not have ssl config because certbot will add the appropriate config when certbot is run for the first time `sudo certbot run --nginx -d graph.castN.k3l.io -m ops@karma3labs.com --agree-tos -n` 16 | 17 | # Step 2. graph.cast.k3l.io 18 | The sub-domain `graph.cast.k3l.io` is load-balanced across multiple machines. When renewing certs, we cannot have each machine renew its own cert and invalidate the certs on the others. So, we renew certs on 1 machine and push the cert to all the other machines. 19 | 20 | The `push_certs.sh` script pushes the renewed cert from the primary to the other machines, while `install_certs.sh` installs the pushed cert on each of those machines. 21 | 22 | #### Pre-req 23 | `/etc/nginx/sites-available/` should have a config for `graph.cast.k3l.io` 24 | 25 | This repo has a sample nginx file that you can use. **REMEMBER** to replace `CHANGME_OPENSSL_RAND_KEY` with a strong api key. Also, **REMEMBER** to soft link the config file `sudo ln -s /etc/nginx/sites-available/graph.cast.k3l.io /etc/nginx/sites-enabled/` 26 | 27 | #### Cronjobs 28 | A typical crontab on the **"primary"** host looks like this: 29 | ``` 30 | 15 0 */7 * * sudo certbot run --nginx -d graph.cast.k3l.io -m ops@karma3labs.com --agree-tos -n >> /var/log/farcaster-graph/graphcast_jobs.log 2>&1; sudo nginx -s reload >> /var/log/farcaster-graph/graphcast_jobs.log 2>&1; date >> /var/log/farcaster-graph/graphcast_jobs.log ; cd /home/ubuntu/graphcast_jobs; ./push_certs.sh -h 162.55.109.106 >> /var/log/farcaster-graph/graphcast_jobs.log 2>&1; 31 | ``` 32 | 1. renew cert `sudo certbot run --nginx -d graph.cast.k3l.io -m ops@karma3labs.com --agree-tos -n` 33 | 2. reload nginx locally to make sure cert is fine `sudo nginx -s reload` 34 | 3. push renewed cert to 162.55.109.106 `./push_certs.sh -h 162.55.109.106` 35 | 36 | And, the crontab on the **"secondary"** host looks like this: 37 | ``` 38 | 30 0 */7 * * date >> /var/log/farcaster-graph/graphcast_jobs.log ; cd /home/ubuntu/graphcast_jobs; ./install_certs.sh >> /var/log/farcaster-graph/graphcast_jobs.log 2>&1 39 | ``` 40 | 1. install cert assuming that graph.cast.k3l.io nginx config already exists and the "primary" server has scp'd over the pem files.
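After a renewal cycle, it can be worth confirming that every backend behind `graph.cast.k3l.io` is actually serving the refreshed cert. A minimal sketch of such a check (not part of this repo; the host IPs are simply the examples used above and in `push_certs.sh`):
```
# compare the certificate expiry dates served by each backend host
for host in 162.55.109.106 37.27.108.188; do
  echo "checking graph.cast.k3l.io via ${host}"
  echo | openssl s_client -connect "${host}:443" -servername graph.cast.k3l.io 2>/dev/null \
    | openssl x509 -noout -enddate
done
```
If the `notAfter=` dates differ across hosts, the push/install step on the lagging host most likely failed; check `/var/log/farcaster-graph/graphcast_jobs.log` on that host.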
-------------------------------------------------------------------------------- /scripts/certs/graphcast_jobs/graph.castN.k3l.io: -------------------------------------------------------------------------------- 1 | # Allow listed IP addresses with no rate limits 2 | geo $limit { 3 | default 1; 4 | 10.0.0.0/8 0; 5 | 127.0.0.1/32 0; 6 | 192.168.0.0/24 0; 7 | } 8 | 9 | map $limit $limit_key { 10 | 0 ""; 11 | 1 $binary_remote_addr; 12 | } 13 | 14 | # Specify 10 MB storage of binary IP addresses to keep track of 1.6 mil addresses 15 | # to limit at 5 requests/second 16 | limit_req_zone $limit_key zone=graph_castN_zone:10m rate=5r/s; 17 | 18 | server { 19 | server_name graph.castN.k3l.io; 20 | 21 | location ~* \.(env|git|bak|config|log|sh).* { 22 | deny all; 23 | return 404; 24 | } 25 | 26 | 27 | location ~ ^/(_pause|_resume) { 28 | return 404; 29 | } 30 | 31 | location / { 32 | # apply rate limit 33 | limit_req zone=graph_castN_zone burst=10; 34 | proxy_pass http://localhost:8000; 35 | proxy_http_version 1.1; 36 | proxy_set_header Upgrade $http_upgrade; 37 | proxy_set_header Connection 'upgrade'; 38 | proxy_set_header Host $host; 39 | proxy_cache_bypass $http_upgrade; 40 | } 41 | 42 | } 43 | 44 | server { 45 | server_name graph.castN.k3l.io; 46 | 47 | location ~* \.(woff|jpg|jpeg|png|gif|ico|css|js)$ { 48 | access_log off; 49 | } 50 | 51 | listen 80; 52 | } 53 | -------------------------------------------------------------------------------- /scripts/certs/graphcast_jobs/install_certs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # Function to log messages with a timestamp 6 | log_message() { 7 | echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" 8 | } 9 | 10 | # Source the environment variables from the .env file 11 | if [ -f .env ]; then 12 | source .env 13 | else 14 | log_message "Error: .env file not found." 15 | exit 1 16 | fi 17 | 18 | # Check if CONFIG and WORK_DIR are set 19 | if [ -z "$NGINX_CONFIG" ] || [ -z "$WORK_DIR" ]; then 20 | log_message "Error: CONFIG and WORK_DIR environment variables must be set." 21 | exit 1 22 | fi 23 | 24 | log_message "Starting check_certificates.sh script." 25 | 26 | # Extract the certificate file paths from the Nginx config file 27 | log_message "Extracting certificate file paths from the Nginx config file." 28 | CERT_FILES=$(grep -E 'ssl_certificate|ssl_certificate_key' $NGINX_CONFIG | awk '{print $2}' | tr -d ';') 29 | 30 | # Flag to indicate if any files were moved 31 | FILES_MOVED=false 32 | 33 | # Check and move the files if they exist 34 | for FILE in $CERT_FILES; do 35 | FILE_NAME=$(basename $FILE) 36 | DIR_NAME=$(dirname $FILE) 37 | if [ -f ${WORK_DIR}/${FILE_NAME} ]; then 38 | log_message "Moving ${WORK_DIR}/${FILE_NAME} to $FILE." 39 | sudo mkdir -p $DIR_NAME 40 | sudo mv ${WORK_DIR}/${FILE_NAME} $FILE 41 | FILES_MOVED=true 42 | else 43 | log_message "File ${WORK_DIR}/${FILE_NAME} not found." 44 | fi 45 | done 46 | 47 | # Reload Nginx if any files were moved 48 | if [ "$FILES_MOVED" = true ]; then 49 | log_message "Files moved. Reloading Nginx." 50 | sudo nginx -s reload 51 | else 52 | log_message "No files moved. Nginx reload not required." 53 | fi 54 | 55 | log_message "Script completed." 
56 | -------------------------------------------------------------------------------- /scripts/certs/graphcast_jobs/push_certs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while getopts h: flag 4 | do 5 | case "${flag}" in 6 | h) REMOTE_HOST=${OPTARG};; 7 | esac 8 | done 9 | 10 | if [ -z "$REMOTE_HOST" ]; then 11 | echo "Usage: $0 -h [remote_host]" 12 | echo "" 13 | echo "Example: $0 -h 37.27.108.188" 14 | echo "" 15 | echo "Params:" 16 | echo " [remote_host] host to which the pem files have to be copied over to" 17 | echo"" 18 | exit 19 | fi 20 | 21 | 22 | # Function to log messages with a timestamp 23 | log_message() { 24 | echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" 25 | } 26 | 27 | # Source the environment variables from the .env file 28 | if [ -f .env ]; then 29 | log_message "Loading .env file." 30 | source .env 31 | else 32 | log_message "Error: .env file not found." 33 | exit 1 34 | fi 35 | 36 | # Check if NGINX_CONFIG, REMOTE_USER, REMOTE_HOST, and REMOTE_DIR are set 37 | if [ -z "$NGINX_CONFIG" ] || [ -z "$REMOTE_USER" ] || [ -z "$REMOTE_DIR" ] || [ -z "$SSH_PRIV_KEY" ]; then 38 | log_message "Error: NGINX_CONFIG, REMOTE_USER, REMOTE_HOST, REMOTE_DIR and SSH_PRIV_KEY environment variables must be set." 39 | exit 1 40 | fi 41 | 42 | log_message "Starting sync_certificates.sh script." 43 | 44 | # Extract the certificate file paths from the Nginx config file 45 | log_message "Extracting certificate file paths from the Nginx config file." 46 | CERT_FILES=$(grep -E 'ssl_certificate|ssl_certificate_key' $NGINX_CONFIG | awk '{print $2}' | tr -d ';') 47 | 48 | # SCP the certificate files to the remote server 49 | for FILE in $CERT_FILES; do 50 | log_message "Transferring $FILE to ${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_DIR}." 51 | sudo scp -p -i $SSH_PRIV_KEY $FILE ${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_DIR} 52 | done 53 | 54 | log_message "Script completed." 55 | -------------------------------------------------------------------------------- /serve/.dockerignore: -------------------------------------------------------------------------------- 1 | .venv 2 | .env.sample 3 | scratchpad.md -------------------------------------------------------------------------------- /serve/.env.sample: -------------------------------------------------------------------------------- 1 | DB_PASSWORD=password 2 | DB_HOST=host 3 | DB_NAME=postgres 4 | DB_USERNAME=postgres 5 | DB_PORT=5432 6 | 7 | GO_EIGENTRUST_URL=http://localhost:8080 8 | 9 | FOLLOW_GRAPH_PATHPREFIX=./samples/fc_following_fid 10 | ENGAGEMENT_GRAPH_PATHPREFIX=./samples/fc_engagement_fid 11 | NINETYDAYS_GRAPH_PATHPREFIX=./samples/fc_90dv3_fid 12 | 13 | # SWAGGER_BASE_URL='CHANGE THIS AND UNCOMMENT' 14 | # CURA_API_KEY='CHANGE THIS AND UNCOMMENT' 15 | 16 | USE_PANDAS_PERF='True or False ?' 
17 | # optional overrides 18 | # LOG_LEVEL=INFO 19 | # LOG_LEVEL_CORE='DEBUG' 20 | # LOGURU_FORMAT='{time:YYYY-MM-DD HH:mm:ss} | {module}:{file}:{function}:{line} | {level} | {message}' 21 | 22 | # POSTGRES_POOL_SIZE=5 23 | # POSTGRES_ECHO=False 24 | # POSTGRES_TIMEOUT_SECS=60 25 | 26 | # EIGENTRUST_ALPHA=0.5 27 | # EIGENTRUST_EPSILON=1.0 28 | # EIGENTRUST_MAX_ITER=50 29 | # EIGENTRUST_FLAT_TAIL=2 30 | # GO_EIGENTRUST_TIMEOUT_MS=3000 31 | 32 | # CURA_API_ENDPOINT=https://cura.network/api 33 | -------------------------------------------------------------------------------- /serve/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /serve/.idea/.name: -------------------------------------------------------------------------------- 1 | farcaster-graph-serve -------------------------------------------------------------------------------- /serve/.idea/codeStyles/codeStyleConfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /serve/.idea/dataSources.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | postgresql 6 | true 7 | org.postgresql.Driver 8 | jdbc:postgresql://localhost:9541/postgres 9 | $ProjectFileDir$ 10 | 11 | 12 | -------------------------------------------------------------------------------- /serve/.idea/data_source_mapping.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /serve/.idea/farcaster-graph-serve.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /serve/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /serve/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 77 | -------------------------------------------------------------------------------- /serve/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /serve/.idea/sqldialects.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /serve/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 
| 6 | -------------------------------------------------------------------------------- /serve/.idea/watcherTasks.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 24 | 25 | -------------------------------------------------------------------------------- /serve/Dockerfile: -------------------------------------------------------------------------------- 1 | # FROM python:3.12-alpine 2 | # not taking the alpine route because packages like psutil don't install without gcc 3 | FROM python:3.12-slim 4 | 5 | RUN pip install --upgrade pip 6 | 7 | RUN pip install poetry 8 | 9 | # single app container 10 | # no need to create virtual envs 11 | # install dependencies into the systems python environment 12 | ENV POETRY_VERSION=1.7.1 \ 13 | POETRY_NO_INTERACTION=1 \ 14 | POETRY_VIRTUALENVS_CREATE=false 15 | 16 | WORKDIR /code 17 | 18 | COPY pyproject.toml poetry.lock ./ 19 | COPY README.md ./ 20 | 21 | # we don't want to rebuild all the layers after every app code change 22 | # ignore app code for now 23 | # uncomment the next line if we start using dev/test specific dependencies 24 | # RUN poetry install --without dev,test --no-root 25 | RUN poetry install --no-root 26 | 27 | COPY ./app /code/app 28 | COPY ./static /code/static 29 | COPY .env.docker ./.env 30 | 31 | # install app code, this is the last image layer and has to be rebuilt 32 | # uncomment the next line if we start using dev/test specific dependencies 33 | # RUN poetry install --without dev,test 34 | RUN poetry install --no-root 35 | 36 | EXPOSE 8000 37 | 38 | CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] 39 | -------------------------------------------------------------------------------- /serve/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/app/__init__.py -------------------------------------------------------------------------------- /serve/app/dependencies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/app/dependencies/__init__.py -------------------------------------------------------------------------------- /serve/app/dependencies/cache_db_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from asyncpg.pool import Pool 4 | from loguru import logger 5 | 6 | 7 | async def set_homefeed_for_fid( 8 | fid: int, cids: list[str], offset: int, cache_pool: Pool 9 | ): 10 | 11 | session_data = {"api": "homefeed", "cids": cids, "offset": offset} 12 | session_value = json.dumps(session_data) 13 | key = f"session:{fid}" 14 | 15 | # TODO update db using cache_pool 16 | pass 17 | 18 | 19 | async def get_homefeed_for_fid(fid: int, cache_pool: Pool) -> dict: 20 | 21 | key = f"session:{fid}" 22 | 23 | # TODO get cached data from db using cache_pool 24 | 25 | return {"cids": [], "offset": 0} 26 | -------------------------------------------------------------------------------- /serve/app/dependencies/db_pool.py: -------------------------------------------------------------------------------- 1 | from fastapi import Request 2 | 3 | 4 | # dependency to make it explicit that routers are accessing hidden state 5 | def get_db(request: Request): 6 | return request.state.db_pool 7 | 8 | 9 | def 
get_cache_db(request: Request): 10 | return request.state.cache_db_pool 11 | -------------------------------------------------------------------------------- /serve/app/dependencies/logging.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import logging 3 | 4 | from fastapi import Request 5 | from loguru import logger 6 | 7 | 8 | async def get_logger(request: Request): 9 | logger.debug(f"{request.method} {request.url}") 10 | logger.debug("Params:") 11 | for name, value in request.path_params.items(): 12 | logger.debug(f"\t{name}: {value}") 13 | logger.debug("Headers:") 14 | for name, value in request.headers.items(): 15 | logger.debug(f"\t{name}: {value}") 16 | 17 | 18 | class InterceptHandler(logging.Handler): 19 | """ 20 | This intercept allows loguru to work with Python's standard logging module. 21 | https://loguru.readthedocs.io/en/stable/overview.html#entirely-compatible-with-standard-logging 22 | """ 23 | 24 | def emit(self, record: logging.LogRecord) -> None: 25 | # Get corresponding Loguru level if it exists. 26 | level: str | int 27 | try: 28 | level = logger.level(record.levelname).name 29 | except ValueError: 30 | level = record.levelno 31 | 32 | # Find caller from where originated the logged message. 33 | frame, depth = inspect.currentframe(), 0 34 | while frame and (depth == 0 or frame.f_code.co_filename == logging.__file__): 35 | frame = frame.f_back 36 | depth += 1 37 | 38 | logger.opt(depth=depth, exception=record.exc_info).log( 39 | level, record.getMessage() 40 | ) 41 | -------------------------------------------------------------------------------- /serve/app/dependencies/memoize_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Tuple 2 | 3 | from memoize.key import KeyExtractor 4 | 5 | 6 | class EncodedMethodNameAndArgsExcludedKeyExtractor(KeyExtractor): 7 | """Encodes method name, args & kwargs to string and uses that as cache entry key. 8 | This KeyExtractor is class-centric and creates same keys for all objects of the same type. 9 | You can exclude args and kwargs by setting 'skip_args' and 'skip_kwargs' flags. 10 | 11 | Note: If wrapped function is a method (has 'self' as first positional arg) you may want to exclude 'self' from key 12 | by setting 'skip_first_arg_as_self' flag. 13 | For static methods of ordinary functions flag should be set to 'False'. 
14 | 15 | Warning: uses method name only, so be cautious and do not wrap methods of different classes with the same names 16 | while using same store and 'skip_first_arg_as_self' set to False.""" 17 | 18 | def __init__( 19 | self, 20 | skip_first_arg_as_self=False, 21 | skip_args: list[int] = [], 22 | skip_kwargs: list[str] = [], 23 | ) -> None: 24 | self._skip_first_arg_as_self = skip_first_arg_as_self 25 | self._skip_args = skip_args 26 | self._skip_kwargs = skip_kwargs 27 | 28 | def format_key( 29 | self, method_reference, call_args: Tuple[Any, ...], call_kwargs: Dict[str, Any] 30 | ) -> str: 31 | if self._skip_args: 32 | call_args = [ 33 | arg for i, arg in enumerate(call_args) if i not in self._skip_args 34 | ] 35 | if self._skip_kwargs: 36 | call_kwargs = { 37 | k: v for k, v in call_kwargs.items() if k not in self._skip_kwargs 38 | } 39 | if self._skip_first_arg_as_self: 40 | call_args.pop(0) 41 | 42 | return str( 43 | ( 44 | method_reference.__name__, 45 | call_args, 46 | call_kwargs, 47 | ) 48 | ) 49 | 50 | def __str__(self) -> str: 51 | return self.__repr__() 52 | 53 | def __repr__(self) -> str: 54 | return ( 55 | f"{self.__class__}" 56 | f"[skip_first_arg_as_self={self._skip_first_arg_as_self}]" 57 | f"[skip_args={self._skip_args}]" 58 | ) 59 | -------------------------------------------------------------------------------- /serve/app/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/app/models/__init__.py -------------------------------------------------------------------------------- /serve/app/models/channel_model.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, StrEnum 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class ChannelRankingsTimeframe(str, Enum): 7 | LIFETIME = 'lifetime' 8 | SIXTY_DAYS = '60d' 9 | SEVEN_DAYS = '7d' 10 | ONE_DAY = '1d' 11 | 12 | 13 | CHANNEL_RANKING_STRATEGY_NAMES = { 14 | ChannelRankingsTimeframe.LIFETIME: 'channel_engagement', 15 | ChannelRankingsTimeframe.SIXTY_DAYS: '60d_engagement', 16 | ChannelRankingsTimeframe.SEVEN_DAYS: '7d_engagement', 17 | ChannelRankingsTimeframe.ONE_DAY: '1d_engagement', 18 | } 19 | 20 | 21 | class OpenrankCategory(StrEnum): 22 | TEST = 'test' 23 | PROD = 'prod' 24 | 25 | 26 | # Deprecated 27 | class ChannelPointsOrderBy(StrEnum): 28 | TOTAL_POINTS = 'total_points' 29 | DAILY_POINTS = 'daily_points' 30 | 31 | 32 | class ChannelEarningsOrderBy(StrEnum): 33 | TOTAL = 'total' 34 | WEEKLY = 'weekly' 35 | DAILY = 'daily' 36 | LATEST = 'latest' 37 | 38 | 39 | class ChannelEarningsScope(StrEnum): 40 | AIRDROP = 'airdrop' 41 | DAILY = 'daily' 42 | 43 | 44 | class ChannelEarningsType(StrEnum): 45 | POINTS = 'points' 46 | TOKENS = 'tokens' 47 | 48 | 49 | class ChannelFidType(StrEnum): 50 | MEMBER = 'member' 51 | FOLLOWER = 'follower' 52 | -------------------------------------------------------------------------------- /serve/app/models/graph_model.py: -------------------------------------------------------------------------------- 1 | import io 2 | from enum import Enum 3 | from typing import NamedTuple 4 | 5 | import igraph 6 | import pandas 7 | 8 | 9 | class GraphType(Enum): 10 | following = 1 11 | # engagement = 3 12 | # v3engagement = 9 13 | ninetydays = 5 14 | 15 | 16 | class GraphTimeframe(str, Enum): 17 | # lifetime = "lifetime" 18 | ninetydays = "90d" 19 | 20 | 21 | class Graph(NamedTuple): 22 | 
success_file: str 23 | df: pandas.DataFrame 24 | graph: igraph.Graph 25 | type: GraphType 26 | mtime: float 27 | 28 | def __str__(self): 29 | df_info = io.StringIO() 30 | self.df.info(buf=df_info) 31 | return f""" 32 | type: {self.type} 33 | dataframe: {df_info.getvalue()} 34 | igraph: {self.graph.summary()} 35 | mtime: {self.mtime} 36 | """ 37 | -------------------------------------------------------------------------------- /serve/app/models/score_model.py: -------------------------------------------------------------------------------- 1 | import re 2 | from enum import StrEnum 3 | from typing import NamedTuple, Self 4 | 5 | 6 | class ScoreAgg(StrEnum): 7 | RMS = 'rms' 8 | SUMSQUARE = 'sumsquare' 9 | SUM = 'sum' 10 | SUMCUBEROOT = 'sumcuberoot' 11 | 12 | 13 | class Voting(StrEnum): 14 | SINGLE = 'single' 15 | MULTIPLE = 'multiple' 16 | # TODO 17 | # QUADRATIC = 'quadratic' 18 | 19 | 20 | class QueryType(StrEnum): 21 | SUPERLITE = 'superlite' 22 | LITE = 'lite' 23 | HEAVY = 'heavy' 24 | 25 | 26 | class EngagementType(StrEnum): 27 | V1 = '1.0' 28 | V3 = '2.0' 29 | 30 | 31 | engagement_ids = dict() 32 | engagement_ids[EngagementType.V1] = 3 33 | engagement_ids[EngagementType.V3] = 9 34 | 35 | 36 | class Weights(NamedTuple): 37 | cast: int = 10 38 | recast: int = 5 39 | reply: int = 7 40 | like: int = 1 41 | 42 | @staticmethod 43 | def from_str(weights_str: str) -> Self: 44 | wts = re.search( 45 | r'^([lL](\d{1,2}))?([cC](\d{1,2}))?([rR](\d{1,2}))?([yY](\d{1,2}))?$', 46 | weights_str, 47 | ) 48 | if wts is None: 49 | raise Exception("Invalid weights") 50 | return Weights( 51 | like=0 if wts.group(2) is None else wts.group(2), 52 | cast=0 if wts.group(4) is None else wts.group(4), 53 | recast=0 if wts.group(6) is None else wts.group(6), 54 | reply=0 if wts.group(8) is None else wts.group(8), 55 | ) 56 | -------------------------------------------------------------------------------- /serve/app/routers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/app/routers/__init__.py -------------------------------------------------------------------------------- /serve/app/routers/token_router.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Sequence 2 | from typing import Self 3 | 4 | from asyncpg import Pool 5 | from eth_typing import ChecksumAddress 6 | from eth_utils import to_bytes, to_checksum_address 7 | from fastapi import APIRouter, Depends, HTTPException, Path, Query 8 | from pydantic import BaseModel, ValidationError, field_validator 9 | 10 | from ..dependencies import db_pool 11 | from ..dependencies.db_utils import get_token_balances 12 | 13 | router = APIRouter(prefix="/{token}", tags=["Token"]) 14 | 15 | 16 | class Token(BaseModel): 17 | """ 18 | Token address. 19 | 20 | TODO(ek) - expand to CAIP-19, to add chain ID and stuff. 
21 | """ 22 | 23 | address: ChecksumAddress 24 | 25 | @field_validator("address", mode="before") 26 | def ensure_address(cls, v): 27 | try: 28 | return to_checksum_address(v) 29 | except Exception: 30 | raise ValueError(f"Invalid token address: {v!r}") 31 | 32 | @classmethod 33 | def from_str(cls, v: str) -> Self: 34 | return cls(address=to_checksum_address(v)) 35 | 36 | 37 | def get_token(token: str = Path(description="ERC20 token address")) -> Token: 38 | try: 39 | return Token.from_str(token) 40 | except ValidationError as e: 41 | raise HTTPException(status_code=422, detail=f"Invalid token {token!r}") 42 | 43 | 44 | @router.get("/balances") 45 | async def get_balances( 46 | token: Token = Depends(get_token), 47 | fids: Sequence[int] = Query(..., alias='fid', min_items=1), 48 | pool: Pool = Depends(db_pool.get_db), 49 | ): 50 | rows = await get_token_balances(to_bytes(hexstr=token.address), fids, pool) 51 | balances = {fid: value for fid, value in rows} 52 | return { 53 | "balances": [ 54 | {"fid": fid, "value": str(int(balances.get(fid, 0)))} for fid in fids 55 | ] 56 | } 57 | -------------------------------------------------------------------------------- /serve/app/routers/user_router.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated, List, Optional 2 | 3 | from asyncpg.pool import Pool 4 | from fastapi import APIRouter, Depends, Header, Query 5 | from loguru import logger 6 | 7 | from ..dependencies import db_pool, db_utils, graph 8 | 9 | router = APIRouter(tags=["User Labels (Requires API Key)"]) 10 | 11 | 12 | @router.get("/labels/global/top_casters") 13 | async def get_top_global_casters( 14 | x_api_key: Optional[str] = Header(None), # used only for swagger ui 15 | offset: Annotated[int | None, Query()] = 0, 16 | limit: Annotated[int | None, Query(le=1000)] = 100, 17 | pool: Pool = Depends(db_pool.get_db), 18 | ): 19 | """ 20 | Get the top global casters 21 | This API takes optional parameters - 22 | offset and limit 23 | Parameter 'offset' is used to specify how many results to skip 24 | and can be useful for paginating through results. \n 25 | Parameter 'limit' is used to specify the number of results to return. \n 26 | Header 'x-api-key' is used to authenticate the user. Please contact hello@karma3labs.com or https://t.me/Karma3Labs to get the trial API key. \n 27 | """ 28 | 29 | top_casters = await db_utils.get_top_casters(offset=offset, limit=limit, pool=pool) 30 | return {"result": top_casters} 31 | 32 | 33 | @router.get("/labels/global/top_spammers") 34 | async def get_top_global_spammers( 35 | x_api_key: Optional[str] = Header(None), # used only for swagger ui 36 | offset: Annotated[int | None, Query()] = 0, 37 | limit: Annotated[int | None, Query(le=1000)] = 100, 38 | pool: Pool = Depends(db_pool.get_db), 39 | ): 40 | """ 41 | Get the top global spammers 42 | This API takes optional parameters - 43 | offset and limit 44 | Parameter 'offset' is used to specify how many results to skip 45 | and can be useful for paginating through results. \n 46 | Parameter 'limit' is used to specify the number of results to return. \n 47 | Header 'x-api-key' is used to authenticate the user. Please contact hello@karma3labs.com or https://t.me/Karma3Labs to get the trial API key. 
\n 48 | """ 49 | 50 | top_spammers = await db_utils.get_top_spammers( 51 | offset=offset, limit=limit, pool=pool 52 | ) 53 | return {"result": top_spammers} 54 | -------------------------------------------------------------------------------- /serve/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | farcaster-graph: 3 | build: . 4 | container_name: farcaster-graph 5 | image: farcaster-graph:latest 6 | volumes: 7 | - /home/ubuntu/serve_files:/tmp 8 | environment: 9 | PORT: 8000 10 | ports: 11 | - '8000:8000' 12 | deploy: 13 | resources: 14 | limits: 15 | memory: 64G 16 | restart: unless-stopped 17 | extra_hosts: 18 | - "host.docker.internal:host-gateway" 19 | networks: 20 | - farcaster-network 21 | 22 | networks: 23 | farcaster-network: 24 | name: farcaster-network 25 | external: true -------------------------------------------------------------------------------- /serve/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "serve" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Vijay Mariadassou "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.12" 10 | fastapi = "^0.109.0" 11 | uvicorn = "^0.27.0" 12 | asyncpg = "^0.29.0" 13 | sqlalchemy = "^2.0.25" 14 | loguru = "^0.7.2" 15 | igraph = "^0.11.3" 16 | pydantic-settings = "^2.1.0" 17 | psutil = "^5.9.8" 18 | pandas = {extras = ["performance"], version = "^2.2.2"} 19 | numpy = "^1.26.4" 20 | requests = "^2.31.0" 21 | opentelemetry-distro = "0.43b0" 22 | opentelemetry-instrumentation-fastapi = "0.43b0" 23 | opentelemetry-instrumentation-logging = "0.43b0" 24 | opentelemetry-exporter-otlp = "1.22.0" 25 | prometheus-client = "0.19.0" 26 | asgi-correlation-id = "^4.3.1" 27 | niquests = "^3.14.0" 28 | py-memoize = "^3.1.1" 29 | black = "^25.1.0" 30 | async-lru = "^2.0.5" 31 | isort = "^6.0.1" 32 | eth-typing = "^5.2.1" 33 | eth-utils = "^5.3.0" 34 | eth-hash = {extras = ["pycryptodome"], version = "^0.7.1"} 35 | cashews = {extras = ["diskcache"], version = "^7.4.0"} 36 | 37 | [build-system] 38 | requires = ["poetry-core"] 39 | build-backend = "poetry.core.masonry.api" 40 | 41 | [project] 42 | name = "serve" 43 | version = "0.1.0" 44 | requires-python = ">=3.12" 45 | 46 | [tool.black] 47 | skip-string-normalization = true 48 | -------------------------------------------------------------------------------- /serve/samples/fc_90dv3_fid_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_90dv3_fid_SUCCESS -------------------------------------------------------------------------------- /serve/samples/fc_90dv3_fid_df.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_90dv3_fid_df.pkl -------------------------------------------------------------------------------- /serve/samples/fc_90dv3_fid_ig.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_90dv3_fid_ig.pkl -------------------------------------------------------------------------------- /serve/samples/fc_engagement_SUCCESS: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_SUCCESS -------------------------------------------------------------------------------- /serve/samples/fc_engagement_df.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_df.pkl -------------------------------------------------------------------------------- /serve/samples/fc_engagement_fid_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_fid_SUCCESS -------------------------------------------------------------------------------- /serve/samples/fc_engagement_fid_df.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_fid_df.pkl -------------------------------------------------------------------------------- /serve/samples/fc_engagement_fid_ig.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_fid_ig.pkl -------------------------------------------------------------------------------- /serve/samples/fc_engagement_idx.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_idx.pkl -------------------------------------------------------------------------------- /serve/samples/fc_engagement_ig.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_engagement_ig.pkl -------------------------------------------------------------------------------- /serve/samples/fc_following_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_SUCCESS -------------------------------------------------------------------------------- /serve/samples/fc_following_df.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_df.pkl -------------------------------------------------------------------------------- /serve/samples/fc_following_fid_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_fid_SUCCESS -------------------------------------------------------------------------------- /serve/samples/fc_following_fid_df.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_fid_df.pkl -------------------------------------------------------------------------------- /serve/samples/fc_following_fid_ig.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_fid_ig.pkl -------------------------------------------------------------------------------- /serve/samples/fc_following_idx.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_idx.pkl -------------------------------------------------------------------------------- /serve/samples/fc_following_ig.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/fc_following_ig.pkl -------------------------------------------------------------------------------- /serve/samples/personal_graph.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/samples/personal_graph.parquet -------------------------------------------------------------------------------- /serve/scripts/lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | unset -v tooldir opt 3 | OPTIND=1 4 | while getopts :b: opt 5 | do 6 | case "${opt}" in 7 | '?') echo "unrecognized option -${OPTARG}" >&2; exit 64;; 8 | ':') echo "missing argument for -${OPTARG}" >&2; exit 64;; 9 | b) tooldir="${OPTARG}";; 10 | *) echo "unhandled option -${opt}" >&2; exit 70;; 11 | esac 12 | done 13 | shift $((OPTIND - 1)) 14 | case "${tooldir+set}" in 15 | set) PATH="${tooldir}${PATH+":${PATH}"}"; export PATH;; 16 | esac 17 | case $# in 18 | 0) 19 | set -- . 
20 | ;; 21 | esac 22 | isort --profile=black "$@" || exit 23 | black --quiet "$@" || exit 24 | #autopep8 --in-place --aggressive --aggressive --recursive "$@" || exit 25 | -------------------------------------------------------------------------------- /serve/static/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Karma3Labs/farcaster-graph/6742d8a9024f0cf18138228c009e30926e0abfed/serve/static/favicon.png -------------------------------------------------------------------------------- /sql/counts_by_day.sql: -------------------------------------------------------------------------------- 1 | WITH casts_counts AS ( 2 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS casts_count 3 | FROM casts 4 | GROUP BY DATE_TRUNC('day', timestamp) 5 | ), 6 | links_counts AS ( 7 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS links_count 8 | FROM links 9 | GROUP BY DATE_TRUNC('day', timestamp) 10 | ), 11 | messages_counts AS ( 12 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS messages_count 13 | FROM messages 14 | GROUP BY DATE_TRUNC('day', timestamp) 15 | ), 16 | reactions_counts AS ( 17 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS reactions_count 18 | FROM reactions 19 | GROUP BY DATE_TRUNC('day', timestamp) 20 | ), 21 | user_data_counts AS ( 22 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS user_data_count 23 | FROM user_data 24 | GROUP BY DATE_TRUNC('day', timestamp) 25 | ), 26 | verifications_counts AS ( 27 | SELECT DATE_TRUNC('day', timestamp) AS day, COUNT(*) AS verifications_count 28 | FROM verifications 29 | GROUP BY DATE_TRUNC('day', timestamp) 30 | ) 31 | SELECT 32 | COALESCE(casts.day, links.day, messages.day, reactions.day, user_data.day, verifications.day) AS day, 33 | COALESCE(casts_count, 0) AS casts_count, 34 | COALESCE(links_count, 0) AS links_count, 35 | COALESCE(messages_count, 0) AS messages_count, 36 | COALESCE(reactions_count, 0) AS reactions_count, 37 | COALESCE(user_data_count, 0) AS user_data_count, 38 | COALESCE(verifications_count, 0) AS verifications_count 39 | FROM casts_counts casts 40 | FULL OUTER JOIN links_counts links ON casts.day = links.day 41 | FULL OUTER JOIN reactions_counts reactions ON COALESCE(casts.day, links.day) = reactions.day 42 | FULL OUTER JOIN verifications_counts verifications ON COALESCE(casts.day, links.day, reactions.day) = verifications.day 43 | FULL OUTER JOIN messages_counts messages ON COALESCE(casts.day, links.day, reactions.day, verifications.day) = messages.day 44 | FULL OUTER JOIN user_data_counts user_data ON COALESCE(casts.day, links.day, reactions.day, verifications.day, messages.day) = user_data.day 45 | ORDER BY day DESC 46 | LIMIT 1000; 47 | -------------------------------------------------------------------------------- /sql/counts_by_table.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | q_casts AS (SELECT COUNT(1) AS casts FROM casts), 3 | q_chain_events AS (SELECT COUNT(1) AS chain_events FROM chain_events), 4 | q_fids AS (SELECT COUNT(1) AS fids FROM fids), 5 | q_fnames AS (SELECT COUNT(1) AS fnames FROM fnames), 6 | q_links AS (SELECT COUNT(1) AS links FROM links), 7 | q_messages AS (SELECT COUNT(1) AS messages FROM messages), 8 | q_reactions AS (SELECT COUNT(1) AS reactions FROM reactions), 9 | q_signers AS (SELECT COUNT(1) AS signers FROM signers), 10 | q_storage_alloc AS (SELECT COUNT(1) AS storage_alloc FROM storage_allocations), 11 | q_user_data AS 
(SELECT COUNT(1) AS user_data FROM user_data), 12 | q_username_proofs AS (SELECT COUNT(1) AS username_proofs FROM username_proofs), 13 | q_verifications AS (SELECT COUNT(1) AS verifications FROM verifications) 14 | 15 | SELECT 16 | q_casts.casts, 17 | q_chain_events.chain_events, 18 | q_fids.fids, 19 | q_fnames.fnames, 20 | q_links.links, 21 | q_messages.messages, 22 | q_reactions.reactions, 23 | q_signers.signers, 24 | q_storage_alloc.storage_alloc, 25 | q_user_data.user_data, 26 | q_username_proofs.username_proofs, 27 | q_verifications.verifications 28 | FROM 29 | q_casts, 30 | q_chain_events, 31 | q_fids, 32 | q_fnames, 33 | q_links, 34 | q_messages, 35 | q_reactions, 36 | q_signers, 37 | q_storage_alloc, 38 | q_user_data, 39 | q_username_proofs, 40 | q_verifications; 41 | -------------------------------------------------------------------------------- /sql/neynar-replica/.env.sample: -------------------------------------------------------------------------------- 1 | POSTGRES_HOST=127.0.0.1 2 | POSTGRES_PORT=9541 3 | POSTGRES_USER=postgres 4 | POSTGRES_NAME=postgres 5 | POSTGRES_PASSWORD=CHANGEME 6 | PRIMARY_HOST=135.181.236.185 7 | PRIMARY_PORT=9541 8 | PRIMARY_USER=replica_user 9 | PRIMARY_PASSWORD=CHANGEME 10 | PRIMARY_SLOT_NAME=eigen10 11 | PGDATA=/var/lib/postgresql/data 12 | GID=999 13 | UID= 14 | HOST_VOLUME=/data/pgdata -------------------------------------------------------------------------------- /sql/neynar-replica/Dockerfile: -------------------------------------------------------------------------------- 1 | Dockerfile.noble -------------------------------------------------------------------------------- /sql/neynar-replica/Dockerfile.alpine: -------------------------------------------------------------------------------- 1 | FROM postgres:17.2-alpine 2 | 3 | # Install sudo and configure it for passwordless operation 4 | RUN apk add --no-cache sudo && \ 5 | echo "postgres ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/postgres 6 | 7 | COPY entrypoint.sh /usr/local/bin/entrypoint.sh 8 | RUN chmod +x /usr/local/bin/entrypoint.sh 9 | 10 | # Set the entrypoint script 11 | ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] 12 | -------------------------------------------------------------------------------- /sql/neynar-replica/Dockerfile.noble: -------------------------------------------------------------------------------- 1 | # Use a base image with glibc 2.39 2 | FROM ubuntu:noble 3 | 4 | # Install necessary packages 5 | RUN apt-get update && \ 6 | apt-get install -y sudo curl gnupg lsb-release && \ 7 | apt-get clean && \ 8 | rm -rf /var/lib/apt/lists/*; 9 | 10 | # Install locales 11 | RUN apt-get update && \ 12 | apt-get install -y --no-install-recommends locales && \ 13 | rm -rf /var/lib/apt/lists/*; 14 | RUN echo 'en_US.UTF-8 UTF-8' >> /etc/locale.gen; \ 15 | locale-gen; \ 16 | locale -a | grep 'en_US.utf8' 17 | 18 | # Add PostgreSQL repository 19 | RUN sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' && \ 20 | curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/postgresql.gpg 21 | 22 | ARG GID 23 | ARG UID 24 | # Force postgres installation to use uid=999 and gid=999 25 | RUN set -eux; \ 26 | groupadd -r postgres --gid=${GID}; \ 27 | useradd -r -g postgres --uid=${UID} --home-dir=/var/lib/postgresql --shell=/bin/bash postgres; 28 | 29 | # Install PostgreSQL 30 | RUN apt-get update 31 | RUN apt-get install -y postgresql-17 32 | RUN apt-get clean && \ 33 
| rm -rf /var/lib/apt/lists/* 34 | 35 | # Set up sudo for postgres user 36 | RUN echo "postgres ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/postgres && \ 37 | chmod 440 /etc/sudoers.d/postgres 38 | 39 | ENV PG_MAJOR=17 40 | ENV PATH=$PATH:/usr/lib/postgresql/$PG_MAJOR/bin 41 | 42 | RUN echo 'Defaults secure_path="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/lib/postgresql/17/bin"' \ 43 | >> /etc/sudoers.d/postgres 44 | 45 | COPY entrypoint.sh /usr/local/bin/entrypoint.sh 46 | RUN chmod +x /usr/local/bin/entrypoint.sh 47 | 48 | # Set the entrypoint script 49 | ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] 50 | -------------------------------------------------------------------------------- /sql/neynar-replica/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | postgres: 3 | restart: unless-stopped 4 | container_name: eigen8-replica-postgres 5 | shm_size: '32gb' 6 | build: 7 | context: . 8 | args: 9 | GID: ${GID} 10 | UID: ${UID} 11 | ports: 12 | - '${POSTGRES_PORT}:5432' 13 | environment: 14 | POSTGRES_DB: ${POSTGRES_NAME} 15 | POSTGRES_USER: ${POSTGRES_USER} 16 | POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} 17 | PRIMARY_HOST: ${PRIMARY_HOST} 18 | PRIMARY_PORT: ${PRIMARY_PORT} 19 | PRIMARY_USER: ${PRIMARY_USER} 20 | PRIMARY_PASSWORD: ${PRIMARY_PASSWORD} 21 | PRIMARY_SLOT_NAME: ${PRIMARY_SLOT_NAME} 22 | PGDATA: ${PGDATA} 23 | volumes: 24 | - ${HOST_VOLUME}:/var/lib/postgresql/data 25 | - ${PWD}/postgresql.conf:/usr/local/bin/postgresql.conf 26 | - ${PWD}/pg_hba.conf:/usr/local/bin/pg_hba.conf 27 | healthcheck: 28 | test: ['CMD-SHELL', 'pg_isready --dbname=${POSTGRES_NAME} -U ${PRIMARY_USER}'] 29 | interval: 10s 30 | timeout: 10s 31 | retries: 3 32 | networks: 33 | - farcaster-network 34 | 35 | networks: 36 | farcaster-network: 37 | external: true 38 | name: farcaster-network 39 | 40 | volumes: 41 | postgres-data: 42 | name: neynar-replica 43 | -------------------------------------------------------------------------------- /sql/neynar-replica/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Directory where the data will be stored 5 | DATA_DIR="/var/lib/postgresql/data" 6 | CONF_DIR="/var/lib/postgresql/conf" 7 | 8 | # Ensure environment variables are set 9 | if [ -z "$PRIMARY_HOST" ] || [ -z "$PRIMARY_PORT" ] || [ -z "$PRIMARY_USER" ] || [ -z "$PRIMARY_PASSWORD" ] || [ -z "$PRIMARY_SLOT_NAME" ]; then 10 | echo "Error: Environment variables not set correctly." 11 | exit 1 12 | fi 13 | 14 | # Prepare configuration directory (outside of data directory) 15 | mkdir -p $CONF_DIR 16 | cp /usr/local/bin/postgresql.conf $CONF_DIR/postgresql.conf 17 | cp /usr/local/bin/pg_hba.conf $CONF_DIR/pg_hba.conf 18 | 19 | # Check if the data directory is empty 20 | if [ "$(ls -A $DATA_DIR)" ]; then 21 | echo "Data directory is not empty." 22 | else 23 | echo "Data directory is empty, setting up .pgpass file..." 24 | echo "$PRIMARY_HOST:$PRIMARY_PORT:*:$PRIMARY_USER:$PRIMARY_PASSWORD" > /root/.pgpass 25 | chmod 600 /root/.pgpass 26 | 27 | echo "Initiating base backup..." 
28 | pg_config --version 29 | pg_basebackup -h $PRIMARY_HOST -p $PRIMARY_PORT -D $DATA_DIR -U $PRIMARY_USER -vP -w -Xs -R -S $PRIMARY_SLOT_NAME 30 | 31 | # Set the correct permissions 32 | chmod 0700 $DATA_DIR 33 | chown -R postgres:postgres $DATA_DIR 34 | 35 | # Move the customized postgresql.conf back to the data directory 36 | mv $CONF_DIR/postgresql.conf $DATA_DIR/postgresql.conf 37 | mv $CONF_DIR/pg_hba.conf $DATA_DIR/pg_hba.conf 38 | 39 | echo "Backup and configuration complete. Starting PostgreSQL in standby mode." 40 | fi 41 | 42 | 43 | # Start PostgreSQL using sudo 44 | exec sudo -u postgres postgres -D $DATA_DIR 45 | -------------------------------------------------------------------------------- /sql/neynar-replica/postgresql.conf: -------------------------------------------------------------------------------- 1 | listen_addresses = '*' # what IP address(es) to listen on; 2 | port = 5432 # (change requires restart) 3 | max_connections = 400 # (change requires restart) 4 | shared_buffers = 8GB # min 128kB 5 | work_mem = 64MB # min 64kB 6 | maintenance_work_mem = 1GB # min 64kB 7 | dynamic_shared_memory_type = posix # the default is usually the first option 8 | max_worker_processes = 16 # (change requires restart) 9 | wal_level = replica # minimal, replica, or logical 10 | synchronous_commit = local # synchronization level; 11 | wal_log_hints = on # also do full page writes of non-critical updates 12 | wal_compression = on # enables compression of full-page writes; 13 | checkpoint_timeout = 60min # range 30s-1d 14 | max_wal_size = 16GB 15 | min_wal_size = 80MB 16 | max_wal_senders = 10 # max number of walsender processes 17 | hot_standby = on # "off" disallows queries during recovery 18 | wal_receiver_timeout = 5min # time that receiver waits for 19 | random_page_cost = 1.1 # same scale as above 20 | effective_cache_size = 16GB 21 | log_line_prefix = '%m [%p] %q%u@%d ' # special values: 22 | log_timezone = UTC 23 | cluster_name = '17/main' # added to process titles if nonempty 24 | default_transaction_read_only = on 25 | datestyle = 'iso, mdy' 26 | timezone = UTC 27 | shared_preload_libraries = 'pg_stat_statements' # (change requires restart) 28 | -------------------------------------------------------------------------------- /sql/replicator_drop_fk.sql: -------------------------------------------------------------------------------- 1 | ALTER TABLE IF EXISTS ONLY public.verifications DROP CONSTRAINT IF EXISTS verifications_hash_foreign; 2 | ALTER TABLE IF EXISTS ONLY public.verifications DROP CONSTRAINT IF EXISTS verifications_fid_foreign; 3 | ALTER TABLE IF EXISTS ONLY public.username_proofs DROP CONSTRAINT IF EXISTS username_proofs_fid_foreign; 4 | ALTER TABLE IF EXISTS ONLY public.user_data DROP CONSTRAINT IF EXISTS user_data_hash_foreign; 5 | ALTER TABLE IF EXISTS ONLY public.user_data DROP CONSTRAINT IF EXISTS user_data_fid_foreign; 6 | ALTER TABLE IF EXISTS ONLY public.signers DROP CONSTRAINT IF EXISTS signers_requester_fid_foreign; 7 | ALTER TABLE IF EXISTS ONLY public.signers DROP CONSTRAINT IF EXISTS signers_remove_chain_event_id_foreign; 8 | ALTER TABLE IF EXISTS ONLY public.signers DROP CONSTRAINT IF EXISTS signers_fid_foreign; 9 | ALTER TABLE IF EXISTS ONLY public.signers DROP CONSTRAINT IF EXISTS signers_add_chain_event_id_foreign; 10 | ALTER TABLE IF EXISTS ONLY public.reactions DROP CONSTRAINT IF EXISTS reactions_target_hash_foreign; 11 | ALTER TABLE IF EXISTS ONLY public.reactions DROP CONSTRAINT IF EXISTS reactions_hash_foreign; 12 | ALTER TABLE IF EXISTS ONLY 
public.reactions DROP CONSTRAINT IF EXISTS reactions_fid_foreign; 13 | ALTER TABLE IF EXISTS ONLY public.messages DROP CONSTRAINT IF EXISTS messages_signer_fid_foreign; 14 | ALTER TABLE IF EXISTS ONLY public.messages DROP CONSTRAINT IF EXISTS messages_fid_foreign; 15 | ALTER TABLE IF EXISTS ONLY public.links DROP CONSTRAINT IF EXISTS links_target_fid_foreign; 16 | ALTER TABLE IF EXISTS ONLY public.links DROP CONSTRAINT IF EXISTS links_fid_foreign; 17 | ALTER TABLE IF EXISTS ONLY public.fnames DROP CONSTRAINT IF EXISTS fnames_fid_foreign; 18 | ALTER TABLE IF EXISTS ONLY public.storage_allocations DROP CONSTRAINT IF EXISTS fids_chain_event_id_foreign; 19 | ALTER TABLE IF EXISTS ONLY public.fids DROP CONSTRAINT IF EXISTS fids_chain_event_id_foreign; 20 | ALTER TABLE IF EXISTS ONLY public.casts DROP CONSTRAINT IF EXISTS casts_hash_foreign; 21 | ALTER TABLE IF EXISTS ONLY public.casts DROP CONSTRAINT IF EXISTS casts_fid_foreign; 22 | --------------------------------------------------------------------------------